diff --git a/Llama-3.2-3B-Instruct_chunk1.mlmodelc/analytics/coremldata.bin b/Llama-3.2-3B-Instruct_chunk1.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a0f88447ea8d18392fe521e05f7f0cdbe07326e7
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk1.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0806d9561f1b977f8fb7c990502de9bc2576ac170b096b4b3479ca05c69b5db9
+size 243
diff --git a/Llama-3.2-3B-Instruct_chunk1.mlmodelc/coremldata.bin b/Llama-3.2-3B-Instruct_chunk1.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6c1e56794a0b19a2e171e365e299a5a5f3b1c56a
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk1.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7d9258e6a9d7d75508e04144cce26719dee2fd20b0953014c351af2d53f0f6a
+size 409
diff --git a/Llama-3.2-3B-Instruct_chunk1.mlmodelc/metadata.json b/Llama-3.2-3B-Instruct_chunk1.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..5064b6f7ebc3ec2b533b2a66b67a7756f5a8cc71
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk1.mlmodelc/metadata.json
@@ -0,0 +1,105 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "cos",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "sin",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 512 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 512, 1, 64]",
+        "name" : "mask",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 7,
+    "mlProgramOperationTypeHistogram" : {
+      "Select" : 2,
+      "Tile" : 2,
+      "Ios16.sub" : 3,
+      "Transpose" : 2,
+      "Ios16.gather" : 3,
+      "ExpandDims" : 3,
+      "Ios16.reshape" : 1,
+      "Ios16.maximum" : 1,
+      "Ios16.less" : 2
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "13.0",
+      "tvOS" : "16.0",
+      "visionOS" : "1.0",
+      "watchOS" : "9.0",
+      "iOS" : "16.0",
+      "macCatalyst" : "16.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.source" : "torch==2.1.0",
+      "com.github.apple.coremltools.version" : "8.0b1"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 64]",
+        "name" : "input_ids",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1)",
+        "shortDescription" : "",
+        "shape" : "[1]",
+        "name" : "full_sequence_length",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "Llama_3_2_3B_Instruct_2024_11_09_16_14_37_chunk1",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk1.mlmodelc/model.mil b/Llama-3.2-3B-Instruct_chunk1.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..f01973a2b85322f7b32a41c3c9fa22862154752d
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk1.mlmodelc/model.mil
@@ -0,0 +1,50 @@
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3304.5.2"}, {"coremlc-version", "3304.6.2"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
+{
+    func main<ios16>(tensor<int32, [1]> full_sequence_length, tensor<int32, [1, 64]> input_ids) {
+            tensor<int32, [1]> T = const()[name = tensor<string, []>("T"), val = tensor<int32, [1]>([64])];
+            tensor<int32, []> x_1_axis_0 = const()[name = tensor<string, []>("x_1_axis_0"), val = tensor<int32, []>(0)];
+            tensor<int32, []> x_1_batch_dims_0 = const()[name = tensor<string, []>("x_1_batch_dims_0"), val = tensor<int32, []>(0)];
+            tensor<fp16, [128256, 3072]> wte_weight_to_fp16 = const()[name = tensor<string, []>("wte_weight_to_fp16"), val = tensor<fp16, [128256, 3072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp16, [1, 64, 3072]> x_1_cast_fp16 = gather(axis = x_1_axis_0, batch_dims = x_1_batch_dims_0, indices = input_ids, x = wte_weight_to_fp16)[name = tensor<string, []>("x_1_cast_fp16")];
+            tensor<int32, [3]> x_perm_0 = const()[name = tensor<string, []>("x_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [4]> var_27 = const()[name = tensor<string, []>("op_27"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 64]> x_cast_fp16 = transpose(perm = x_perm_0, x = x_1_cast_fp16)[name = tensor<string, []>("transpose_1")];
+            tensor<fp16, [1, 3072, 8, 8]> x = reshape(shape = var_27, x = x_cast_fp16)[name = tensor<string, []>("op_28_cast_fp16")];
+            tensor<int32, [1]> pos_offset = sub(x = T, y = full_sequence_length)[name = tensor<string, []>("pos_offset")];
+            tensor<int32, [64]> var_36 = const()[name = tensor<string, []>("op_36"), val = tensor<int32, [64]>([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63])];
+            tensor<int32, [64]> input_pos_1 = sub(x = var_36, y = pos_offset)[name = tensor<string, []>("input_pos_1")];
+            tensor<int32, [64]> var_44 = const()[name = tensor<string, []>("op_44"), val = tensor<int32, [64]>([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])];
+            tensor<int32, [64]> input_pos = maximum(x = input_pos_1, y = var_44)[name = tensor<string, []>("input_pos")];
+            tensor<int32, []> var_55 = const()[name = tensor<string, []>("op_55"), val = tensor<int32, []>(1)];
+            tensor<int32, []> cos_batch_dims_0 = const()[name = tensor<string, []>("cos_batch_dims_0"), val = tensor<int32, []>(0)];
+            tensor<fp16, [128, 512]> var_54_to_fp16 = const()[name = tensor<string, []>("op_54_to_fp16"), val = tensor<fp16, [128, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(788004992)))];
+            tensor<fp16, [128, 64]> cos = gather(axis = var_55, batch_dims = cos_batch_dims_0, indices = input_pos, x = var_54_to_fp16)[name = tensor<string, []>("cos_cast_fp16")];
+            tensor<int32, []> var_66 = const()[name = tensor<string, []>("op_66"), val = tensor<int32, []>(1)];
+            tensor<int32, []> sin_batch_dims_0 = const()[name = tensor<string, []>("sin_batch_dims_0"), val = tensor<int32, []>(0)];
+            tensor<fp16, [128, 512]> var_65_to_fp16 = const()[name = tensor<string, []>("op_65_to_fp16"), val = tensor<fp16, [128, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(788136128)))];
+            tensor<fp16, [128, 64]> sin = gather(axis = var_66, batch_dims = sin_batch_dims_0, indices = input_pos, x = var_65_to_fp16)[name = tensor<string, []>("sin_cast_fp16")];
+            tensor<int32, [64, 1]> var_102 = const()[name = tensor<string, []>("op_102"), val = tensor<int32, [64, 1]>([[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [18], [19], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29], [30], [31], [32], [33], [34], [35], [36], [37], [38], [39], [40], [41], [42], [43], [44], [45], [46], [47], [48], [49], [50], [51], [52], [53], [54], [55], [56], [57], [58], [59], [60], [61], [62], [63]])];
+            tensor<bool, [64, 1]> var_105 = less(x = var_102, y = pos_offset)[name = tensor<string, []>("op_105")];
+            tensor<int32, [2]> var_105_after_broadcast_reps_0 = const()[name = tensor<string, []>("op_105_after_broadcast_reps_0"), val = tensor<int32, [2]>([1, 512])];
+            tensor<bool, [64, 512]> var_105_after_broadcast = tile(reps = var_105_after_broadcast_reps_0, x = var_105)[name = tensor<string, []>("op_105_after_broadcast")];
+            tensor<fp16, [64, 512]> all_mask_to_fp16 = const()[name = tensor<string, []>("all_mask_to_fp16"), val = tensor<fp16, [64, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(788267264)))];
+            tensor<fp16, [64, 512]> m_1_to_fp16 = const()[name = tensor<string, []>("m_1_to_fp16"), val = tensor<fp16, [64, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(788332864)))];
+            tensor<fp16, [64, 512]> m_3_cast_fp16 = select(a = all_mask_to_fp16, b = m_1_to_fp16, cond = var_105_after_broadcast)[name = tensor<string, []>("m_3_cast_fp16")];
+            tensor<int32, [512]> var_115 = const()[name = tensor<string, []>("op_115"), val = tensor<int32, [512]>([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 
398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511])];
+            tensor<int32, []> var_116 = const()[name = tensor<string, []>("op_116"), val = tensor<int32, []>(512)];
+            tensor<int32, [1]> var_118 = sub(x = var_116, y = full_sequence_length)[name = tensor<string, []>("op_118")];
+            tensor<bool, [512]> var_119 = less(x = var_115, y = var_118)[name = tensor<string, []>("op_119")];
+            tensor<int32, [1]> expand_dims_0_axes_0 = const()[name = tensor<string, []>("expand_dims_0_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<bool, [1, 512]> expand_dims_0 = expand_dims(axes = expand_dims_0_axes_0, x = var_119)[name = tensor<string, []>("expand_dims_0")];
+            tensor<int32, [2]> var_119_after_broadcast_reps_0 = const()[name = tensor<string, []>("op_119_after_broadcast_reps_0"), val = tensor<int32, [2]>([64, 1])];
+            tensor<bool, [64, 512]> var_119_after_broadcast = tile(reps = var_119_after_broadcast_reps_0, x = expand_dims_0)[name = tensor<string, []>("op_119_after_broadcast")];
+            tensor<fp16, [64, 512]> m_cast_fp16 = select(a = all_mask_to_fp16, b = m_3_cast_fp16, cond = var_119_after_broadcast)[name = tensor<string, []>("m_cast_fp16")];
+            tensor<int32, [1]> var_122_axes_0 = const()[name = tensor<string, []>("op_122_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 64, 512]> var_122_cast_fp16 = expand_dims(axes = var_122_axes_0, x = m_cast_fp16)[name = tensor<string, []>("op_122_cast_fp16")];
+            tensor<int32, [1]> mask_axes_0 = const()[name = tensor<string, []>("mask_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 1, 64, 512]> mask_cast_fp16 = expand_dims(axes = mask_axes_0, x = var_122_cast_fp16)[name = tensor<string, []>("mask_cast_fp16")];
+            tensor<int32, [4]> var_129 = const()[name = tensor<string, []>("op_129"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<fp16, [1, 512, 1, 64]> mask = transpose(perm = var_129, x = mask_cast_fp16)[name = tensor<string, []>("transpose_0")];
+        } -> (x, cos, sin, mask);
+}
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk1.mlmodelc/weights/weight.bin b/Llama-3.2-3B-Instruct_chunk1.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..234764e3927fd6e9e89f3588f6f71421c95a623d
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk1.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d6f15abcf5514d401e17446e479bffc8f51867d8bec5ad4b84751ed31b378192
+size 788398464
diff --git a/Llama-3.2-3B-Instruct_chunk10.mlmodelc/analytics/coremldata.bin b/Llama-3.2-3B-Instruct_chunk10.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a63af39cde8e590e41fffd270ab8aede737490d
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk10.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf21e446e7587de3fd840eae95f3e79729298df568725552f7ef5fd8f954e58c
+size 243
diff --git a/Llama-3.2-3B-Instruct_chunk10.mlmodelc/coremldata.bin b/Llama-3.2-3B-Instruct_chunk10.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ef844658693d8a7fc2951abf2761f8f5f9bc62c3
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk10.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8129d684aa1ea8b76708a186fe44f7ffc4aa08b4854907105fe41c0825e71875
+size 653
diff --git a/Llama-3.2-3B-Instruct_chunk10.mlmodelc/metadata.json b/Llama-3.2-3B-Instruct_chunk10.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..4912583441b6a93c0eddb6ab0a90ff9e17e7c228
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk10.mlmodelc/metadata.json
@@ -0,0 +1,178 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "new_x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 7,
+    "mlProgramOperationTypeHistogram" : {
+      "Concat" : 14,
+      "Ios16.mul" : 70,
+      "SliceByIndex" : 88,
+      "Transpose" : 2,
+      "Ios16.einsum" : 96,
+      "Ios16.conv" : 14,
+      "Ios16.add" : 56,
+      "Ios16.realDiv" : 4,
+      "Ios16.softmax" : 48,
+      "Ios16.reduceL2Norm" : 4,
+      "Ios16.reshape" : 14,
+      "Ios16.silu" : 2
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "13.0",
+      "tvOS" : "16.0",
+      "visionOS" : "1.0",
+      "watchOS" : "9.0",
+      "iOS" : "16.0",
+      "macCatalyst" : "16.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.source" : "torch==2.1.0",
+      "com.github.apple.coremltools.version" : "8.0b1"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "cos",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "sin",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 512 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 512, 1, 64]",
+        "name" : "mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "Llama_3_2_3B_Instruct_2024_11_09_16_14_37_chunk10",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk10.mlmodelc/model.mil b/Llama-3.2-3B-Instruct_chunk10.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..78594b4291dc45ae43652f9a31200581b19ad3c6
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk10.mlmodelc/model.mil
@@ -0,0 +1,956 @@
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3304.5.2"}, {"coremlc-version", "3304.6.2"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
+{
+    func main<ios16>(tensor<fp16, [128, 64]> cos, tensor<fp16, [1, 448, 1, 1024]> k_cache_0, tensor<fp16, [1, 448, 1, 1024]> k_cache_1, tensor<fp16, [1, 512, 1, 64]> mask, tensor<fp16, [128, 64]> sin, tensor<fp16, [1, 1024, 1, 448]> v_cache_0, tensor<fp16, [1, 1024, 1, 448]> v_cache_1, tensor<fp16, [1, 3072, 8, 8]> x) [CoreML_InputDefaultValues = dict<tensor<string, []>, tensor<fp32, []>>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}})] {
+            tensor<int32, []> var_13 = const()[name = tensor<string, []>("op_13"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_19 = const()[name = tensor<string, []>("op_19"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_55 = const()[name = tensor<string, []>("op_55"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_1_interleave_0 = const()[name = tensor<string, []>("x_eps_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_1_to_fp16 = const()[name = tensor<string, []>("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_1_cast_fp16 = concat(axis = var_52, interleave = x_eps_1_interleave_0, values = (x, eps_chan_1_to_fp16))[name = tensor<string, []>("x_eps_1_cast_fp16")];
+            tensor<int32, [1]> norm_x_1_axes_0 = const()[name = tensor<string, []>("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_55, x = x_eps_1_cast_fp16)[name = tensor<string, []>("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = tensor<string, []>("x_normed_1_cast_fp16")];
+            tensor<fp16, []> var_79_to_fp16 = const()[name = tensor<string, []>("op_79_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_79_to_fp16)[name = tensor<string, []>("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(256)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = tensor<string, []>("x_5_cast_fp16")];
+            tensor<int32, [4]> var_100 = const()[name = tensor<string, []>("op_100"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_1_cast_fp16 = reshape(shape = var_100, x = x_5_cast_fp16)[name = tensor<string, []>("input_1_cast_fp16")];
+            tensor<int32, [2]> var_103 = const()[name = tensor<string, []>("op_103"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_105 = const()[name = tensor<string, []>("op_105"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_1_pad_type_0 = const()[name = tensor<string, []>("q_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_1_pad_0 = const()[name = tensor<string, []>("q_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6464)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_1_cast_fp16 = conv(dilations = var_105, groups = var_52, pad = q_1_pad_0, pad_type = q_1_pad_type_0, strides = var_103, weight = blocks_0_attn_q_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("q_1_cast_fp16")];
+            tensor<int32, [2]> var_109 = const()[name = tensor<string, []>("op_109"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_111 = const()[name = tensor<string, []>("op_111"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_1_pad_type_0 = const()[name = tensor<string, []>("k_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_1_pad_0 = const()[name = tensor<string, []>("k_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18880896)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_1_cast_fp16 = conv(dilations = var_111, groups = var_52, pad = k_1_pad_0, pad_type = k_1_pad_type_0, strides = var_109, weight = blocks_0_attn_k_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("k_1_cast_fp16")];
+            tensor<int32, [2]> var_115 = const()[name = tensor<string, []>("op_115"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_117 = const()[name = tensor<string, []>("op_117"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_1_pad_type_0 = const()[name = tensor<string, []>("v_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_1_pad_0 = const()[name = tensor<string, []>("v_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25172416)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_1_cast_fp16 = conv(dilations = var_117, groups = var_52, pad = v_1_pad_0, pad_type = v_1_pad_type_0, strides = var_115, weight = blocks_0_attn_v_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("v_1_cast_fp16")];
+            tensor<int32, [4]> var_120 = const()[name = tensor<string, []>("op_120"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_3_cast_fp16 = reshape(shape = var_120, x = q_1_cast_fp16)[name = tensor<string, []>("q_3_cast_fp16")];
+            tensor<int32, [4]> var_122 = const()[name = tensor<string, []>("op_122"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_3_cast_fp16 = reshape(shape = var_122, x = k_1_cast_fp16)[name = tensor<string, []>("k_3_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = tensor<string, []>("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = tensor<string, []>("op_136_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = tensor<string, []>("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = tensor<string, []>("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = tensor<string, []>("op_142_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = tensor<string, []>("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_142_cast_fp16")];
+            tensor<fp16, []> const_10_promoted_to_fp16 = const()[name = tensor<string, []>("const_10_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_10_promoted_to_fp16)[name = tensor<string, []>("op_144_cast_fp16")];
+            tensor<bool, []> rotated_1_interleave_0 = const()[name = tensor<string, []>("rotated_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_1_cast_fp16 = concat(axis = var_17, interleave = rotated_1_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = tensor<string, []>("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_147_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = tensor<string, []>("op_147_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_148_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = tensor<string, []>("op_148_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_1_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = tensor<string, []>("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_161_begin_0 = const()[name = tensor<string, []>("op_161_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_161_end_0 = const()[name = tensor<string, []>("op_161_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_161_end_mask_0 = const()[name = tensor<string, []>("op_161_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_161_cast_fp16 = slice_by_index(begin = var_161_begin_0, end = var_161_end_0, end_mask = var_161_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_161_cast_fp16")];
+            tensor<int32, [4]> var_167_begin_0 = const()[name = tensor<string, []>("op_167_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_167_end_0 = const()[name = tensor<string, []>("op_167_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_167_end_mask_0 = const()[name = tensor<string, []>("op_167_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_167_cast_fp16 = slice_by_index(begin = var_167_begin_0, end = var_167_end_0, end_mask = var_167_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_167_cast_fp16")];
+            tensor<fp16, []> const_12_promoted_to_fp16 = const()[name = tensor<string, []>("const_12_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_169_cast_fp16 = mul(x = var_167_cast_fp16, y = const_12_promoted_to_fp16)[name = tensor<string, []>("op_169_cast_fp16")];
+            tensor<bool, []> rotated_3_interleave_0 = const()[name = tensor<string, []>("rotated_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_3_cast_fp16 = concat(axis = var_17, interleave = rotated_3_interleave_0, values = (var_169_cast_fp16, var_161_cast_fp16))[name = tensor<string, []>("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_172_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = tensor<string, []>("op_172_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_173_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = tensor<string, []>("op_173_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_3_cast_fp16 = add(x = var_172_cast_fp16, y = var_173_cast_fp16)[name = tensor<string, []>("roped_3_cast_fp16")];
+            tensor<int32, [4]> var_176 = const()[name = tensor<string, []>("op_176"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_7_cast_fp16 = reshape(shape = var_176, x = roped_3_cast_fp16)[name = tensor<string, []>("k_7_cast_fp16")];
+            tensor<int32, [4]> var_178 = const()[name = tensor<string, []>("op_178"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_0 = reshape(shape = var_178, x = v_1_cast_fp16)[name = tensor<string, []>("new_v_cache_0_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_11_interleave_0 = const()[name = tensor<string, []>("k_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_0 = transpose(perm = k_9_perm_0, x = k_7_cast_fp16)[name = tensor<string, []>("transpose_1")];
+            tensor<fp16, [1, 512, 1, 1024]> k_11_cast_fp16 = concat(axis = var_19, interleave = k_11_interleave_0, values = (k_cache_0, new_k_cache_0))[name = tensor<string, []>("k_11_cast_fp16")];
+            tensor<bool, []> v_7_interleave_0 = const()[name = tensor<string, []>("v_7_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_7_cast_fp16 = concat(axis = var_13, interleave = v_7_interleave_0, values = (v_cache_0, new_v_cache_0))[name = tensor<string, []>("v_7_cast_fp16")];
+            tensor<int32, [4]> var_186 = const()[name = tensor<string, []>("op_186"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_7_cast_fp16 = reshape(shape = var_186, x = roped_1_cast_fp16)[name = tensor<string, []>("q_7_cast_fp16")];
+            tensor<int32, [4]> var_191_begin_0 = const()[name = tensor<string, []>("op_191_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_191_end_0 = const()[name = tensor<string, []>("op_191_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_191_end_mask_0 = const()[name = tensor<string, []>("op_191_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_191_cast_fp16 = slice_by_index(begin = var_191_begin_0, end = var_191_end_0, end_mask = var_191_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_191_cast_fp16")];
+            tensor<int32, [4]> var_195_begin_0 = const()[name = tensor<string, []>("op_195_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_195_end_0 = const()[name = tensor<string, []>("op_195_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_195_end_mask_0 = const()[name = tensor<string, []>("op_195_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_195_cast_fp16 = slice_by_index(begin = var_195_begin_0, end = var_195_end_0, end_mask = var_195_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_195_cast_fp16")];
+            tensor<int32, [4]> var_199_begin_0 = const()[name = tensor<string, []>("op_199_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_199_end_0 = const()[name = tensor<string, []>("op_199_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_199_end_mask_0 = const()[name = tensor<string, []>("op_199_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_199_cast_fp16 = slice_by_index(begin = var_199_begin_0, end = var_199_end_0, end_mask = var_199_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_199_cast_fp16")];
+            tensor<int32, [4]> var_203_begin_0 = const()[name = tensor<string, []>("op_203_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_203_end_0 = const()[name = tensor<string, []>("op_203_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_203_end_mask_0 = const()[name = tensor<string, []>("op_203_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_203_cast_fp16 = slice_by_index(begin = var_203_begin_0, end = var_203_end_0, end_mask = var_203_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_203_cast_fp16")];
+            tensor<int32, [4]> var_207_begin_0 = const()[name = tensor<string, []>("op_207_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_207_end_0 = const()[name = tensor<string, []>("op_207_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_207_end_mask_0 = const()[name = tensor<string, []>("op_207_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_207_cast_fp16 = slice_by_index(begin = var_207_begin_0, end = var_207_end_0, end_mask = var_207_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_207_cast_fp16")];
+            tensor<int32, [4]> var_211_begin_0 = const()[name = tensor<string, []>("op_211_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_211_end_0 = const()[name = tensor<string, []>("op_211_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_211_end_mask_0 = const()[name = tensor<string, []>("op_211_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_211_cast_fp16 = slice_by_index(begin = var_211_begin_0, end = var_211_end_0, end_mask = var_211_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_211_cast_fp16")];
+            tensor<int32, [4]> var_215_begin_0 = const()[name = tensor<string, []>("op_215_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_215_end_0 = const()[name = tensor<string, []>("op_215_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_215_end_mask_0 = const()[name = tensor<string, []>("op_215_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_215_cast_fp16 = slice_by_index(begin = var_215_begin_0, end = var_215_end_0, end_mask = var_215_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_215_cast_fp16")];
+            tensor<int32, [4]> var_219_begin_0 = const()[name = tensor<string, []>("op_219_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_219_end_0 = const()[name = tensor<string, []>("op_219_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_219_end_mask_0 = const()[name = tensor<string, []>("op_219_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_219_cast_fp16 = slice_by_index(begin = var_219_begin_0, end = var_219_end_0, end_mask = var_219_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_219_cast_fp16")];
+            tensor<int32, [4]> var_223_begin_0 = const()[name = tensor<string, []>("op_223_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_223_end_0 = const()[name = tensor<string, []>("op_223_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_223_end_mask_0 = const()[name = tensor<string, []>("op_223_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_223_cast_fp16 = slice_by_index(begin = var_223_begin_0, end = var_223_end_0, end_mask = var_223_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_223_cast_fp16")];
+            tensor<int32, [4]> var_227_begin_0 = const()[name = tensor<string, []>("op_227_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_227_end_0 = const()[name = tensor<string, []>("op_227_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_227_end_mask_0 = const()[name = tensor<string, []>("op_227_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_227_cast_fp16 = slice_by_index(begin = var_227_begin_0, end = var_227_end_0, end_mask = var_227_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_227_cast_fp16")];
+            tensor<int32, [4]> var_231_begin_0 = const()[name = tensor<string, []>("op_231_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_231_end_0 = const()[name = tensor<string, []>("op_231_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_231_end_mask_0 = const()[name = tensor<string, []>("op_231_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_231_cast_fp16 = slice_by_index(begin = var_231_begin_0, end = var_231_end_0, end_mask = var_231_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_231_cast_fp16")];
+            tensor<int32, [4]> var_235_begin_0 = const()[name = tensor<string, []>("op_235_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_235_end_0 = const()[name = tensor<string, []>("op_235_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_235_end_mask_0 = const()[name = tensor<string, []>("op_235_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_235_cast_fp16 = slice_by_index(begin = var_235_begin_0, end = var_235_end_0, end_mask = var_235_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_235_cast_fp16")];
+            tensor<int32, [4]> var_239_begin_0 = const()[name = tensor<string, []>("op_239_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_239_end_0 = const()[name = tensor<string, []>("op_239_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_239_end_mask_0 = const()[name = tensor<string, []>("op_239_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_239_cast_fp16 = slice_by_index(begin = var_239_begin_0, end = var_239_end_0, end_mask = var_239_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_239_cast_fp16")];
+            tensor<int32, [4]> var_243_begin_0 = const()[name = tensor<string, []>("op_243_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_243_end_0 = const()[name = tensor<string, []>("op_243_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_243_end_mask_0 = const()[name = tensor<string, []>("op_243_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_243_cast_fp16 = slice_by_index(begin = var_243_begin_0, end = var_243_end_0, end_mask = var_243_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_243_cast_fp16")];
+            tensor<int32, [4]> var_247_begin_0 = const()[name = tensor<string, []>("op_247_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_247_end_0 = const()[name = tensor<string, []>("op_247_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_247_end_mask_0 = const()[name = tensor<string, []>("op_247_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_247_cast_fp16 = slice_by_index(begin = var_247_begin_0, end = var_247_end_0, end_mask = var_247_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_247_cast_fp16")];
+            tensor<int32, [4]> var_251_begin_0 = const()[name = tensor<string, []>("op_251_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_251_end_0 = const()[name = tensor<string, []>("op_251_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_251_end_mask_0 = const()[name = tensor<string, []>("op_251_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_251_cast_fp16 = slice_by_index(begin = var_251_begin_0, end = var_251_end_0, end_mask = var_251_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_251_cast_fp16")];
+            tensor<int32, [4]> var_255_begin_0 = const()[name = tensor<string, []>("op_255_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_255_end_0 = const()[name = tensor<string, []>("op_255_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_255_end_mask_0 = const()[name = tensor<string, []>("op_255_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_255_cast_fp16 = slice_by_index(begin = var_255_begin_0, end = var_255_end_0, end_mask = var_255_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_255_cast_fp16")];
+            tensor<int32, [4]> var_259_begin_0 = const()[name = tensor<string, []>("op_259_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_259_end_0 = const()[name = tensor<string, []>("op_259_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_259_end_mask_0 = const()[name = tensor<string, []>("op_259_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_259_cast_fp16 = slice_by_index(begin = var_259_begin_0, end = var_259_end_0, end_mask = var_259_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_259_cast_fp16")];
+            tensor<int32, [4]> var_263_begin_0 = const()[name = tensor<string, []>("op_263_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_263_end_0 = const()[name = tensor<string, []>("op_263_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_263_end_mask_0 = const()[name = tensor<string, []>("op_263_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_263_cast_fp16 = slice_by_index(begin = var_263_begin_0, end = var_263_end_0, end_mask = var_263_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_263_cast_fp16")];
+            tensor<int32, [4]> var_267_begin_0 = const()[name = tensor<string, []>("op_267_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_267_end_0 = const()[name = tensor<string, []>("op_267_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_267_end_mask_0 = const()[name = tensor<string, []>("op_267_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_267_cast_fp16 = slice_by_index(begin = var_267_begin_0, end = var_267_end_0, end_mask = var_267_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_267_cast_fp16")];
+            tensor<int32, [4]> var_271_begin_0 = const()[name = tensor<string, []>("op_271_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_271_end_0 = const()[name = tensor<string, []>("op_271_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_271_end_mask_0 = const()[name = tensor<string, []>("op_271_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_271_cast_fp16 = slice_by_index(begin = var_271_begin_0, end = var_271_end_0, end_mask = var_271_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_271_cast_fp16")];
+            tensor<int32, [4]> var_275_begin_0 = const()[name = tensor<string, []>("op_275_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_275_end_0 = const()[name = tensor<string, []>("op_275_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_275_end_mask_0 = const()[name = tensor<string, []>("op_275_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_275_cast_fp16 = slice_by_index(begin = var_275_begin_0, end = var_275_end_0, end_mask = var_275_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_275_cast_fp16")];
+            tensor<int32, [4]> var_279_begin_0 = const()[name = tensor<string, []>("op_279_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_279_end_0 = const()[name = tensor<string, []>("op_279_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_279_end_mask_0 = const()[name = tensor<string, []>("op_279_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_279_cast_fp16 = slice_by_index(begin = var_279_begin_0, end = var_279_end_0, end_mask = var_279_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_279_cast_fp16")];
+            tensor<int32, [4]> var_283_begin_0 = const()[name = tensor<string, []>("op_283_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_283_end_0 = const()[name = tensor<string, []>("op_283_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_283_end_mask_0 = const()[name = tensor<string, []>("op_283_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_283_cast_fp16 = slice_by_index(begin = var_283_begin_0, end = var_283_end_0, end_mask = var_283_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_283_cast_fp16")];
+            tensor<int32, [4]> var_289_begin_0 = const()[name = tensor<string, []>("op_289_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_289_end_0 = const()[name = tensor<string, []>("op_289_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_289_end_mask_0 = const()[name = tensor<string, []>("op_289_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_289_cast_fp16 = slice_by_index(begin = var_289_begin_0, end = var_289_end_0, end_mask = var_289_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_289_cast_fp16")];
+            tensor<int32, [4]> var_301_begin_0 = const()[name = tensor<string, []>("op_301_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_301_end_0 = const()[name = tensor<string, []>("op_301_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_301_end_mask_0 = const()[name = tensor<string, []>("op_301_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_301_cast_fp16 = slice_by_index(begin = var_301_begin_0, end = var_301_end_0, end_mask = var_301_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_301_cast_fp16")];
+            tensor<int32, [4]> var_313_begin_0 = const()[name = tensor<string, []>("op_313_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_313_end_0 = const()[name = tensor<string, []>("op_313_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_313_end_mask_0 = const()[name = tensor<string, []>("op_313_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_313_cast_fp16 = slice_by_index(begin = var_313_begin_0, end = var_313_end_0, end_mask = var_313_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_313_cast_fp16")];
+            tensor<int32, [4]> var_325_begin_0 = const()[name = tensor<string, []>("op_325_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_325_end_0 = const()[name = tensor<string, []>("op_325_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_325_end_mask_0 = const()[name = tensor<string, []>("op_325_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_325_cast_fp16 = slice_by_index(begin = var_325_begin_0, end = var_325_end_0, end_mask = var_325_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_325_cast_fp16")];
+            tensor<int32, [4]> var_337_begin_0 = const()[name = tensor<string, []>("op_337_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_337_end_0 = const()[name = tensor<string, []>("op_337_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_337_end_mask_0 = const()[name = tensor<string, []>("op_337_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_337_cast_fp16 = slice_by_index(begin = var_337_begin_0, end = var_337_end_0, end_mask = var_337_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_337_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = tensor<string, []>("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = tensor<string, []>("op_349_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = tensor<string, []>("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_349_cast_fp16")];
+            tensor<int32, [4]> var_361_begin_0 = const()[name = tensor<string, []>("op_361_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_361_end_0 = const()[name = tensor<string, []>("op_361_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_361_end_mask_0 = const()[name = tensor<string, []>("op_361_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_361_cast_fp16 = slice_by_index(begin = var_361_begin_0, end = var_361_end_0, end_mask = var_361_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_361_cast_fp16")];
+            tensor<int32, [4]> var_373_begin_0 = const()[name = tensor<string, []>("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_373_end_0 = const()[name = tensor<string, []>("op_373_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_373_end_mask_0 = const()[name = tensor<string, []>("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_373_cast_fp16")];
+            tensor<int32, [4]> var_383_begin_0 = const()[name = tensor<string, []>("op_383_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_383_end_0 = const()[name = tensor<string, []>("op_383_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_383_end_mask_0 = const()[name = tensor<string, []>("op_383_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_383_cast_fp16 = slice_by_index(begin = var_383_begin_0, end = var_383_end_0, end_mask = var_383_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_383_cast_fp16")];
+            tensor<int32, [4]> var_395_begin_0 = const()[name = tensor<string, []>("op_395_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_395_end_0 = const()[name = tensor<string, []>("op_395_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_395_end_mask_0 = const()[name = tensor<string, []>("op_395_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_395_cast_fp16 = slice_by_index(begin = var_395_begin_0, end = var_395_end_0, end_mask = var_395_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_395_cast_fp16")];
+            tensor<int32, [4]> var_407_begin_0 = const()[name = tensor<string, []>("op_407_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_407_end_0 = const()[name = tensor<string, []>("op_407_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_407_end_mask_0 = const()[name = tensor<string, []>("op_407_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_407_cast_fp16 = slice_by_index(begin = var_407_begin_0, end = var_407_end_0, end_mask = var_407_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_407_cast_fp16")];
+            tensor<int32, [4]> var_419_begin_0 = const()[name = tensor<string, []>("op_419_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_419_end_0 = const()[name = tensor<string, []>("op_419_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_419_end_mask_0 = const()[name = tensor<string, []>("op_419_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_419_cast_fp16 = slice_by_index(begin = var_419_begin_0, end = var_419_end_0, end_mask = var_419_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_419_cast_fp16")];
+            tensor<int32, [4]> var_431_begin_0 = const()[name = tensor<string, []>("op_431_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_431_end_0 = const()[name = tensor<string, []>("op_431_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_431_end_mask_0 = const()[name = tensor<string, []>("op_431_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_431_cast_fp16 = slice_by_index(begin = var_431_begin_0, end = var_431_end_0, end_mask = var_431_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_431_cast_fp16")];
+            tensor<int32, [4]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_443_cast_fp16 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_443_cast_fp16")];
+            tensor<int32, [4]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_455_cast_fp16 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_455_cast_fp16")];
+            tensor<int32, [4]> var_467_begin_0 = const()[name = tensor<string, []>("op_467_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_467_end_0 = const()[name = tensor<string, []>("op_467_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_467_end_mask_0 = const()[name = tensor<string, []>("op_467_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_467_cast_fp16 = slice_by_index(begin = var_467_begin_0, end = var_467_end_0, end_mask = var_467_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_467_cast_fp16")];
+            tensor<string, []> var_479_equation_0 = const()[name = tensor<string, []>("op_479_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_479_cast_fp16 = einsum(equation = var_479_equation_0, values = (var_289_cast_fp16, var_191_cast_fp16))[name = tensor<string, []>("op_479_cast_fp16")];
+            tensor<fp16, []> var_480_to_fp16 = const()[name = tensor<string, []>("op_480_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_481_cast_fp16 = mul(x = var_479_cast_fp16, y = var_480_to_fp16)[name = tensor<string, []>("op_481_cast_fp16")];
+            tensor<string, []> var_483_equation_0 = const()[name = tensor<string, []>("op_483_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_483_cast_fp16 = einsum(equation = var_483_equation_0, values = (var_289_cast_fp16, var_195_cast_fp16))[name = tensor<string, []>("op_483_cast_fp16")];
+            tensor<fp16, []> var_484_to_fp16 = const()[name = tensor<string, []>("op_484_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_485_cast_fp16 = mul(x = var_483_cast_fp16, y = var_484_to_fp16)[name = tensor<string, []>("op_485_cast_fp16")];
+            tensor<string, []> var_487_equation_0 = const()[name = tensor<string, []>("op_487_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_487_cast_fp16 = einsum(equation = var_487_equation_0, values = (var_289_cast_fp16, var_199_cast_fp16))[name = tensor<string, []>("op_487_cast_fp16")];
+            tensor<fp16, []> var_488_to_fp16 = const()[name = tensor<string, []>("op_488_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_489_cast_fp16 = mul(x = var_487_cast_fp16, y = var_488_to_fp16)[name = tensor<string, []>("op_489_cast_fp16")];
+            tensor<string, []> var_491_equation_0 = const()[name = tensor<string, []>("op_491_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_491_cast_fp16 = einsum(equation = var_491_equation_0, values = (var_301_cast_fp16, var_203_cast_fp16))[name = tensor<string, []>("op_491_cast_fp16")];
+            tensor<fp16, []> var_492_to_fp16 = const()[name = tensor<string, []>("op_492_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_493_cast_fp16 = mul(x = var_491_cast_fp16, y = var_492_to_fp16)[name = tensor<string, []>("op_493_cast_fp16")];
+            tensor<string, []> var_495_equation_0 = const()[name = tensor<string, []>("op_495_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_495_cast_fp16 = einsum(equation = var_495_equation_0, values = (var_301_cast_fp16, var_207_cast_fp16))[name = tensor<string, []>("op_495_cast_fp16")];
+            tensor<fp16, []> var_496_to_fp16 = const()[name = tensor<string, []>("op_496_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_497_cast_fp16 = mul(x = var_495_cast_fp16, y = var_496_to_fp16)[name = tensor<string, []>("op_497_cast_fp16")];
+            tensor<string, []> var_499_equation_0 = const()[name = tensor<string, []>("op_499_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_499_cast_fp16 = einsum(equation = var_499_equation_0, values = (var_301_cast_fp16, var_211_cast_fp16))[name = tensor<string, []>("op_499_cast_fp16")];
+            tensor<fp16, []> var_500_to_fp16 = const()[name = tensor<string, []>("op_500_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_501_cast_fp16 = mul(x = var_499_cast_fp16, y = var_500_to_fp16)[name = tensor<string, []>("op_501_cast_fp16")];
+            tensor<string, []> var_503_equation_0 = const()[name = tensor<string, []>("op_503_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_503_cast_fp16 = einsum(equation = var_503_equation_0, values = (var_313_cast_fp16, var_215_cast_fp16))[name = tensor<string, []>("op_503_cast_fp16")];
+            tensor<fp16, []> var_504_to_fp16 = const()[name = tensor<string, []>("op_504_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_505_cast_fp16 = mul(x = var_503_cast_fp16, y = var_504_to_fp16)[name = tensor<string, []>("op_505_cast_fp16")];
+            tensor<string, []> var_507_equation_0 = const()[name = tensor<string, []>("op_507_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_507_cast_fp16 = einsum(equation = var_507_equation_0, values = (var_313_cast_fp16, var_219_cast_fp16))[name = tensor<string, []>("op_507_cast_fp16")];
+            tensor<fp16, []> var_508_to_fp16 = const()[name = tensor<string, []>("op_508_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_509_cast_fp16 = mul(x = var_507_cast_fp16, y = var_508_to_fp16)[name = tensor<string, []>("op_509_cast_fp16")];
+            tensor<string, []> var_511_equation_0 = const()[name = tensor<string, []>("op_511_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_511_cast_fp16 = einsum(equation = var_511_equation_0, values = (var_313_cast_fp16, var_223_cast_fp16))[name = tensor<string, []>("op_511_cast_fp16")];
+            tensor<fp16, []> var_512_to_fp16 = const()[name = tensor<string, []>("op_512_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_513_cast_fp16 = mul(x = var_511_cast_fp16, y = var_512_to_fp16)[name = tensor<string, []>("op_513_cast_fp16")];
+            tensor<string, []> var_515_equation_0 = const()[name = tensor<string, []>("op_515_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_515_cast_fp16 = einsum(equation = var_515_equation_0, values = (var_325_cast_fp16, var_227_cast_fp16))[name = tensor<string, []>("op_515_cast_fp16")];
+            tensor<fp16, []> var_516_to_fp16 = const()[name = tensor<string, []>("op_516_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_517_cast_fp16 = mul(x = var_515_cast_fp16, y = var_516_to_fp16)[name = tensor<string, []>("op_517_cast_fp16")];
+            tensor<string, []> var_519_equation_0 = const()[name = tensor<string, []>("op_519_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_519_cast_fp16 = einsum(equation = var_519_equation_0, values = (var_325_cast_fp16, var_231_cast_fp16))[name = tensor<string, []>("op_519_cast_fp16")];
+            tensor<fp16, []> var_520_to_fp16 = const()[name = tensor<string, []>("op_520_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_521_cast_fp16 = mul(x = var_519_cast_fp16, y = var_520_to_fp16)[name = tensor<string, []>("op_521_cast_fp16")];
+            tensor<string, []> var_523_equation_0 = const()[name = tensor<string, []>("op_523_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_523_cast_fp16 = einsum(equation = var_523_equation_0, values = (var_325_cast_fp16, var_235_cast_fp16))[name = tensor<string, []>("op_523_cast_fp16")];
+            tensor<fp16, []> var_524_to_fp16 = const()[name = tensor<string, []>("op_524_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_525_cast_fp16 = mul(x = var_523_cast_fp16, y = var_524_to_fp16)[name = tensor<string, []>("op_525_cast_fp16")];
+            tensor<string, []> var_527_equation_0 = const()[name = tensor<string, []>("op_527_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_527_cast_fp16 = einsum(equation = var_527_equation_0, values = (var_337_cast_fp16, var_239_cast_fp16))[name = tensor<string, []>("op_527_cast_fp16")];
+            tensor<fp16, []> var_528_to_fp16 = const()[name = tensor<string, []>("op_528_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_529_cast_fp16 = mul(x = var_527_cast_fp16, y = var_528_to_fp16)[name = tensor<string, []>("op_529_cast_fp16")];
+            tensor<string, []> var_531_equation_0 = const()[name = tensor<string, []>("op_531_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_531_cast_fp16 = einsum(equation = var_531_equation_0, values = (var_337_cast_fp16, var_243_cast_fp16))[name = tensor<string, []>("op_531_cast_fp16")];
+            tensor<fp16, []> var_532_to_fp16 = const()[name = tensor<string, []>("op_532_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_533_cast_fp16 = mul(x = var_531_cast_fp16, y = var_532_to_fp16)[name = tensor<string, []>("op_533_cast_fp16")];
+            tensor<string, []> var_535_equation_0 = const()[name = tensor<string, []>("op_535_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_535_cast_fp16 = einsum(equation = var_535_equation_0, values = (var_337_cast_fp16, var_247_cast_fp16))[name = tensor<string, []>("op_535_cast_fp16")];
+            tensor<fp16, []> var_536_to_fp16 = const()[name = tensor<string, []>("op_536_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_537_cast_fp16 = mul(x = var_535_cast_fp16, y = var_536_to_fp16)[name = tensor<string, []>("op_537_cast_fp16")];
+            tensor<string, []> var_539_equation_0 = const()[name = tensor<string, []>("op_539_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_539_cast_fp16 = einsum(equation = var_539_equation_0, values = (var_349_cast_fp16, var_251_cast_fp16))[name = tensor<string, []>("op_539_cast_fp16")];
+            tensor<fp16, []> var_540_to_fp16 = const()[name = tensor<string, []>("op_540_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_541_cast_fp16 = mul(x = var_539_cast_fp16, y = var_540_to_fp16)[name = tensor<string, []>("op_541_cast_fp16")];
+            tensor<string, []> var_543_equation_0 = const()[name = tensor<string, []>("op_543_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_543_cast_fp16 = einsum(equation = var_543_equation_0, values = (var_349_cast_fp16, var_255_cast_fp16))[name = tensor<string, []>("op_543_cast_fp16")];
+            tensor<fp16, []> var_544_to_fp16 = const()[name = tensor<string, []>("op_544_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_545_cast_fp16 = mul(x = var_543_cast_fp16, y = var_544_to_fp16)[name = tensor<string, []>("op_545_cast_fp16")];
+            tensor<string, []> var_547_equation_0 = const()[name = tensor<string, []>("op_547_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_547_cast_fp16 = einsum(equation = var_547_equation_0, values = (var_349_cast_fp16, var_259_cast_fp16))[name = tensor<string, []>("op_547_cast_fp16")];
+            tensor<fp16, []> var_548_to_fp16 = const()[name = tensor<string, []>("op_548_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_549_cast_fp16 = mul(x = var_547_cast_fp16, y = var_548_to_fp16)[name = tensor<string, []>("op_549_cast_fp16")];
+            tensor<string, []> var_551_equation_0 = const()[name = tensor<string, []>("op_551_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_551_cast_fp16 = einsum(equation = var_551_equation_0, values = (var_361_cast_fp16, var_263_cast_fp16))[name = tensor<string, []>("op_551_cast_fp16")];
+            tensor<fp16, []> var_552_to_fp16 = const()[name = tensor<string, []>("op_552_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_553_cast_fp16 = mul(x = var_551_cast_fp16, y = var_552_to_fp16)[name = tensor<string, []>("op_553_cast_fp16")];
+            tensor<string, []> var_555_equation_0 = const()[name = tensor<string, []>("op_555_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_555_cast_fp16 = einsum(equation = var_555_equation_0, values = (var_361_cast_fp16, var_267_cast_fp16))[name = tensor<string, []>("op_555_cast_fp16")];
+            tensor<fp16, []> var_556_to_fp16 = const()[name = tensor<string, []>("op_556_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_557_cast_fp16 = mul(x = var_555_cast_fp16, y = var_556_to_fp16)[name = tensor<string, []>("op_557_cast_fp16")];
+            tensor<string, []> var_559_equation_0 = const()[name = tensor<string, []>("op_559_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_559_cast_fp16 = einsum(equation = var_559_equation_0, values = (var_361_cast_fp16, var_271_cast_fp16))[name = tensor<string, []>("op_559_cast_fp16")];
+            tensor<fp16, []> var_560_to_fp16 = const()[name = tensor<string, []>("op_560_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_561_cast_fp16 = mul(x = var_559_cast_fp16, y = var_560_to_fp16)[name = tensor<string, []>("op_561_cast_fp16")];
+            tensor<string, []> var_563_equation_0 = const()[name = tensor<string, []>("op_563_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_563_cast_fp16 = einsum(equation = var_563_equation_0, values = (var_373_cast_fp16, var_275_cast_fp16))[name = tensor<string, []>("op_563_cast_fp16")];
+            tensor<fp16, []> var_564_to_fp16 = const()[name = tensor<string, []>("op_564_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_565_cast_fp16 = mul(x = var_563_cast_fp16, y = var_564_to_fp16)[name = tensor<string, []>("op_565_cast_fp16")];
+            tensor<string, []> var_567_equation_0 = const()[name = tensor<string, []>("op_567_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_567_cast_fp16 = einsum(equation = var_567_equation_0, values = (var_373_cast_fp16, var_279_cast_fp16))[name = tensor<string, []>("op_567_cast_fp16")];
+            tensor<fp16, []> var_568_to_fp16 = const()[name = tensor<string, []>("op_568_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_569_cast_fp16 = mul(x = var_567_cast_fp16, y = var_568_to_fp16)[name = tensor<string, []>("op_569_cast_fp16")];
+            tensor<string, []> var_571_equation_0 = const()[name = tensor<string, []>("op_571_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_571_cast_fp16 = einsum(equation = var_571_equation_0, values = (var_373_cast_fp16, var_283_cast_fp16))[name = tensor<string, []>("op_571_cast_fp16")];
+            tensor<fp16, []> var_572_to_fp16 = const()[name = tensor<string, []>("op_572_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_573_cast_fp16 = mul(x = var_571_cast_fp16, y = var_572_to_fp16)[name = tensor<string, []>("op_573_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_1_cast_fp16 = add(x = var_481_cast_fp16, y = mask)[name = tensor<string, []>("aw_1_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_3_cast_fp16 = add(x = var_485_cast_fp16, y = mask)[name = tensor<string, []>("aw_3_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_5_cast_fp16 = add(x = var_489_cast_fp16, y = mask)[name = tensor<string, []>("aw_5_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_7_cast_fp16 = add(x = var_493_cast_fp16, y = mask)[name = tensor<string, []>("aw_7_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_9_cast_fp16 = add(x = var_497_cast_fp16, y = mask)[name = tensor<string, []>("aw_9_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_11_cast_fp16 = add(x = var_501_cast_fp16, y = mask)[name = tensor<string, []>("aw_11_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_13_cast_fp16 = add(x = var_505_cast_fp16, y = mask)[name = tensor<string, []>("aw_13_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_15_cast_fp16 = add(x = var_509_cast_fp16, y = mask)[name = tensor<string, []>("aw_15_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_17_cast_fp16 = add(x = var_513_cast_fp16, y = mask)[name = tensor<string, []>("aw_17_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_19_cast_fp16 = add(x = var_517_cast_fp16, y = mask)[name = tensor<string, []>("aw_19_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_21_cast_fp16 = add(x = var_521_cast_fp16, y = mask)[name = tensor<string, []>("aw_21_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_23_cast_fp16 = add(x = var_525_cast_fp16, y = mask)[name = tensor<string, []>("aw_23_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_25_cast_fp16 = add(x = var_529_cast_fp16, y = mask)[name = tensor<string, []>("aw_25_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_27_cast_fp16 = add(x = var_533_cast_fp16, y = mask)[name = tensor<string, []>("aw_27_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_29_cast_fp16 = add(x = var_537_cast_fp16, y = mask)[name = tensor<string, []>("aw_29_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_31_cast_fp16 = add(x = var_541_cast_fp16, y = mask)[name = tensor<string, []>("aw_31_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_33_cast_fp16 = add(x = var_545_cast_fp16, y = mask)[name = tensor<string, []>("aw_33_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_35_cast_fp16 = add(x = var_549_cast_fp16, y = mask)[name = tensor<string, []>("aw_35_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_37_cast_fp16 = add(x = var_553_cast_fp16, y = mask)[name = tensor<string, []>("aw_37_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_39_cast_fp16 = add(x = var_557_cast_fp16, y = mask)[name = tensor<string, []>("aw_39_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_41_cast_fp16 = add(x = var_561_cast_fp16, y = mask)[name = tensor<string, []>("aw_41_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_43_cast_fp16 = add(x = var_565_cast_fp16, y = mask)[name = tensor<string, []>("aw_43_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_45_cast_fp16 = add(x = var_569_cast_fp16, y = mask)[name = tensor<string, []>("aw_45_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_47_cast_fp16 = add(x = var_573_cast_fp16, y = mask)[name = tensor<string, []>("aw_47_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_598_cast_fp16 = softmax(axis = var_52, x = aw_1_cast_fp16)[name = tensor<string, []>("op_598_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_599_cast_fp16 = softmax(axis = var_52, x = aw_3_cast_fp16)[name = tensor<string, []>("op_599_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_600_cast_fp16 = softmax(axis = var_52, x = aw_5_cast_fp16)[name = tensor<string, []>("op_600_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_601_cast_fp16 = softmax(axis = var_52, x = aw_7_cast_fp16)[name = tensor<string, []>("op_601_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_602_cast_fp16 = softmax(axis = var_52, x = aw_9_cast_fp16)[name = tensor<string, []>("op_602_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_603_cast_fp16 = softmax(axis = var_52, x = aw_11_cast_fp16)[name = tensor<string, []>("op_603_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_604_cast_fp16 = softmax(axis = var_52, x = aw_13_cast_fp16)[name = tensor<string, []>("op_604_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_605_cast_fp16 = softmax(axis = var_52, x = aw_15_cast_fp16)[name = tensor<string, []>("op_605_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_606_cast_fp16 = softmax(axis = var_52, x = aw_17_cast_fp16)[name = tensor<string, []>("op_606_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_607_cast_fp16 = softmax(axis = var_52, x = aw_19_cast_fp16)[name = tensor<string, []>("op_607_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_608_cast_fp16 = softmax(axis = var_52, x = aw_21_cast_fp16)[name = tensor<string, []>("op_608_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_609_cast_fp16 = softmax(axis = var_52, x = aw_23_cast_fp16)[name = tensor<string, []>("op_609_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_610_cast_fp16 = softmax(axis = var_52, x = aw_25_cast_fp16)[name = tensor<string, []>("op_610_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_611_cast_fp16 = softmax(axis = var_52, x = aw_27_cast_fp16)[name = tensor<string, []>("op_611_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_612_cast_fp16 = softmax(axis = var_52, x = aw_29_cast_fp16)[name = tensor<string, []>("op_612_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_613_cast_fp16 = softmax(axis = var_52, x = aw_31_cast_fp16)[name = tensor<string, []>("op_613_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_614_cast_fp16 = softmax(axis = var_52, x = aw_33_cast_fp16)[name = tensor<string, []>("op_614_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_615_cast_fp16 = softmax(axis = var_52, x = aw_35_cast_fp16)[name = tensor<string, []>("op_615_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_616_cast_fp16 = softmax(axis = var_52, x = aw_37_cast_fp16)[name = tensor<string, []>("op_616_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_617_cast_fp16 = softmax(axis = var_52, x = aw_39_cast_fp16)[name = tensor<string, []>("op_617_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_618_cast_fp16 = softmax(axis = var_52, x = aw_41_cast_fp16)[name = tensor<string, []>("op_618_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_619_cast_fp16 = softmax(axis = var_52, x = aw_43_cast_fp16)[name = tensor<string, []>("op_619_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_620_cast_fp16 = softmax(axis = var_52, x = aw_45_cast_fp16)[name = tensor<string, []>("op_620_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_621_cast_fp16 = softmax(axis = var_52, x = aw_47_cast_fp16)[name = tensor<string, []>("op_621_cast_fp16")];
+            tensor<string, []> var_623_equation_0 = const()[name = tensor<string, []>("op_623_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_623_cast_fp16 = einsum(equation = var_623_equation_0, values = (var_383_cast_fp16, var_598_cast_fp16))[name = tensor<string, []>("op_623_cast_fp16")];
+            tensor<string, []> var_625_equation_0 = const()[name = tensor<string, []>("op_625_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_625_cast_fp16 = einsum(equation = var_625_equation_0, values = (var_383_cast_fp16, var_599_cast_fp16))[name = tensor<string, []>("op_625_cast_fp16")];
+            tensor<string, []> var_627_equation_0 = const()[name = tensor<string, []>("op_627_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_627_cast_fp16 = einsum(equation = var_627_equation_0, values = (var_383_cast_fp16, var_600_cast_fp16))[name = tensor<string, []>("op_627_cast_fp16")];
+            tensor<string, []> var_629_equation_0 = const()[name = tensor<string, []>("op_629_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_629_cast_fp16 = einsum(equation = var_629_equation_0, values = (var_395_cast_fp16, var_601_cast_fp16))[name = tensor<string, []>("op_629_cast_fp16")];
+            tensor<string, []> var_631_equation_0 = const()[name = tensor<string, []>("op_631_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_631_cast_fp16 = einsum(equation = var_631_equation_0, values = (var_395_cast_fp16, var_602_cast_fp16))[name = tensor<string, []>("op_631_cast_fp16")];
+            tensor<string, []> var_633_equation_0 = const()[name = tensor<string, []>("op_633_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_633_cast_fp16 = einsum(equation = var_633_equation_0, values = (var_395_cast_fp16, var_603_cast_fp16))[name = tensor<string, []>("op_633_cast_fp16")];
+            tensor<string, []> var_635_equation_0 = const()[name = tensor<string, []>("op_635_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_635_cast_fp16 = einsum(equation = var_635_equation_0, values = (var_407_cast_fp16, var_604_cast_fp16))[name = tensor<string, []>("op_635_cast_fp16")];
+            tensor<string, []> var_637_equation_0 = const()[name = tensor<string, []>("op_637_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_637_cast_fp16 = einsum(equation = var_637_equation_0, values = (var_407_cast_fp16, var_605_cast_fp16))[name = tensor<string, []>("op_637_cast_fp16")];
+            tensor<string, []> var_639_equation_0 = const()[name = tensor<string, []>("op_639_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_639_cast_fp16 = einsum(equation = var_639_equation_0, values = (var_407_cast_fp16, var_606_cast_fp16))[name = tensor<string, []>("op_639_cast_fp16")];
+            tensor<string, []> var_641_equation_0 = const()[name = tensor<string, []>("op_641_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_641_cast_fp16 = einsum(equation = var_641_equation_0, values = (var_419_cast_fp16, var_607_cast_fp16))[name = tensor<string, []>("op_641_cast_fp16")];
+            tensor<string, []> var_643_equation_0 = const()[name = tensor<string, []>("op_643_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_643_cast_fp16 = einsum(equation = var_643_equation_0, values = (var_419_cast_fp16, var_608_cast_fp16))[name = tensor<string, []>("op_643_cast_fp16")];
+            tensor<string, []> var_645_equation_0 = const()[name = tensor<string, []>("op_645_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_645_cast_fp16 = einsum(equation = var_645_equation_0, values = (var_419_cast_fp16, var_609_cast_fp16))[name = tensor<string, []>("op_645_cast_fp16")];
+            tensor<string, []> var_647_equation_0 = const()[name = tensor<string, []>("op_647_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_647_cast_fp16 = einsum(equation = var_647_equation_0, values = (var_431_cast_fp16, var_610_cast_fp16))[name = tensor<string, []>("op_647_cast_fp16")];
+            tensor<string, []> var_649_equation_0 = const()[name = tensor<string, []>("op_649_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_649_cast_fp16 = einsum(equation = var_649_equation_0, values = (var_431_cast_fp16, var_611_cast_fp16))[name = tensor<string, []>("op_649_cast_fp16")];
+            tensor<string, []> var_651_equation_0 = const()[name = tensor<string, []>("op_651_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_651_cast_fp16 = einsum(equation = var_651_equation_0, values = (var_431_cast_fp16, var_612_cast_fp16))[name = tensor<string, []>("op_651_cast_fp16")];
+            tensor<string, []> var_653_equation_0 = const()[name = tensor<string, []>("op_653_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_653_cast_fp16 = einsum(equation = var_653_equation_0, values = (var_443_cast_fp16, var_613_cast_fp16))[name = tensor<string, []>("op_653_cast_fp16")];
+            tensor<string, []> var_655_equation_0 = const()[name = tensor<string, []>("op_655_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_655_cast_fp16 = einsum(equation = var_655_equation_0, values = (var_443_cast_fp16, var_614_cast_fp16))[name = tensor<string, []>("op_655_cast_fp16")];
+            tensor<string, []> var_657_equation_0 = const()[name = tensor<string, []>("op_657_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_657_cast_fp16 = einsum(equation = var_657_equation_0, values = (var_443_cast_fp16, var_615_cast_fp16))[name = tensor<string, []>("op_657_cast_fp16")];
+            tensor<string, []> var_659_equation_0 = const()[name = tensor<string, []>("op_659_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_659_cast_fp16 = einsum(equation = var_659_equation_0, values = (var_455_cast_fp16, var_616_cast_fp16))[name = tensor<string, []>("op_659_cast_fp16")];
+            tensor<string, []> var_661_equation_0 = const()[name = tensor<string, []>("op_661_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_661_cast_fp16 = einsum(equation = var_661_equation_0, values = (var_455_cast_fp16, var_617_cast_fp16))[name = tensor<string, []>("op_661_cast_fp16")];
+            tensor<string, []> var_663_equation_0 = const()[name = tensor<string, []>("op_663_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_663_cast_fp16 = einsum(equation = var_663_equation_0, values = (var_455_cast_fp16, var_618_cast_fp16))[name = tensor<string, []>("op_663_cast_fp16")];
+            tensor<string, []> var_665_equation_0 = const()[name = tensor<string, []>("op_665_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_665_cast_fp16 = einsum(equation = var_665_equation_0, values = (var_467_cast_fp16, var_619_cast_fp16))[name = tensor<string, []>("op_665_cast_fp16")];
+            tensor<string, []> var_667_equation_0 = const()[name = tensor<string, []>("op_667_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_667_cast_fp16 = einsum(equation = var_667_equation_0, values = (var_467_cast_fp16, var_620_cast_fp16))[name = tensor<string, []>("op_667_cast_fp16")];
+            tensor<string, []> var_669_equation_0 = const()[name = tensor<string, []>("op_669_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_669_cast_fp16 = einsum(equation = var_669_equation_0, values = (var_467_cast_fp16, var_621_cast_fp16))[name = tensor<string, []>("op_669_cast_fp16")];
+            tensor<bool, []> x_11_interleave_0 = const()[name = tensor<string, []>("x_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_11_cast_fp16 = concat(axis = var_52, interleave = x_11_interleave_0, values = (var_623_cast_fp16, var_625_cast_fp16, var_627_cast_fp16, var_629_cast_fp16, var_631_cast_fp16, var_633_cast_fp16, var_635_cast_fp16, var_637_cast_fp16, var_639_cast_fp16, var_641_cast_fp16, var_643_cast_fp16, var_645_cast_fp16, var_647_cast_fp16, var_649_cast_fp16, var_651_cast_fp16, var_653_cast_fp16, var_655_cast_fp16, var_657_cast_fp16, var_659_cast_fp16, var_661_cast_fp16, var_663_cast_fp16, var_665_cast_fp16, var_667_cast_fp16, var_669_cast_fp16))[name = tensor<string, []>("x_11_cast_fp16")];
+            tensor<int32, [4]> var_674 = const()[name = tensor<string, []>("op_674"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_3_cast_fp16 = reshape(shape = var_674, x = x_11_cast_fp16)[name = tensor<string, []>("input_3_cast_fp16")];
+            tensor<int32, [2]> var_677 = const()[name = tensor<string, []>("op_677"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_679 = const()[name = tensor<string, []>("op_679"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_1_pad_type_0 = const()[name = tensor<string, []>("attention_output_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_1_pad_0 = const()[name = tensor<string, []>("attention_output_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31463936)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_1_cast_fp16 = conv(dilations = var_679, groups = var_52, pad = attention_output_1_pad_0, pad_type = attention_output_1_pad_type_0, strides = var_677, weight = blocks_0_attn_proj_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_13_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = tensor<string, []>("x_13_cast_fp16")];
+            tensor<bool, []> x_eps_3_interleave_0 = const()[name = tensor<string, []>("x_eps_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_3_to_fp16 = const()[name = tensor<string, []>("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338368)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_3_cast_fp16 = concat(axis = var_52, interleave = x_eps_3_interleave_0, values = (x_13_cast_fp16, eps_chan_3_to_fp16))[name = tensor<string, []>("x_eps_3_cast_fp16")];
+            tensor<int32, [1]> norm_x_3_axes_0 = const()[name = tensor<string, []>("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_55, x = x_eps_3_cast_fp16)[name = tensor<string, []>("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_7_cast_fp16 = real_div(x = x_13_cast_fp16, y = norm_x_3_cast_fp16)[name = tensor<string, []>("x_normed_7_cast_fp16")];
+            tensor<fp16, []> var_705_to_fp16 = const()[name = tensor<string, []>("op_705_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_705_to_fp16)[name = tensor<string, []>("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338560)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_5_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
+            tensor<int32, [2]> var_716 = const()[name = tensor<string, []>("op_716"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_718 = const()[name = tensor<string, []>("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_7_pad_type_0 = const()[name = tensor<string, []>("input_7_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_7_pad_0 = const()[name = tensor<string, []>("input_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50344768)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_7_cast_fp16 = conv(dilations = var_718, groups = var_52, pad = input_7_pad_0, pad_type = input_7_pad_type_0, strides = var_716, weight = blocks_0_mlp_fc_1_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
+            tensor<int32, [2]> var_722 = const()[name = tensor<string, []>("op_722"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_724 = const()[name = tensor<string, []>("op_724"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_1_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_1_pad_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(100676480)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_1_cast_fp16 = conv(dilations = var_724, groups = var_52, pad = x_fc_2_1_pad_0, pad_type = x_fc_2_1_pad_type_0, strides = var_722, weight = blocks_0_mlp_fc_2_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_727_cast_fp16 = silu(x = input_7_cast_fp16)[name = tensor<string, []>("op_727_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_9_cast_fp16 = mul(x = var_727_cast_fp16, y = x_fc_2_1_cast_fp16)[name = tensor<string, []>("input_9_cast_fp16")];
+            tensor<int32, [2]> var_730 = const()[name = tensor<string, []>("op_730"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_732 = const()[name = tensor<string, []>("op_732"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_734_pad_type_0 = const()[name = tensor<string, []>("op_734_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_734_pad_0 = const()[name = tensor<string, []>("op_734_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_0_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151008192)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_734_cast_fp16 = conv(dilations = var_732, groups = var_52, pad = var_734_pad_0, pad_type = var_734_pad_type_0, strides = var_730, weight = blocks_0_mlp_proj_weight_to_fp16, x = input_9_cast_fp16)[name = tensor<string, []>("op_734_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_17_cast_fp16 = add(x = var_734_cast_fp16, y = x_13_cast_fp16)[name = tensor<string, []>("x_17_cast_fp16")];
+            tensor<int32, []> var_740 = const()[name = tensor<string, []>("op_740"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_744 = const()[name = tensor<string, []>("op_744"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_746 = const()[name = tensor<string, []>("op_746"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_779 = const()[name = tensor<string, []>("op_779"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_782 = const()[name = tensor<string, []>("op_782"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_5_interleave_0 = const()[name = tensor<string, []>("x_eps_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_5_to_fp16 = const()[name = tensor<string, []>("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201339904)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_5_cast_fp16 = concat(axis = var_779, interleave = x_eps_5_interleave_0, values = (x_17_cast_fp16, eps_chan_5_to_fp16))[name = tensor<string, []>("x_eps_5_cast_fp16")];
+            tensor<int32, [1]> norm_x_5_axes_0 = const()[name = tensor<string, []>("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_782, x = x_eps_5_cast_fp16)[name = tensor<string, []>("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_13_cast_fp16 = real_div(x = x_17_cast_fp16, y = norm_x_5_cast_fp16)[name = tensor<string, []>("x_normed_13_cast_fp16")];
+            tensor<fp16, []> var_805_to_fp16 = const()[name = tensor<string, []>("op_805_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_805_to_fp16)[name = tensor<string, []>("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201340096)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_21_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = tensor<string, []>("x_21_cast_fp16")];
+            tensor<int32, [4]> var_829 = const()[name = tensor<string, []>("op_829"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_11_cast_fp16 = reshape(shape = var_829, x = x_21_cast_fp16)[name = tensor<string, []>("input_11_cast_fp16")];
+            tensor<int32, [2]> var_832 = const()[name = tensor<string, []>("op_832"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_834 = const()[name = tensor<string, []>("op_834"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_9_pad_type_0 = const()[name = tensor<string, []>("q_9_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_9_pad_0 = const()[name = tensor<string, []>("q_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201346304)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_9_cast_fp16 = conv(dilations = var_834, groups = var_779, pad = q_9_pad_0, pad_type = q_9_pad_type_0, strides = var_832, weight = blocks_1_attn_q_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("q_9_cast_fp16")];
+            tensor<int32, [2]> var_838 = const()[name = tensor<string, []>("op_838"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_840 = const()[name = tensor<string, []>("op_840"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_13_pad_type_0 = const()[name = tensor<string, []>("k_13_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_13_pad_0 = const()[name = tensor<string, []>("k_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(220220736)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_13_cast_fp16 = conv(dilations = var_840, groups = var_779, pad = k_13_pad_0, pad_type = k_13_pad_type_0, strides = var_838, weight = blocks_1_attn_k_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("k_13_cast_fp16")];
+            tensor<int32, [2]> var_844 = const()[name = tensor<string, []>("op_844"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_846 = const()[name = tensor<string, []>("op_846"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_11_pad_type_0 = const()[name = tensor<string, []>("v_11_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_11_pad_0 = const()[name = tensor<string, []>("v_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226512256)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_11_cast_fp16 = conv(dilations = var_846, groups = var_779, pad = v_11_pad_0, pad_type = v_11_pad_type_0, strides = var_844, weight = blocks_1_attn_v_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("v_11_cast_fp16")];
+            tensor<int32, [4]> var_849 = const()[name = tensor<string, []>("op_849"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_11_cast_fp16 = reshape(shape = var_849, x = q_9_cast_fp16)[name = tensor<string, []>("q_11_cast_fp16")];
+            tensor<int32, [4]> var_851 = const()[name = tensor<string, []>("op_851"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_15_cast_fp16 = reshape(shape = var_851, x = k_13_cast_fp16)[name = tensor<string, []>("k_15_cast_fp16")];
+            tensor<int32, [4]> var_865_begin_0 = const()[name = tensor<string, []>("op_865_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_865_end_0 = const()[name = tensor<string, []>("op_865_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_865_end_mask_0 = const()[name = tensor<string, []>("op_865_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_865_cast_fp16 = slice_by_index(begin = var_865_begin_0, end = var_865_end_0, end_mask = var_865_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_865_cast_fp16")];
+            tensor<int32, [4]> var_871_begin_0 = const()[name = tensor<string, []>("op_871_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_871_end_0 = const()[name = tensor<string, []>("op_871_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_871_end_mask_0 = const()[name = tensor<string, []>("op_871_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_871_cast_fp16 = slice_by_index(begin = var_871_begin_0, end = var_871_end_0, end_mask = var_871_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_871_cast_fp16")];
+            tensor<fp16, []> const_30_promoted_to_fp16 = const()[name = tensor<string, []>("const_30_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_873_cast_fp16 = mul(x = var_871_cast_fp16, y = const_30_promoted_to_fp16)[name = tensor<string, []>("op_873_cast_fp16")];
+            tensor<bool, []> rotated_5_interleave_0 = const()[name = tensor<string, []>("rotated_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_5_cast_fp16 = concat(axis = var_744, interleave = rotated_5_interleave_0, values = (var_873_cast_fp16, var_865_cast_fp16))[name = tensor<string, []>("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_876_cast_fp16 = mul(x = q_11_cast_fp16, y = cos)[name = tensor<string, []>("op_876_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_877_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = tensor<string, []>("op_877_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_5_cast_fp16 = add(x = var_876_cast_fp16, y = var_877_cast_fp16)[name = tensor<string, []>("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_890_begin_0 = const()[name = tensor<string, []>("op_890_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_890_end_0 = const()[name = tensor<string, []>("op_890_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_890_end_mask_0 = const()[name = tensor<string, []>("op_890_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_890_cast_fp16 = slice_by_index(begin = var_890_begin_0, end = var_890_end_0, end_mask = var_890_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_890_cast_fp16")];
+            tensor<int32, [4]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_896_cast_fp16 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_896_cast_fp16")];
+            tensor<fp16, []> const_32_promoted_to_fp16 = const()[name = tensor<string, []>("const_32_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_898_cast_fp16 = mul(x = var_896_cast_fp16, y = const_32_promoted_to_fp16)[name = tensor<string, []>("op_898_cast_fp16")];
+            tensor<bool, []> rotated_interleave_0 = const()[name = tensor<string, []>("rotated_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_cast_fp16 = concat(axis = var_744, interleave = rotated_interleave_0, values = (var_898_cast_fp16, var_890_cast_fp16))[name = tensor<string, []>("rotated_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_901_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = tensor<string, []>("op_901_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_902_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = tensor<string, []>("op_902_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_cast_fp16 = add(x = var_901_cast_fp16, y = var_902_cast_fp16)[name = tensor<string, []>("roped_cast_fp16")];
+            tensor<int32, [4]> var_905 = const()[name = tensor<string, []>("op_905"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_19_cast_fp16 = reshape(shape = var_905, x = roped_cast_fp16)[name = tensor<string, []>("k_19_cast_fp16")];
+            tensor<int32, [4]> var_907 = const()[name = tensor<string, []>("op_907"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_1 = reshape(shape = var_907, x = v_11_cast_fp16)[name = tensor<string, []>("new_v_cache_1_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_interleave_0 = const()[name = tensor<string, []>("k_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_1 = transpose(perm = k_21_perm_0, x = k_19_cast_fp16)[name = tensor<string, []>("transpose_0")];
+            tensor<fp16, [1, 512, 1, 1024]> k_cast_fp16 = concat(axis = var_746, interleave = k_interleave_0, values = (k_cache_1, new_k_cache_1))[name = tensor<string, []>("k_cast_fp16")];
+            tensor<bool, []> v_17_interleave_0 = const()[name = tensor<string, []>("v_17_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_17_cast_fp16 = concat(axis = var_740, interleave = v_17_interleave_0, values = (v_cache_1, new_v_cache_1))[name = tensor<string, []>("v_17_cast_fp16")];
+            tensor<int32, [4]> var_915 = const()[name = tensor<string, []>("op_915"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_cast_fp16 = reshape(shape = var_915, x = roped_5_cast_fp16)[name = tensor<string, []>("q_cast_fp16")];
+            tensor<int32, [4]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_920_cast_fp16 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_920_cast_fp16")];
+            tensor<int32, [4]> var_924_begin_0 = const()[name = tensor<string, []>("op_924_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_924_end_0 = const()[name = tensor<string, []>("op_924_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_924_end_mask_0 = const()[name = tensor<string, []>("op_924_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_924_cast_fp16 = slice_by_index(begin = var_924_begin_0, end = var_924_end_0, end_mask = var_924_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_924_cast_fp16")];
+            tensor<int32, [4]> var_928_begin_0 = const()[name = tensor<string, []>("op_928_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_928_end_0 = const()[name = tensor<string, []>("op_928_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_928_end_mask_0 = const()[name = tensor<string, []>("op_928_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_928_cast_fp16 = slice_by_index(begin = var_928_begin_0, end = var_928_end_0, end_mask = var_928_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_928_cast_fp16")];
+            tensor<int32, [4]> var_932_begin_0 = const()[name = tensor<string, []>("op_932_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_932_end_0 = const()[name = tensor<string, []>("op_932_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_932_end_mask_0 = const()[name = tensor<string, []>("op_932_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_932_cast_fp16 = slice_by_index(begin = var_932_begin_0, end = var_932_end_0, end_mask = var_932_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_932_cast_fp16")];
+            tensor<int32, [4]> var_936_begin_0 = const()[name = tensor<string, []>("op_936_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_936_end_0 = const()[name = tensor<string, []>("op_936_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_936_end_mask_0 = const()[name = tensor<string, []>("op_936_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_936_cast_fp16 = slice_by_index(begin = var_936_begin_0, end = var_936_end_0, end_mask = var_936_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_936_cast_fp16")];
+            tensor<int32, [4]> var_940_begin_0 = const()[name = tensor<string, []>("op_940_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_940_end_0 = const()[name = tensor<string, []>("op_940_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_940_end_mask_0 = const()[name = tensor<string, []>("op_940_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_940_cast_fp16 = slice_by_index(begin = var_940_begin_0, end = var_940_end_0, end_mask = var_940_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_940_cast_fp16")];
+            tensor<int32, [4]> var_944_begin_0 = const()[name = tensor<string, []>("op_944_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_944_end_0 = const()[name = tensor<string, []>("op_944_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_944_end_mask_0 = const()[name = tensor<string, []>("op_944_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_944_cast_fp16 = slice_by_index(begin = var_944_begin_0, end = var_944_end_0, end_mask = var_944_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_944_cast_fp16")];
+            tensor<int32, [4]> var_948_begin_0 = const()[name = tensor<string, []>("op_948_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_948_end_0 = const()[name = tensor<string, []>("op_948_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_948_end_mask_0 = const()[name = tensor<string, []>("op_948_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_948_cast_fp16 = slice_by_index(begin = var_948_begin_0, end = var_948_end_0, end_mask = var_948_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_948_cast_fp16")];
+            tensor<int32, [4]> var_952_begin_0 = const()[name = tensor<string, []>("op_952_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_952_end_0 = const()[name = tensor<string, []>("op_952_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_952_end_mask_0 = const()[name = tensor<string, []>("op_952_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_952_cast_fp16 = slice_by_index(begin = var_952_begin_0, end = var_952_end_0, end_mask = var_952_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_952_cast_fp16")];
+            tensor<int32, [4]> var_956_begin_0 = const()[name = tensor<string, []>("op_956_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_956_end_0 = const()[name = tensor<string, []>("op_956_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_956_end_mask_0 = const()[name = tensor<string, []>("op_956_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_956_cast_fp16 = slice_by_index(begin = var_956_begin_0, end = var_956_end_0, end_mask = var_956_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_956_cast_fp16")];
+            tensor<int32, [4]> var_960_begin_0 = const()[name = tensor<string, []>("op_960_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_960_end_0 = const()[name = tensor<string, []>("op_960_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_960_end_mask_0 = const()[name = tensor<string, []>("op_960_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_960_cast_fp16 = slice_by_index(begin = var_960_begin_0, end = var_960_end_0, end_mask = var_960_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_960_cast_fp16")];
+            tensor<int32, [4]> var_964_begin_0 = const()[name = tensor<string, []>("op_964_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_964_end_0 = const()[name = tensor<string, []>("op_964_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_964_end_mask_0 = const()[name = tensor<string, []>("op_964_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_964_cast_fp16 = slice_by_index(begin = var_964_begin_0, end = var_964_end_0, end_mask = var_964_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_964_cast_fp16")];
+            tensor<int32, [4]> var_968_begin_0 = const()[name = tensor<string, []>("op_968_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_968_end_0 = const()[name = tensor<string, []>("op_968_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_968_end_mask_0 = const()[name = tensor<string, []>("op_968_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_968_cast_fp16 = slice_by_index(begin = var_968_begin_0, end = var_968_end_0, end_mask = var_968_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_968_cast_fp16")];
+            tensor<int32, [4]> var_972_begin_0 = const()[name = tensor<string, []>("op_972_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_972_end_0 = const()[name = tensor<string, []>("op_972_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_972_end_mask_0 = const()[name = tensor<string, []>("op_972_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_972_cast_fp16 = slice_by_index(begin = var_972_begin_0, end = var_972_end_0, end_mask = var_972_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_972_cast_fp16")];
+            tensor<int32, [4]> var_976_begin_0 = const()[name = tensor<string, []>("op_976_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_976_end_0 = const()[name = tensor<string, []>("op_976_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_976_end_mask_0 = const()[name = tensor<string, []>("op_976_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_976_cast_fp16 = slice_by_index(begin = var_976_begin_0, end = var_976_end_0, end_mask = var_976_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_976_cast_fp16")];
+            tensor<int32, [4]> var_980_begin_0 = const()[name = tensor<string, []>("op_980_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_980_end_0 = const()[name = tensor<string, []>("op_980_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_980_end_mask_0 = const()[name = tensor<string, []>("op_980_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_980_cast_fp16 = slice_by_index(begin = var_980_begin_0, end = var_980_end_0, end_mask = var_980_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_980_cast_fp16")];
+            tensor<int32, [4]> var_984_begin_0 = const()[name = tensor<string, []>("op_984_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_984_end_0 = const()[name = tensor<string, []>("op_984_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_984_end_mask_0 = const()[name = tensor<string, []>("op_984_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_984_cast_fp16 = slice_by_index(begin = var_984_begin_0, end = var_984_end_0, end_mask = var_984_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_984_cast_fp16")];
+            tensor<int32, [4]> var_988_begin_0 = const()[name = tensor<string, []>("op_988_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_988_end_0 = const()[name = tensor<string, []>("op_988_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_988_end_mask_0 = const()[name = tensor<string, []>("op_988_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_988_cast_fp16 = slice_by_index(begin = var_988_begin_0, end = var_988_end_0, end_mask = var_988_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_988_cast_fp16")];
+            tensor<int32, [4]> var_992_begin_0 = const()[name = tensor<string, []>("op_992_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_992_end_0 = const()[name = tensor<string, []>("op_992_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_992_end_mask_0 = const()[name = tensor<string, []>("op_992_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_992_cast_fp16 = slice_by_index(begin = var_992_begin_0, end = var_992_end_0, end_mask = var_992_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_992_cast_fp16")];
+            tensor<int32, [4]> var_996_begin_0 = const()[name = tensor<string, []>("op_996_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_996_end_0 = const()[name = tensor<string, []>("op_996_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_996_end_mask_0 = const()[name = tensor<string, []>("op_996_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_996_cast_fp16 = slice_by_index(begin = var_996_begin_0, end = var_996_end_0, end_mask = var_996_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_996_cast_fp16")];
+            tensor<int32, [4]> var_1000_begin_0 = const()[name = tensor<string, []>("op_1000_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_1000_end_0 = const()[name = tensor<string, []>("op_1000_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_1000_end_mask_0 = const()[name = tensor<string, []>("op_1000_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1000_cast_fp16 = slice_by_index(begin = var_1000_begin_0, end = var_1000_end_0, end_mask = var_1000_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1000_cast_fp16")];
+            tensor<int32, [4]> var_1004_begin_0 = const()[name = tensor<string, []>("op_1004_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_1004_end_0 = const()[name = tensor<string, []>("op_1004_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_1004_end_mask_0 = const()[name = tensor<string, []>("op_1004_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1004_cast_fp16 = slice_by_index(begin = var_1004_begin_0, end = var_1004_end_0, end_mask = var_1004_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1004_cast_fp16")];
+            tensor<int32, [4]> var_1008_begin_0 = const()[name = tensor<string, []>("op_1008_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_1008_end_0 = const()[name = tensor<string, []>("op_1008_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_1008_end_mask_0 = const()[name = tensor<string, []>("op_1008_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1008_cast_fp16 = slice_by_index(begin = var_1008_begin_0, end = var_1008_end_0, end_mask = var_1008_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1008_cast_fp16")];
+            tensor<int32, [4]> var_1012_begin_0 = const()[name = tensor<string, []>("op_1012_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_1012_end_0 = const()[name = tensor<string, []>("op_1012_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_1012_end_mask_0 = const()[name = tensor<string, []>("op_1012_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1012_cast_fp16 = slice_by_index(begin = var_1012_begin_0, end = var_1012_end_0, end_mask = var_1012_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1012_cast_fp16")];
+            tensor<int32, [4]> var_1018_begin_0 = const()[name = tensor<string, []>("op_1018_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1018_end_0 = const()[name = tensor<string, []>("op_1018_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_1018_end_mask_0 = const()[name = tensor<string, []>("op_1018_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1018_cast_fp16 = slice_by_index(begin = var_1018_begin_0, end = var_1018_end_0, end_mask = var_1018_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1018_cast_fp16")];
+            tensor<int32, [4]> var_1030_begin_0 = const()[name = tensor<string, []>("op_1030_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_1030_end_0 = const()[name = tensor<string, []>("op_1030_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_1030_end_mask_0 = const()[name = tensor<string, []>("op_1030_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1030_cast_fp16 = slice_by_index(begin = var_1030_begin_0, end = var_1030_end_0, end_mask = var_1030_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1030_cast_fp16")];
+            tensor<int32, [4]> var_1042_begin_0 = const()[name = tensor<string, []>("op_1042_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_1042_end_0 = const()[name = tensor<string, []>("op_1042_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_1042_end_mask_0 = const()[name = tensor<string, []>("op_1042_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1042_cast_fp16 = slice_by_index(begin = var_1042_begin_0, end = var_1042_end_0, end_mask = var_1042_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1042_cast_fp16")];
+            tensor<int32, [4]> var_1054_begin_0 = const()[name = tensor<string, []>("op_1054_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_1054_end_0 = const()[name = tensor<string, []>("op_1054_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1054_end_mask_0 = const()[name = tensor<string, []>("op_1054_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1054_cast_fp16 = slice_by_index(begin = var_1054_begin_0, end = var_1054_end_0, end_mask = var_1054_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1054_cast_fp16")];
+            tensor<int32, [4]> var_1066_begin_0 = const()[name = tensor<string, []>("op_1066_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_1066_end_0 = const()[name = tensor<string, []>("op_1066_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_1066_end_mask_0 = const()[name = tensor<string, []>("op_1066_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1066_cast_fp16 = slice_by_index(begin = var_1066_begin_0, end = var_1066_end_0, end_mask = var_1066_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1066_cast_fp16")];
+            tensor<int32, [4]> var_1078_begin_0 = const()[name = tensor<string, []>("op_1078_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_1078_end_0 = const()[name = tensor<string, []>("op_1078_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_1078_end_mask_0 = const()[name = tensor<string, []>("op_1078_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1078_cast_fp16 = slice_by_index(begin = var_1078_begin_0, end = var_1078_end_0, end_mask = var_1078_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1078_cast_fp16")];
+            tensor<int32, [4]> var_1090_begin_0 = const()[name = tensor<string, []>("op_1090_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_1090_end_0 = const()[name = tensor<string, []>("op_1090_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_1090_end_mask_0 = const()[name = tensor<string, []>("op_1090_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1090_cast_fp16 = slice_by_index(begin = var_1090_begin_0, end = var_1090_end_0, end_mask = var_1090_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1090_cast_fp16")];
+            tensor<int32, [4]> var_1102_begin_0 = const()[name = tensor<string, []>("op_1102_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_1102_end_0 = const()[name = tensor<string, []>("op_1102_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_1102_end_mask_0 = const()[name = tensor<string, []>("op_1102_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1102_cast_fp16 = slice_by_index(begin = var_1102_begin_0, end = var_1102_end_0, end_mask = var_1102_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1102_cast_fp16")];
+            tensor<int32, [4]> var_1112_begin_0 = const()[name = tensor<string, []>("op_1112_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1112_end_0 = const()[name = tensor<string, []>("op_1112_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_1112_end_mask_0 = const()[name = tensor<string, []>("op_1112_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1112_cast_fp16 = slice_by_index(begin = var_1112_begin_0, end = var_1112_end_0, end_mask = var_1112_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1112_cast_fp16")];
+            tensor<int32, [4]> var_1124_begin_0 = const()[name = tensor<string, []>("op_1124_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_1124_end_0 = const()[name = tensor<string, []>("op_1124_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_1124_end_mask_0 = const()[name = tensor<string, []>("op_1124_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1124_cast_fp16 = slice_by_index(begin = var_1124_begin_0, end = var_1124_end_0, end_mask = var_1124_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1124_cast_fp16")];
+            tensor<int32, [4]> var_1136_begin_0 = const()[name = tensor<string, []>("op_1136_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_1136_end_0 = const()[name = tensor<string, []>("op_1136_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_1136_end_mask_0 = const()[name = tensor<string, []>("op_1136_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1136_cast_fp16 = slice_by_index(begin = var_1136_begin_0, end = var_1136_end_0, end_mask = var_1136_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1136_cast_fp16")];
+            tensor<int32, [4]> var_1148_begin_0 = const()[name = tensor<string, []>("op_1148_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_1148_end_0 = const()[name = tensor<string, []>("op_1148_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1148_end_mask_0 = const()[name = tensor<string, []>("op_1148_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1148_cast_fp16 = slice_by_index(begin = var_1148_begin_0, end = var_1148_end_0, end_mask = var_1148_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1148_cast_fp16")];
+            tensor<int32, [4]> var_1160_begin_0 = const()[name = tensor<string, []>("op_1160_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_1160_end_0 = const()[name = tensor<string, []>("op_1160_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_1160_end_mask_0 = const()[name = tensor<string, []>("op_1160_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1160_cast_fp16 = slice_by_index(begin = var_1160_begin_0, end = var_1160_end_0, end_mask = var_1160_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1160_cast_fp16")];
+            tensor<int32, [4]> var_1172_begin_0 = const()[name = tensor<string, []>("op_1172_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_1172_end_0 = const()[name = tensor<string, []>("op_1172_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_1172_end_mask_0 = const()[name = tensor<string, []>("op_1172_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1172_cast_fp16 = slice_by_index(begin = var_1172_begin_0, end = var_1172_end_0, end_mask = var_1172_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1172_cast_fp16")];
+            tensor<int32, [4]> var_1184_begin_0 = const()[name = tensor<string, []>("op_1184_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_1184_end_0 = const()[name = tensor<string, []>("op_1184_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_1184_end_mask_0 = const()[name = tensor<string, []>("op_1184_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1184_cast_fp16 = slice_by_index(begin = var_1184_begin_0, end = var_1184_end_0, end_mask = var_1184_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1184_cast_fp16")];
+            tensor<int32, [4]> var_1196_begin_0 = const()[name = tensor<string, []>("op_1196_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_1196_end_0 = const()[name = tensor<string, []>("op_1196_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_1196_end_mask_0 = const()[name = tensor<string, []>("op_1196_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1196_cast_fp16 = slice_by_index(begin = var_1196_begin_0, end = var_1196_end_0, end_mask = var_1196_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1196_cast_fp16")];
+            tensor<string, []> var_1208_equation_0 = const()[name = tensor<string, []>("op_1208_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1208_cast_fp16 = einsum(equation = var_1208_equation_0, values = (var_1018_cast_fp16, var_920_cast_fp16))[name = tensor<string, []>("op_1208_cast_fp16")];
+            tensor<fp16, []> var_1209_to_fp16 = const()[name = tensor<string, []>("op_1209_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1210_cast_fp16 = mul(x = var_1208_cast_fp16, y = var_1209_to_fp16)[name = tensor<string, []>("op_1210_cast_fp16")];
+            tensor<string, []> var_1212_equation_0 = const()[name = tensor<string, []>("op_1212_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1212_cast_fp16 = einsum(equation = var_1212_equation_0, values = (var_1018_cast_fp16, var_924_cast_fp16))[name = tensor<string, []>("op_1212_cast_fp16")];
+            tensor<fp16, []> var_1213_to_fp16 = const()[name = tensor<string, []>("op_1213_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1214_cast_fp16 = mul(x = var_1212_cast_fp16, y = var_1213_to_fp16)[name = tensor<string, []>("op_1214_cast_fp16")];
+            tensor<string, []> var_1216_equation_0 = const()[name = tensor<string, []>("op_1216_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1216_cast_fp16 = einsum(equation = var_1216_equation_0, values = (var_1018_cast_fp16, var_928_cast_fp16))[name = tensor<string, []>("op_1216_cast_fp16")];
+            tensor<fp16, []> var_1217_to_fp16 = const()[name = tensor<string, []>("op_1217_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1218_cast_fp16 = mul(x = var_1216_cast_fp16, y = var_1217_to_fp16)[name = tensor<string, []>("op_1218_cast_fp16")];
+            tensor<string, []> var_1220_equation_0 = const()[name = tensor<string, []>("op_1220_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1220_cast_fp16 = einsum(equation = var_1220_equation_0, values = (var_1030_cast_fp16, var_932_cast_fp16))[name = tensor<string, []>("op_1220_cast_fp16")];
+            tensor<fp16, []> var_1221_to_fp16 = const()[name = tensor<string, []>("op_1221_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1222_cast_fp16 = mul(x = var_1220_cast_fp16, y = var_1221_to_fp16)[name = tensor<string, []>("op_1222_cast_fp16")];
+            tensor<string, []> var_1224_equation_0 = const()[name = tensor<string, []>("op_1224_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1224_cast_fp16 = einsum(equation = var_1224_equation_0, values = (var_1030_cast_fp16, var_936_cast_fp16))[name = tensor<string, []>("op_1224_cast_fp16")];
+            tensor<fp16, []> var_1225_to_fp16 = const()[name = tensor<string, []>("op_1225_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1226_cast_fp16 = mul(x = var_1224_cast_fp16, y = var_1225_to_fp16)[name = tensor<string, []>("op_1226_cast_fp16")];
+            tensor<string, []> var_1228_equation_0 = const()[name = tensor<string, []>("op_1228_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1228_cast_fp16 = einsum(equation = var_1228_equation_0, values = (var_1030_cast_fp16, var_940_cast_fp16))[name = tensor<string, []>("op_1228_cast_fp16")];
+            tensor<fp16, []> var_1229_to_fp16 = const()[name = tensor<string, []>("op_1229_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1230_cast_fp16 = mul(x = var_1228_cast_fp16, y = var_1229_to_fp16)[name = tensor<string, []>("op_1230_cast_fp16")];
+            tensor<string, []> var_1232_equation_0 = const()[name = tensor<string, []>("op_1232_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1232_cast_fp16 = einsum(equation = var_1232_equation_0, values = (var_1042_cast_fp16, var_944_cast_fp16))[name = tensor<string, []>("op_1232_cast_fp16")];
+            tensor<fp16, []> var_1233_to_fp16 = const()[name = tensor<string, []>("op_1233_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1234_cast_fp16 = mul(x = var_1232_cast_fp16, y = var_1233_to_fp16)[name = tensor<string, []>("op_1234_cast_fp16")];
+            tensor<string, []> var_1236_equation_0 = const()[name = tensor<string, []>("op_1236_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1236_cast_fp16 = einsum(equation = var_1236_equation_0, values = (var_1042_cast_fp16, var_948_cast_fp16))[name = tensor<string, []>("op_1236_cast_fp16")];
+            tensor<fp16, []> var_1237_to_fp16 = const()[name = tensor<string, []>("op_1237_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1238_cast_fp16 = mul(x = var_1236_cast_fp16, y = var_1237_to_fp16)[name = tensor<string, []>("op_1238_cast_fp16")];
+            tensor<string, []> var_1240_equation_0 = const()[name = tensor<string, []>("op_1240_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1240_cast_fp16 = einsum(equation = var_1240_equation_0, values = (var_1042_cast_fp16, var_952_cast_fp16))[name = tensor<string, []>("op_1240_cast_fp16")];
+            tensor<fp16, []> var_1241_to_fp16 = const()[name = tensor<string, []>("op_1241_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1242_cast_fp16 = mul(x = var_1240_cast_fp16, y = var_1241_to_fp16)[name = tensor<string, []>("op_1242_cast_fp16")];
+            tensor<string, []> var_1244_equation_0 = const()[name = tensor<string, []>("op_1244_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1244_cast_fp16 = einsum(equation = var_1244_equation_0, values = (var_1054_cast_fp16, var_956_cast_fp16))[name = tensor<string, []>("op_1244_cast_fp16")];
+            tensor<fp16, []> var_1245_to_fp16 = const()[name = tensor<string, []>("op_1245_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1246_cast_fp16 = mul(x = var_1244_cast_fp16, y = var_1245_to_fp16)[name = tensor<string, []>("op_1246_cast_fp16")];
+            tensor<string, []> var_1248_equation_0 = const()[name = tensor<string, []>("op_1248_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1248_cast_fp16 = einsum(equation = var_1248_equation_0, values = (var_1054_cast_fp16, var_960_cast_fp16))[name = tensor<string, []>("op_1248_cast_fp16")];
+            tensor<fp16, []> var_1249_to_fp16 = const()[name = tensor<string, []>("op_1249_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1250_cast_fp16 = mul(x = var_1248_cast_fp16, y = var_1249_to_fp16)[name = tensor<string, []>("op_1250_cast_fp16")];
+            tensor<string, []> var_1252_equation_0 = const()[name = tensor<string, []>("op_1252_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1252_cast_fp16 = einsum(equation = var_1252_equation_0, values = (var_1054_cast_fp16, var_964_cast_fp16))[name = tensor<string, []>("op_1252_cast_fp16")];
+            tensor<fp16, []> var_1253_to_fp16 = const()[name = tensor<string, []>("op_1253_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1254_cast_fp16 = mul(x = var_1252_cast_fp16, y = var_1253_to_fp16)[name = tensor<string, []>("op_1254_cast_fp16")];
+            tensor<string, []> var_1256_equation_0 = const()[name = tensor<string, []>("op_1256_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1256_cast_fp16 = einsum(equation = var_1256_equation_0, values = (var_1066_cast_fp16, var_968_cast_fp16))[name = tensor<string, []>("op_1256_cast_fp16")];
+            tensor<fp16, []> var_1257_to_fp16 = const()[name = tensor<string, []>("op_1257_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1258_cast_fp16 = mul(x = var_1256_cast_fp16, y = var_1257_to_fp16)[name = tensor<string, []>("op_1258_cast_fp16")];
+            tensor<string, []> var_1260_equation_0 = const()[name = tensor<string, []>("op_1260_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1260_cast_fp16 = einsum(equation = var_1260_equation_0, values = (var_1066_cast_fp16, var_972_cast_fp16))[name = tensor<string, []>("op_1260_cast_fp16")];
+            tensor<fp16, []> var_1261_to_fp16 = const()[name = tensor<string, []>("op_1261_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1262_cast_fp16 = mul(x = var_1260_cast_fp16, y = var_1261_to_fp16)[name = tensor<string, []>("op_1262_cast_fp16")];
+            tensor<string, []> var_1264_equation_0 = const()[name = tensor<string, []>("op_1264_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1264_cast_fp16 = einsum(equation = var_1264_equation_0, values = (var_1066_cast_fp16, var_976_cast_fp16))[name = tensor<string, []>("op_1264_cast_fp16")];
+            tensor<fp16, []> var_1265_to_fp16 = const()[name = tensor<string, []>("op_1265_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1266_cast_fp16 = mul(x = var_1264_cast_fp16, y = var_1265_to_fp16)[name = tensor<string, []>("op_1266_cast_fp16")];
+            tensor<string, []> var_1268_equation_0 = const()[name = tensor<string, []>("op_1268_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1268_cast_fp16 = einsum(equation = var_1268_equation_0, values = (var_1078_cast_fp16, var_980_cast_fp16))[name = tensor<string, []>("op_1268_cast_fp16")];
+            tensor<fp16, []> var_1269_to_fp16 = const()[name = tensor<string, []>("op_1269_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1270_cast_fp16 = mul(x = var_1268_cast_fp16, y = var_1269_to_fp16)[name = tensor<string, []>("op_1270_cast_fp16")];
+            tensor<string, []> var_1272_equation_0 = const()[name = tensor<string, []>("op_1272_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1272_cast_fp16 = einsum(equation = var_1272_equation_0, values = (var_1078_cast_fp16, var_984_cast_fp16))[name = tensor<string, []>("op_1272_cast_fp16")];
+            tensor<fp16, []> var_1273_to_fp16 = const()[name = tensor<string, []>("op_1273_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1274_cast_fp16 = mul(x = var_1272_cast_fp16, y = var_1273_to_fp16)[name = tensor<string, []>("op_1274_cast_fp16")];
+            tensor<string, []> var_1276_equation_0 = const()[name = tensor<string, []>("op_1276_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1276_cast_fp16 = einsum(equation = var_1276_equation_0, values = (var_1078_cast_fp16, var_988_cast_fp16))[name = tensor<string, []>("op_1276_cast_fp16")];
+            tensor<fp16, []> var_1277_to_fp16 = const()[name = tensor<string, []>("op_1277_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1278_cast_fp16 = mul(x = var_1276_cast_fp16, y = var_1277_to_fp16)[name = tensor<string, []>("op_1278_cast_fp16")];
+            tensor<string, []> var_1280_equation_0 = const()[name = tensor<string, []>("op_1280_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1280_cast_fp16 = einsum(equation = var_1280_equation_0, values = (var_1090_cast_fp16, var_992_cast_fp16))[name = tensor<string, []>("op_1280_cast_fp16")];
+            tensor<fp16, []> var_1281_to_fp16 = const()[name = tensor<string, []>("op_1281_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1282_cast_fp16 = mul(x = var_1280_cast_fp16, y = var_1281_to_fp16)[name = tensor<string, []>("op_1282_cast_fp16")];
+            tensor<string, []> var_1284_equation_0 = const()[name = tensor<string, []>("op_1284_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1284_cast_fp16 = einsum(equation = var_1284_equation_0, values = (var_1090_cast_fp16, var_996_cast_fp16))[name = tensor<string, []>("op_1284_cast_fp16")];
+            tensor<fp16, []> var_1285_to_fp16 = const()[name = tensor<string, []>("op_1285_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1286_cast_fp16 = mul(x = var_1284_cast_fp16, y = var_1285_to_fp16)[name = tensor<string, []>("op_1286_cast_fp16")];
+            tensor<string, []> var_1288_equation_0 = const()[name = tensor<string, []>("op_1288_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1288_cast_fp16 = einsum(equation = var_1288_equation_0, values = (var_1090_cast_fp16, var_1000_cast_fp16))[name = tensor<string, []>("op_1288_cast_fp16")];
+            tensor<fp16, []> var_1289_to_fp16 = const()[name = tensor<string, []>("op_1289_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1290_cast_fp16 = mul(x = var_1288_cast_fp16, y = var_1289_to_fp16)[name = tensor<string, []>("op_1290_cast_fp16")];
+            tensor<string, []> var_1292_equation_0 = const()[name = tensor<string, []>("op_1292_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1292_cast_fp16 = einsum(equation = var_1292_equation_0, values = (var_1102_cast_fp16, var_1004_cast_fp16))[name = tensor<string, []>("op_1292_cast_fp16")];
+            tensor<fp16, []> var_1293_to_fp16 = const()[name = tensor<string, []>("op_1293_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1294_cast_fp16 = mul(x = var_1292_cast_fp16, y = var_1293_to_fp16)[name = tensor<string, []>("op_1294_cast_fp16")];
+            tensor<string, []> var_1296_equation_0 = const()[name = tensor<string, []>("op_1296_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1296_cast_fp16 = einsum(equation = var_1296_equation_0, values = (var_1102_cast_fp16, var_1008_cast_fp16))[name = tensor<string, []>("op_1296_cast_fp16")];
+            tensor<fp16, []> var_1297_to_fp16 = const()[name = tensor<string, []>("op_1297_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1298_cast_fp16 = mul(x = var_1296_cast_fp16, y = var_1297_to_fp16)[name = tensor<string, []>("op_1298_cast_fp16")];
+            tensor<string, []> var_1300_equation_0 = const()[name = tensor<string, []>("op_1300_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1300_cast_fp16 = einsum(equation = var_1300_equation_0, values = (var_1102_cast_fp16, var_1012_cast_fp16))[name = tensor<string, []>("op_1300_cast_fp16")];
+            tensor<fp16, []> var_1301_to_fp16 = const()[name = tensor<string, []>("op_1301_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1302_cast_fp16 = mul(x = var_1300_cast_fp16, y = var_1301_to_fp16)[name = tensor<string, []>("op_1302_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_49_cast_fp16 = add(x = var_1210_cast_fp16, y = mask)[name = tensor<string, []>("aw_49_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_51_cast_fp16 = add(x = var_1214_cast_fp16, y = mask)[name = tensor<string, []>("aw_51_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_53_cast_fp16 = add(x = var_1218_cast_fp16, y = mask)[name = tensor<string, []>("aw_53_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_55_cast_fp16 = add(x = var_1222_cast_fp16, y = mask)[name = tensor<string, []>("aw_55_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_57_cast_fp16 = add(x = var_1226_cast_fp16, y = mask)[name = tensor<string, []>("aw_57_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_59_cast_fp16 = add(x = var_1230_cast_fp16, y = mask)[name = tensor<string, []>("aw_59_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_61_cast_fp16 = add(x = var_1234_cast_fp16, y = mask)[name = tensor<string, []>("aw_61_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_63_cast_fp16 = add(x = var_1238_cast_fp16, y = mask)[name = tensor<string, []>("aw_63_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_65_cast_fp16 = add(x = var_1242_cast_fp16, y = mask)[name = tensor<string, []>("aw_65_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_67_cast_fp16 = add(x = var_1246_cast_fp16, y = mask)[name = tensor<string, []>("aw_67_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_69_cast_fp16 = add(x = var_1250_cast_fp16, y = mask)[name = tensor<string, []>("aw_69_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_71_cast_fp16 = add(x = var_1254_cast_fp16, y = mask)[name = tensor<string, []>("aw_71_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_73_cast_fp16 = add(x = var_1258_cast_fp16, y = mask)[name = tensor<string, []>("aw_73_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_75_cast_fp16 = add(x = var_1262_cast_fp16, y = mask)[name = tensor<string, []>("aw_75_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_77_cast_fp16 = add(x = var_1266_cast_fp16, y = mask)[name = tensor<string, []>("aw_77_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_79_cast_fp16 = add(x = var_1270_cast_fp16, y = mask)[name = tensor<string, []>("aw_79_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_81_cast_fp16 = add(x = var_1274_cast_fp16, y = mask)[name = tensor<string, []>("aw_81_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_83_cast_fp16 = add(x = var_1278_cast_fp16, y = mask)[name = tensor<string, []>("aw_83_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_85_cast_fp16 = add(x = var_1282_cast_fp16, y = mask)[name = tensor<string, []>("aw_85_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_87_cast_fp16 = add(x = var_1286_cast_fp16, y = mask)[name = tensor<string, []>("aw_87_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_89_cast_fp16 = add(x = var_1290_cast_fp16, y = mask)[name = tensor<string, []>("aw_89_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_91_cast_fp16 = add(x = var_1294_cast_fp16, y = mask)[name = tensor<string, []>("aw_91_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_93_cast_fp16 = add(x = var_1298_cast_fp16, y = mask)[name = tensor<string, []>("aw_93_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_cast_fp16 = add(x = var_1302_cast_fp16, y = mask)[name = tensor<string, []>("aw_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1327_cast_fp16 = softmax(axis = var_779, x = aw_49_cast_fp16)[name = tensor<string, []>("op_1327_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1328_cast_fp16 = softmax(axis = var_779, x = aw_51_cast_fp16)[name = tensor<string, []>("op_1328_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1329_cast_fp16 = softmax(axis = var_779, x = aw_53_cast_fp16)[name = tensor<string, []>("op_1329_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1330_cast_fp16 = softmax(axis = var_779, x = aw_55_cast_fp16)[name = tensor<string, []>("op_1330_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1331_cast_fp16 = softmax(axis = var_779, x = aw_57_cast_fp16)[name = tensor<string, []>("op_1331_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1332_cast_fp16 = softmax(axis = var_779, x = aw_59_cast_fp16)[name = tensor<string, []>("op_1332_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1333_cast_fp16 = softmax(axis = var_779, x = aw_61_cast_fp16)[name = tensor<string, []>("op_1333_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1334_cast_fp16 = softmax(axis = var_779, x = aw_63_cast_fp16)[name = tensor<string, []>("op_1334_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1335_cast_fp16 = softmax(axis = var_779, x = aw_65_cast_fp16)[name = tensor<string, []>("op_1335_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1336_cast_fp16 = softmax(axis = var_779, x = aw_67_cast_fp16)[name = tensor<string, []>("op_1336_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1337_cast_fp16 = softmax(axis = var_779, x = aw_69_cast_fp16)[name = tensor<string, []>("op_1337_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1338_cast_fp16 = softmax(axis = var_779, x = aw_71_cast_fp16)[name = tensor<string, []>("op_1338_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1339_cast_fp16 = softmax(axis = var_779, x = aw_73_cast_fp16)[name = tensor<string, []>("op_1339_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1340_cast_fp16 = softmax(axis = var_779, x = aw_75_cast_fp16)[name = tensor<string, []>("op_1340_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1341_cast_fp16 = softmax(axis = var_779, x = aw_77_cast_fp16)[name = tensor<string, []>("op_1341_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1342_cast_fp16 = softmax(axis = var_779, x = aw_79_cast_fp16)[name = tensor<string, []>("op_1342_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1343_cast_fp16 = softmax(axis = var_779, x = aw_81_cast_fp16)[name = tensor<string, []>("op_1343_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1344_cast_fp16 = softmax(axis = var_779, x = aw_83_cast_fp16)[name = tensor<string, []>("op_1344_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1345_cast_fp16 = softmax(axis = var_779, x = aw_85_cast_fp16)[name = tensor<string, []>("op_1345_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1346_cast_fp16 = softmax(axis = var_779, x = aw_87_cast_fp16)[name = tensor<string, []>("op_1346_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1347_cast_fp16 = softmax(axis = var_779, x = aw_89_cast_fp16)[name = tensor<string, []>("op_1347_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1348_cast_fp16 = softmax(axis = var_779, x = aw_91_cast_fp16)[name = tensor<string, []>("op_1348_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1349_cast_fp16 = softmax(axis = var_779, x = aw_93_cast_fp16)[name = tensor<string, []>("op_1349_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1350_cast_fp16 = softmax(axis = var_779, x = aw_cast_fp16)[name = tensor<string, []>("op_1350_cast_fp16")];
+            tensor<string, []> var_1352_equation_0 = const()[name = tensor<string, []>("op_1352_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1352_cast_fp16 = einsum(equation = var_1352_equation_0, values = (var_1112_cast_fp16, var_1327_cast_fp16))[name = tensor<string, []>("op_1352_cast_fp16")];
+            tensor<string, []> var_1354_equation_0 = const()[name = tensor<string, []>("op_1354_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1354_cast_fp16 = einsum(equation = var_1354_equation_0, values = (var_1112_cast_fp16, var_1328_cast_fp16))[name = tensor<string, []>("op_1354_cast_fp16")];
+            tensor<string, []> var_1356_equation_0 = const()[name = tensor<string, []>("op_1356_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1356_cast_fp16 = einsum(equation = var_1356_equation_0, values = (var_1112_cast_fp16, var_1329_cast_fp16))[name = tensor<string, []>("op_1356_cast_fp16")];
+            tensor<string, []> var_1358_equation_0 = const()[name = tensor<string, []>("op_1358_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1358_cast_fp16 = einsum(equation = var_1358_equation_0, values = (var_1124_cast_fp16, var_1330_cast_fp16))[name = tensor<string, []>("op_1358_cast_fp16")];
+            tensor<string, []> var_1360_equation_0 = const()[name = tensor<string, []>("op_1360_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1360_cast_fp16 = einsum(equation = var_1360_equation_0, values = (var_1124_cast_fp16, var_1331_cast_fp16))[name = tensor<string, []>("op_1360_cast_fp16")];
+            tensor<string, []> var_1362_equation_0 = const()[name = tensor<string, []>("op_1362_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1362_cast_fp16 = einsum(equation = var_1362_equation_0, values = (var_1124_cast_fp16, var_1332_cast_fp16))[name = tensor<string, []>("op_1362_cast_fp16")];
+            tensor<string, []> var_1364_equation_0 = const()[name = tensor<string, []>("op_1364_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1364_cast_fp16 = einsum(equation = var_1364_equation_0, values = (var_1136_cast_fp16, var_1333_cast_fp16))[name = tensor<string, []>("op_1364_cast_fp16")];
+            tensor<string, []> var_1366_equation_0 = const()[name = tensor<string, []>("op_1366_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1366_cast_fp16 = einsum(equation = var_1366_equation_0, values = (var_1136_cast_fp16, var_1334_cast_fp16))[name = tensor<string, []>("op_1366_cast_fp16")];
+            tensor<string, []> var_1368_equation_0 = const()[name = tensor<string, []>("op_1368_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1368_cast_fp16 = einsum(equation = var_1368_equation_0, values = (var_1136_cast_fp16, var_1335_cast_fp16))[name = tensor<string, []>("op_1368_cast_fp16")];
+            tensor<string, []> var_1370_equation_0 = const()[name = tensor<string, []>("op_1370_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1370_cast_fp16 = einsum(equation = var_1370_equation_0, values = (var_1148_cast_fp16, var_1336_cast_fp16))[name = tensor<string, []>("op_1370_cast_fp16")];
+            tensor<string, []> var_1372_equation_0 = const()[name = tensor<string, []>("op_1372_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1372_cast_fp16 = einsum(equation = var_1372_equation_0, values = (var_1148_cast_fp16, var_1337_cast_fp16))[name = tensor<string, []>("op_1372_cast_fp16")];
+            tensor<string, []> var_1374_equation_0 = const()[name = tensor<string, []>("op_1374_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1374_cast_fp16 = einsum(equation = var_1374_equation_0, values = (var_1148_cast_fp16, var_1338_cast_fp16))[name = tensor<string, []>("op_1374_cast_fp16")];
+            tensor<string, []> var_1376_equation_0 = const()[name = tensor<string, []>("op_1376_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1376_cast_fp16 = einsum(equation = var_1376_equation_0, values = (var_1160_cast_fp16, var_1339_cast_fp16))[name = tensor<string, []>("op_1376_cast_fp16")];
+            tensor<string, []> var_1378_equation_0 = const()[name = tensor<string, []>("op_1378_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1378_cast_fp16 = einsum(equation = var_1378_equation_0, values = (var_1160_cast_fp16, var_1340_cast_fp16))[name = tensor<string, []>("op_1378_cast_fp16")];
+            tensor<string, []> var_1380_equation_0 = const()[name = tensor<string, []>("op_1380_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1380_cast_fp16 = einsum(equation = var_1380_equation_0, values = (var_1160_cast_fp16, var_1341_cast_fp16))[name = tensor<string, []>("op_1380_cast_fp16")];
+            tensor<string, []> var_1382_equation_0 = const()[name = tensor<string, []>("op_1382_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1382_cast_fp16 = einsum(equation = var_1382_equation_0, values = (var_1172_cast_fp16, var_1342_cast_fp16))[name = tensor<string, []>("op_1382_cast_fp16")];
+            tensor<string, []> var_1384_equation_0 = const()[name = tensor<string, []>("op_1384_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1384_cast_fp16 = einsum(equation = var_1384_equation_0, values = (var_1172_cast_fp16, var_1343_cast_fp16))[name = tensor<string, []>("op_1384_cast_fp16")];
+            tensor<string, []> var_1386_equation_0 = const()[name = tensor<string, []>("op_1386_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1386_cast_fp16 = einsum(equation = var_1386_equation_0, values = (var_1172_cast_fp16, var_1344_cast_fp16))[name = tensor<string, []>("op_1386_cast_fp16")];
+            tensor<string, []> var_1388_equation_0 = const()[name = tensor<string, []>("op_1388_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1388_cast_fp16 = einsum(equation = var_1388_equation_0, values = (var_1184_cast_fp16, var_1345_cast_fp16))[name = tensor<string, []>("op_1388_cast_fp16")];
+            tensor<string, []> var_1390_equation_0 = const()[name = tensor<string, []>("op_1390_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1390_cast_fp16 = einsum(equation = var_1390_equation_0, values = (var_1184_cast_fp16, var_1346_cast_fp16))[name = tensor<string, []>("op_1390_cast_fp16")];
+            tensor<string, []> var_1392_equation_0 = const()[name = tensor<string, []>("op_1392_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1392_cast_fp16 = einsum(equation = var_1392_equation_0, values = (var_1184_cast_fp16, var_1347_cast_fp16))[name = tensor<string, []>("op_1392_cast_fp16")];
+            tensor<string, []> var_1394_equation_0 = const()[name = tensor<string, []>("op_1394_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1394_cast_fp16 = einsum(equation = var_1394_equation_0, values = (var_1196_cast_fp16, var_1348_cast_fp16))[name = tensor<string, []>("op_1394_cast_fp16")];
+            tensor<string, []> var_1396_equation_0 = const()[name = tensor<string, []>("op_1396_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1396_cast_fp16 = einsum(equation = var_1396_equation_0, values = (var_1196_cast_fp16, var_1349_cast_fp16))[name = tensor<string, []>("op_1396_cast_fp16")];
+            tensor<string, []> var_1398_equation_0 = const()[name = tensor<string, []>("op_1398_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1398_cast_fp16 = einsum(equation = var_1398_equation_0, values = (var_1196_cast_fp16, var_1350_cast_fp16))[name = tensor<string, []>("op_1398_cast_fp16")];
+            tensor<bool, []> x_27_interleave_0 = const()[name = tensor<string, []>("x_27_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_27_cast_fp16 = concat(axis = var_779, interleave = x_27_interleave_0, values = (var_1352_cast_fp16, var_1354_cast_fp16, var_1356_cast_fp16, var_1358_cast_fp16, var_1360_cast_fp16, var_1362_cast_fp16, var_1364_cast_fp16, var_1366_cast_fp16, var_1368_cast_fp16, var_1370_cast_fp16, var_1372_cast_fp16, var_1374_cast_fp16, var_1376_cast_fp16, var_1378_cast_fp16, var_1380_cast_fp16, var_1382_cast_fp16, var_1384_cast_fp16, var_1386_cast_fp16, var_1388_cast_fp16, var_1390_cast_fp16, var_1392_cast_fp16, var_1394_cast_fp16, var_1396_cast_fp16, var_1398_cast_fp16))[name = tensor<string, []>("x_27_cast_fp16")];
+            tensor<int32, [4]> var_1403 = const()[name = tensor<string, []>("op_1403"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_13_cast_fp16 = reshape(shape = var_1403, x = x_27_cast_fp16)[name = tensor<string, []>("input_13_cast_fp16")];
+            tensor<int32, [2]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1408 = const()[name = tensor<string, []>("op_1408"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_pad_type_0 = const()[name = tensor<string, []>("attention_output_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_pad_0 = const()[name = tensor<string, []>("attention_output_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(232803776)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_cast_fp16 = conv(dilations = var_1408, groups = var_779, pad = attention_output_pad_0, pad_type = attention_output_pad_type_0, strides = var_1406, weight = blocks_1_attn_proj_weight_to_fp16, x = input_13_cast_fp16)[name = tensor<string, []>("attention_output_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_29_cast_fp16 = add(x = attention_output_cast_fp16, y = x_17_cast_fp16)[name = tensor<string, []>("x_29_cast_fp16")];
+            tensor<bool, []> x_eps_interleave_0 = const()[name = tensor<string, []>("x_eps_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_to_fp16 = const()[name = tensor<string, []>("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678208)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_cast_fp16 = concat(axis = var_779, interleave = x_eps_interleave_0, values = (x_29_cast_fp16, eps_chan_to_fp16))[name = tensor<string, []>("x_eps_cast_fp16")];
+            tensor<int32, [1]> norm_x_axes_0 = const()[name = tensor<string, []>("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_782, x = x_eps_cast_fp16)[name = tensor<string, []>("norm_x_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_19_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_cast_fp16)[name = tensor<string, []>("x_normed_19_cast_fp16")];
+            tensor<fp16, []> var_1434_to_fp16 = const()[name = tensor<string, []>("op_1434_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_1434_to_fp16)[name = tensor<string, []>("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678400)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_15_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = tensor<string, []>("input_15_cast_fp16")];
+            tensor<int32, [2]> var_1445 = const()[name = tensor<string, []>("op_1445"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1447 = const()[name = tensor<string, []>("op_1447"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_17_pad_type_0 = const()[name = tensor<string, []>("input_17_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_17_pad_0 = const()[name = tensor<string, []>("input_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251684608)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_17_cast_fp16 = conv(dilations = var_1447, groups = var_779, pad = input_17_pad_0, pad_type = input_17_pad_type_0, strides = var_1445, weight = blocks_1_mlp_fc_1_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("input_17_cast_fp16")];
+            tensor<int32, [2]> var_1451 = const()[name = tensor<string, []>("op_1451"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1453 = const()[name = tensor<string, []>("op_1453"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_pad_0 = const()[name = tensor<string, []>("x_fc_2_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(302016320)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_cast_fp16 = conv(dilations = var_1453, groups = var_779, pad = x_fc_2_pad_0, pad_type = x_fc_2_pad_type_0, strides = var_1451, weight = blocks_1_mlp_fc_2_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_1456_cast_fp16 = silu(x = input_17_cast_fp16)[name = tensor<string, []>("op_1456_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_cast_fp16 = mul(x = var_1456_cast_fp16, y = x_fc_2_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
+            tensor<int32, [2]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1463_pad_type_0 = const()[name = tensor<string, []>("op_1463_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1463_pad_0 = const()[name = tensor<string, []>("op_1463_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_1_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(352348032)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_1463_cast_fp16 = conv(dilations = var_1461, groups = var_779, pad = var_1463_pad_0, pad_type = var_1463_pad_type_0, strides = var_1459, weight = blocks_1_mlp_proj_weight_to_fp16, x = input_cast_fp16)[name = tensor<string, []>("op_1463_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> new_x = add(x = var_1463_cast_fp16, y = x_29_cast_fp16)[name = tensor<string, []>("op_1464_cast_fp16")];
+        } -> (new_x, new_k_cache_0, new_v_cache_0, new_k_cache_1, new_v_cache_1);
+}
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk10.mlmodelc/weights/weight.bin b/Llama-3.2-3B-Instruct_chunk10.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..69e886b3711c8507e63bb0a32cb390c36f5a9777
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk10.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:943564e130d797200225f5eaeef339030c6c3c01691819963fe7ef78303c8545
+size 402679744
diff --git a/Llama-3.2-3B-Instruct_chunk11.mlmodelc/analytics/coremldata.bin b/Llama-3.2-3B-Instruct_chunk11.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a63af39cde8e590e41fffd270ab8aede737490d
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk11.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf21e446e7587de3fd840eae95f3e79729298df568725552f7ef5fd8f954e58c
+size 243
diff --git a/Llama-3.2-3B-Instruct_chunk11.mlmodelc/coremldata.bin b/Llama-3.2-3B-Instruct_chunk11.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ef844658693d8a7fc2951abf2761f8f5f9bc62c3
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk11.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8129d684aa1ea8b76708a186fe44f7ffc4aa08b4854907105fe41c0825e71875
+size 653
diff --git a/Llama-3.2-3B-Instruct_chunk11.mlmodelc/metadata.json b/Llama-3.2-3B-Instruct_chunk11.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..45870338d8d760107fb047595a2394be13aca491
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk11.mlmodelc/metadata.json
@@ -0,0 +1,178 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "new_x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 7,
+    "mlProgramOperationTypeHistogram" : {
+      "Concat" : 14,
+      "Ios16.mul" : 70,
+      "SliceByIndex" : 88,
+      "Transpose" : 2,
+      "Ios16.einsum" : 96,
+      "Ios16.conv" : 14,
+      "Ios16.add" : 56,
+      "Ios16.realDiv" : 4,
+      "Ios16.softmax" : 48,
+      "Ios16.reduceL2Norm" : 4,
+      "Ios16.reshape" : 14,
+      "Ios16.silu" : 2
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "13.0",
+      "tvOS" : "16.0",
+      "visionOS" : "1.0",
+      "watchOS" : "9.0",
+      "iOS" : "16.0",
+      "macCatalyst" : "16.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.source" : "torch==2.1.0",
+      "com.github.apple.coremltools.version" : "8.0b1"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "cos",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "sin",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 512 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 512, 1, 64]",
+        "name" : "mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "Llama_3_2_3B_Instruct_2024_11_09_16_14_37_chunk11",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk11.mlmodelc/model.mil b/Llama-3.2-3B-Instruct_chunk11.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..78594b4291dc45ae43652f9a31200581b19ad3c6
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk11.mlmodelc/model.mil
@@ -0,0 +1,956 @@
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3304.5.2"}, {"coremlc-version", "3304.6.2"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
+{
+    func main<ios16>(tensor<fp16, [128, 64]> cos, tensor<fp16, [1, 448, 1, 1024]> k_cache_0, tensor<fp16, [1, 448, 1, 1024]> k_cache_1, tensor<fp16, [1, 512, 1, 64]> mask, tensor<fp16, [128, 64]> sin, tensor<fp16, [1, 1024, 1, 448]> v_cache_0, tensor<fp16, [1, 1024, 1, 448]> v_cache_1, tensor<fp16, [1, 3072, 8, 8]> x) [CoreML_InputDefaultValues = dict<tensor<string, []>, tensor<fp32, []>>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}})] {
+            tensor<int32, []> var_13 = const()[name = tensor<string, []>("op_13"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_19 = const()[name = tensor<string, []>("op_19"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_55 = const()[name = tensor<string, []>("op_55"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_1_interleave_0 = const()[name = tensor<string, []>("x_eps_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_1_to_fp16 = const()[name = tensor<string, []>("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_1_cast_fp16 = concat(axis = var_52, interleave = x_eps_1_interleave_0, values = (x, eps_chan_1_to_fp16))[name = tensor<string, []>("x_eps_1_cast_fp16")];
+            tensor<int32, [1]> norm_x_1_axes_0 = const()[name = tensor<string, []>("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_55, x = x_eps_1_cast_fp16)[name = tensor<string, []>("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = tensor<string, []>("x_normed_1_cast_fp16")];
+            tensor<fp16, []> var_79_to_fp16 = const()[name = tensor<string, []>("op_79_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_79_to_fp16)[name = tensor<string, []>("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(256)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = tensor<string, []>("x_5_cast_fp16")];
+            tensor<int32, [4]> var_100 = const()[name = tensor<string, []>("op_100"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_1_cast_fp16 = reshape(shape = var_100, x = x_5_cast_fp16)[name = tensor<string, []>("input_1_cast_fp16")];
+            tensor<int32, [2]> var_103 = const()[name = tensor<string, []>("op_103"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_105 = const()[name = tensor<string, []>("op_105"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_1_pad_type_0 = const()[name = tensor<string, []>("q_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_1_pad_0 = const()[name = tensor<string, []>("q_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6464)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_1_cast_fp16 = conv(dilations = var_105, groups = var_52, pad = q_1_pad_0, pad_type = q_1_pad_type_0, strides = var_103, weight = blocks_0_attn_q_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("q_1_cast_fp16")];
+            tensor<int32, [2]> var_109 = const()[name = tensor<string, []>("op_109"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_111 = const()[name = tensor<string, []>("op_111"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_1_pad_type_0 = const()[name = tensor<string, []>("k_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_1_pad_0 = const()[name = tensor<string, []>("k_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18880896)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_1_cast_fp16 = conv(dilations = var_111, groups = var_52, pad = k_1_pad_0, pad_type = k_1_pad_type_0, strides = var_109, weight = blocks_0_attn_k_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("k_1_cast_fp16")];
+            tensor<int32, [2]> var_115 = const()[name = tensor<string, []>("op_115"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_117 = const()[name = tensor<string, []>("op_117"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_1_pad_type_0 = const()[name = tensor<string, []>("v_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_1_pad_0 = const()[name = tensor<string, []>("v_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25172416)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_1_cast_fp16 = conv(dilations = var_117, groups = var_52, pad = v_1_pad_0, pad_type = v_1_pad_type_0, strides = var_115, weight = blocks_0_attn_v_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("v_1_cast_fp16")];
+            tensor<int32, [4]> var_120 = const()[name = tensor<string, []>("op_120"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_3_cast_fp16 = reshape(shape = var_120, x = q_1_cast_fp16)[name = tensor<string, []>("q_3_cast_fp16")];
+            tensor<int32, [4]> var_122 = const()[name = tensor<string, []>("op_122"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_3_cast_fp16 = reshape(shape = var_122, x = k_1_cast_fp16)[name = tensor<string, []>("k_3_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = tensor<string, []>("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = tensor<string, []>("op_136_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = tensor<string, []>("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = tensor<string, []>("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = tensor<string, []>("op_142_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = tensor<string, []>("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_142_cast_fp16")];
+            tensor<fp16, []> const_10_promoted_to_fp16 = const()[name = tensor<string, []>("const_10_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_10_promoted_to_fp16)[name = tensor<string, []>("op_144_cast_fp16")];
+            tensor<bool, []> rotated_1_interleave_0 = const()[name = tensor<string, []>("rotated_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_1_cast_fp16 = concat(axis = var_17, interleave = rotated_1_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = tensor<string, []>("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_147_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = tensor<string, []>("op_147_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_148_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = tensor<string, []>("op_148_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_1_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = tensor<string, []>("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_161_begin_0 = const()[name = tensor<string, []>("op_161_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_161_end_0 = const()[name = tensor<string, []>("op_161_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_161_end_mask_0 = const()[name = tensor<string, []>("op_161_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_161_cast_fp16 = slice_by_index(begin = var_161_begin_0, end = var_161_end_0, end_mask = var_161_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_161_cast_fp16")];
+            tensor<int32, [4]> var_167_begin_0 = const()[name = tensor<string, []>("op_167_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_167_end_0 = const()[name = tensor<string, []>("op_167_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_167_end_mask_0 = const()[name = tensor<string, []>("op_167_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_167_cast_fp16 = slice_by_index(begin = var_167_begin_0, end = var_167_end_0, end_mask = var_167_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_167_cast_fp16")];
+            tensor<fp16, []> const_12_promoted_to_fp16 = const()[name = tensor<string, []>("const_12_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_169_cast_fp16 = mul(x = var_167_cast_fp16, y = const_12_promoted_to_fp16)[name = tensor<string, []>("op_169_cast_fp16")];
+            tensor<bool, []> rotated_3_interleave_0 = const()[name = tensor<string, []>("rotated_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_3_cast_fp16 = concat(axis = var_17, interleave = rotated_3_interleave_0, values = (var_169_cast_fp16, var_161_cast_fp16))[name = tensor<string, []>("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_172_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = tensor<string, []>("op_172_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_173_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = tensor<string, []>("op_173_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_3_cast_fp16 = add(x = var_172_cast_fp16, y = var_173_cast_fp16)[name = tensor<string, []>("roped_3_cast_fp16")];
+            tensor<int32, [4]> var_176 = const()[name = tensor<string, []>("op_176"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_7_cast_fp16 = reshape(shape = var_176, x = roped_3_cast_fp16)[name = tensor<string, []>("k_7_cast_fp16")];
+            tensor<int32, [4]> var_178 = const()[name = tensor<string, []>("op_178"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_0 = reshape(shape = var_178, x = v_1_cast_fp16)[name = tensor<string, []>("new_v_cache_0_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_11_interleave_0 = const()[name = tensor<string, []>("k_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_0 = transpose(perm = k_9_perm_0, x = k_7_cast_fp16)[name = tensor<string, []>("transpose_1")];
+            tensor<fp16, [1, 512, 1, 1024]> k_11_cast_fp16 = concat(axis = var_19, interleave = k_11_interleave_0, values = (k_cache_0, new_k_cache_0))[name = tensor<string, []>("k_11_cast_fp16")];
+            tensor<bool, []> v_7_interleave_0 = const()[name = tensor<string, []>("v_7_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_7_cast_fp16 = concat(axis = var_13, interleave = v_7_interleave_0, values = (v_cache_0, new_v_cache_0))[name = tensor<string, []>("v_7_cast_fp16")];
+            tensor<int32, [4]> var_186 = const()[name = tensor<string, []>("op_186"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_7_cast_fp16 = reshape(shape = var_186, x = roped_1_cast_fp16)[name = tensor<string, []>("q_7_cast_fp16")];
+            tensor<int32, [4]> var_191_begin_0 = const()[name = tensor<string, []>("op_191_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_191_end_0 = const()[name = tensor<string, []>("op_191_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_191_end_mask_0 = const()[name = tensor<string, []>("op_191_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_191_cast_fp16 = slice_by_index(begin = var_191_begin_0, end = var_191_end_0, end_mask = var_191_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_191_cast_fp16")];
+            tensor<int32, [4]> var_195_begin_0 = const()[name = tensor<string, []>("op_195_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_195_end_0 = const()[name = tensor<string, []>("op_195_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_195_end_mask_0 = const()[name = tensor<string, []>("op_195_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_195_cast_fp16 = slice_by_index(begin = var_195_begin_0, end = var_195_end_0, end_mask = var_195_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_195_cast_fp16")];
+            tensor<int32, [4]> var_199_begin_0 = const()[name = tensor<string, []>("op_199_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_199_end_0 = const()[name = tensor<string, []>("op_199_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_199_end_mask_0 = const()[name = tensor<string, []>("op_199_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_199_cast_fp16 = slice_by_index(begin = var_199_begin_0, end = var_199_end_0, end_mask = var_199_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_199_cast_fp16")];
+            tensor<int32, [4]> var_203_begin_0 = const()[name = tensor<string, []>("op_203_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_203_end_0 = const()[name = tensor<string, []>("op_203_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_203_end_mask_0 = const()[name = tensor<string, []>("op_203_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_203_cast_fp16 = slice_by_index(begin = var_203_begin_0, end = var_203_end_0, end_mask = var_203_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_203_cast_fp16")];
+            tensor<int32, [4]> var_207_begin_0 = const()[name = tensor<string, []>("op_207_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_207_end_0 = const()[name = tensor<string, []>("op_207_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_207_end_mask_0 = const()[name = tensor<string, []>("op_207_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_207_cast_fp16 = slice_by_index(begin = var_207_begin_0, end = var_207_end_0, end_mask = var_207_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_207_cast_fp16")];
+            tensor<int32, [4]> var_211_begin_0 = const()[name = tensor<string, []>("op_211_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_211_end_0 = const()[name = tensor<string, []>("op_211_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_211_end_mask_0 = const()[name = tensor<string, []>("op_211_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_211_cast_fp16 = slice_by_index(begin = var_211_begin_0, end = var_211_end_0, end_mask = var_211_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_211_cast_fp16")];
+            tensor<int32, [4]> var_215_begin_0 = const()[name = tensor<string, []>("op_215_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_215_end_0 = const()[name = tensor<string, []>("op_215_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_215_end_mask_0 = const()[name = tensor<string, []>("op_215_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_215_cast_fp16 = slice_by_index(begin = var_215_begin_0, end = var_215_end_0, end_mask = var_215_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_215_cast_fp16")];
+            tensor<int32, [4]> var_219_begin_0 = const()[name = tensor<string, []>("op_219_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_219_end_0 = const()[name = tensor<string, []>("op_219_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_219_end_mask_0 = const()[name = tensor<string, []>("op_219_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_219_cast_fp16 = slice_by_index(begin = var_219_begin_0, end = var_219_end_0, end_mask = var_219_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_219_cast_fp16")];
+            tensor<int32, [4]> var_223_begin_0 = const()[name = tensor<string, []>("op_223_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_223_end_0 = const()[name = tensor<string, []>("op_223_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_223_end_mask_0 = const()[name = tensor<string, []>("op_223_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_223_cast_fp16 = slice_by_index(begin = var_223_begin_0, end = var_223_end_0, end_mask = var_223_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_223_cast_fp16")];
+            tensor<int32, [4]> var_227_begin_0 = const()[name = tensor<string, []>("op_227_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_227_end_0 = const()[name = tensor<string, []>("op_227_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_227_end_mask_0 = const()[name = tensor<string, []>("op_227_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_227_cast_fp16 = slice_by_index(begin = var_227_begin_0, end = var_227_end_0, end_mask = var_227_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_227_cast_fp16")];
+            tensor<int32, [4]> var_231_begin_0 = const()[name = tensor<string, []>("op_231_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_231_end_0 = const()[name = tensor<string, []>("op_231_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_231_end_mask_0 = const()[name = tensor<string, []>("op_231_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_231_cast_fp16 = slice_by_index(begin = var_231_begin_0, end = var_231_end_0, end_mask = var_231_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_231_cast_fp16")];
+            tensor<int32, [4]> var_235_begin_0 = const()[name = tensor<string, []>("op_235_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_235_end_0 = const()[name = tensor<string, []>("op_235_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_235_end_mask_0 = const()[name = tensor<string, []>("op_235_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_235_cast_fp16 = slice_by_index(begin = var_235_begin_0, end = var_235_end_0, end_mask = var_235_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_235_cast_fp16")];
+            tensor<int32, [4]> var_239_begin_0 = const()[name = tensor<string, []>("op_239_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_239_end_0 = const()[name = tensor<string, []>("op_239_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_239_end_mask_0 = const()[name = tensor<string, []>("op_239_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_239_cast_fp16 = slice_by_index(begin = var_239_begin_0, end = var_239_end_0, end_mask = var_239_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_239_cast_fp16")];
+            tensor<int32, [4]> var_243_begin_0 = const()[name = tensor<string, []>("op_243_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_243_end_0 = const()[name = tensor<string, []>("op_243_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_243_end_mask_0 = const()[name = tensor<string, []>("op_243_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_243_cast_fp16 = slice_by_index(begin = var_243_begin_0, end = var_243_end_0, end_mask = var_243_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_243_cast_fp16")];
+            tensor<int32, [4]> var_247_begin_0 = const()[name = tensor<string, []>("op_247_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_247_end_0 = const()[name = tensor<string, []>("op_247_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_247_end_mask_0 = const()[name = tensor<string, []>("op_247_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_247_cast_fp16 = slice_by_index(begin = var_247_begin_0, end = var_247_end_0, end_mask = var_247_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_247_cast_fp16")];
+            tensor<int32, [4]> var_251_begin_0 = const()[name = tensor<string, []>("op_251_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_251_end_0 = const()[name = tensor<string, []>("op_251_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_251_end_mask_0 = const()[name = tensor<string, []>("op_251_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_251_cast_fp16 = slice_by_index(begin = var_251_begin_0, end = var_251_end_0, end_mask = var_251_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_251_cast_fp16")];
+            tensor<int32, [4]> var_255_begin_0 = const()[name = tensor<string, []>("op_255_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_255_end_0 = const()[name = tensor<string, []>("op_255_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_255_end_mask_0 = const()[name = tensor<string, []>("op_255_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_255_cast_fp16 = slice_by_index(begin = var_255_begin_0, end = var_255_end_0, end_mask = var_255_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_255_cast_fp16")];
+            tensor<int32, [4]> var_259_begin_0 = const()[name = tensor<string, []>("op_259_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_259_end_0 = const()[name = tensor<string, []>("op_259_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_259_end_mask_0 = const()[name = tensor<string, []>("op_259_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_259_cast_fp16 = slice_by_index(begin = var_259_begin_0, end = var_259_end_0, end_mask = var_259_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_259_cast_fp16")];
+            tensor<int32, [4]> var_263_begin_0 = const()[name = tensor<string, []>("op_263_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_263_end_0 = const()[name = tensor<string, []>("op_263_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_263_end_mask_0 = const()[name = tensor<string, []>("op_263_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_263_cast_fp16 = slice_by_index(begin = var_263_begin_0, end = var_263_end_0, end_mask = var_263_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_263_cast_fp16")];
+            tensor<int32, [4]> var_267_begin_0 = const()[name = tensor<string, []>("op_267_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_267_end_0 = const()[name = tensor<string, []>("op_267_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_267_end_mask_0 = const()[name = tensor<string, []>("op_267_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_267_cast_fp16 = slice_by_index(begin = var_267_begin_0, end = var_267_end_0, end_mask = var_267_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_267_cast_fp16")];
+            tensor<int32, [4]> var_271_begin_0 = const()[name = tensor<string, []>("op_271_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_271_end_0 = const()[name = tensor<string, []>("op_271_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_271_end_mask_0 = const()[name = tensor<string, []>("op_271_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_271_cast_fp16 = slice_by_index(begin = var_271_begin_0, end = var_271_end_0, end_mask = var_271_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_271_cast_fp16")];
+            tensor<int32, [4]> var_275_begin_0 = const()[name = tensor<string, []>("op_275_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_275_end_0 = const()[name = tensor<string, []>("op_275_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_275_end_mask_0 = const()[name = tensor<string, []>("op_275_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_275_cast_fp16 = slice_by_index(begin = var_275_begin_0, end = var_275_end_0, end_mask = var_275_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_275_cast_fp16")];
+            tensor<int32, [4]> var_279_begin_0 = const()[name = tensor<string, []>("op_279_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_279_end_0 = const()[name = tensor<string, []>("op_279_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_279_end_mask_0 = const()[name = tensor<string, []>("op_279_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_279_cast_fp16 = slice_by_index(begin = var_279_begin_0, end = var_279_end_0, end_mask = var_279_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_279_cast_fp16")];
+            tensor<int32, [4]> var_283_begin_0 = const()[name = tensor<string, []>("op_283_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_283_end_0 = const()[name = tensor<string, []>("op_283_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_283_end_mask_0 = const()[name = tensor<string, []>("op_283_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_283_cast_fp16 = slice_by_index(begin = var_283_begin_0, end = var_283_end_0, end_mask = var_283_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_283_cast_fp16")];
+            tensor<int32, [4]> var_289_begin_0 = const()[name = tensor<string, []>("op_289_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_289_end_0 = const()[name = tensor<string, []>("op_289_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_289_end_mask_0 = const()[name = tensor<string, []>("op_289_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_289_cast_fp16 = slice_by_index(begin = var_289_begin_0, end = var_289_end_0, end_mask = var_289_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_289_cast_fp16")];
+            tensor<int32, [4]> var_301_begin_0 = const()[name = tensor<string, []>("op_301_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_301_end_0 = const()[name = tensor<string, []>("op_301_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_301_end_mask_0 = const()[name = tensor<string, []>("op_301_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_301_cast_fp16 = slice_by_index(begin = var_301_begin_0, end = var_301_end_0, end_mask = var_301_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_301_cast_fp16")];
+            tensor<int32, [4]> var_313_begin_0 = const()[name = tensor<string, []>("op_313_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_313_end_0 = const()[name = tensor<string, []>("op_313_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_313_end_mask_0 = const()[name = tensor<string, []>("op_313_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_313_cast_fp16 = slice_by_index(begin = var_313_begin_0, end = var_313_end_0, end_mask = var_313_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_313_cast_fp16")];
+            tensor<int32, [4]> var_325_begin_0 = const()[name = tensor<string, []>("op_325_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_325_end_0 = const()[name = tensor<string, []>("op_325_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_325_end_mask_0 = const()[name = tensor<string, []>("op_325_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_325_cast_fp16 = slice_by_index(begin = var_325_begin_0, end = var_325_end_0, end_mask = var_325_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_325_cast_fp16")];
+            tensor<int32, [4]> var_337_begin_0 = const()[name = tensor<string, []>("op_337_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_337_end_0 = const()[name = tensor<string, []>("op_337_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_337_end_mask_0 = const()[name = tensor<string, []>("op_337_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_337_cast_fp16 = slice_by_index(begin = var_337_begin_0, end = var_337_end_0, end_mask = var_337_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_337_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = tensor<string, []>("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = tensor<string, []>("op_349_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = tensor<string, []>("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_349_cast_fp16")];
+            tensor<int32, [4]> var_361_begin_0 = const()[name = tensor<string, []>("op_361_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_361_end_0 = const()[name = tensor<string, []>("op_361_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_361_end_mask_0 = const()[name = tensor<string, []>("op_361_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_361_cast_fp16 = slice_by_index(begin = var_361_begin_0, end = var_361_end_0, end_mask = var_361_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_361_cast_fp16")];
+            tensor<int32, [4]> var_373_begin_0 = const()[name = tensor<string, []>("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_373_end_0 = const()[name = tensor<string, []>("op_373_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_373_end_mask_0 = const()[name = tensor<string, []>("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_373_cast_fp16")];
+            tensor<int32, [4]> var_383_begin_0 = const()[name = tensor<string, []>("op_383_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_383_end_0 = const()[name = tensor<string, []>("op_383_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_383_end_mask_0 = const()[name = tensor<string, []>("op_383_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_383_cast_fp16 = slice_by_index(begin = var_383_begin_0, end = var_383_end_0, end_mask = var_383_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_383_cast_fp16")];
+            tensor<int32, [4]> var_395_begin_0 = const()[name = tensor<string, []>("op_395_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_395_end_0 = const()[name = tensor<string, []>("op_395_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_395_end_mask_0 = const()[name = tensor<string, []>("op_395_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_395_cast_fp16 = slice_by_index(begin = var_395_begin_0, end = var_395_end_0, end_mask = var_395_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_395_cast_fp16")];
+            tensor<int32, [4]> var_407_begin_0 = const()[name = tensor<string, []>("op_407_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_407_end_0 = const()[name = tensor<string, []>("op_407_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_407_end_mask_0 = const()[name = tensor<string, []>("op_407_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_407_cast_fp16 = slice_by_index(begin = var_407_begin_0, end = var_407_end_0, end_mask = var_407_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_407_cast_fp16")];
+            tensor<int32, [4]> var_419_begin_0 = const()[name = tensor<string, []>("op_419_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_419_end_0 = const()[name = tensor<string, []>("op_419_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_419_end_mask_0 = const()[name = tensor<string, []>("op_419_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_419_cast_fp16 = slice_by_index(begin = var_419_begin_0, end = var_419_end_0, end_mask = var_419_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_419_cast_fp16")];
+            tensor<int32, [4]> var_431_begin_0 = const()[name = tensor<string, []>("op_431_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_431_end_0 = const()[name = tensor<string, []>("op_431_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_431_end_mask_0 = const()[name = tensor<string, []>("op_431_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_431_cast_fp16 = slice_by_index(begin = var_431_begin_0, end = var_431_end_0, end_mask = var_431_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_431_cast_fp16")];
+            tensor<int32, [4]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_443_cast_fp16 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_443_cast_fp16")];
+            tensor<int32, [4]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_455_cast_fp16 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_455_cast_fp16")];
+            tensor<int32, [4]> var_467_begin_0 = const()[name = tensor<string, []>("op_467_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_467_end_0 = const()[name = tensor<string, []>("op_467_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_467_end_mask_0 = const()[name = tensor<string, []>("op_467_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_467_cast_fp16 = slice_by_index(begin = var_467_begin_0, end = var_467_end_0, end_mask = var_467_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_467_cast_fp16")];
+            tensor<string, []> var_479_equation_0 = const()[name = tensor<string, []>("op_479_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_479_cast_fp16 = einsum(equation = var_479_equation_0, values = (var_289_cast_fp16, var_191_cast_fp16))[name = tensor<string, []>("op_479_cast_fp16")];
+            tensor<fp16, []> var_480_to_fp16 = const()[name = tensor<string, []>("op_480_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_481_cast_fp16 = mul(x = var_479_cast_fp16, y = var_480_to_fp16)[name = tensor<string, []>("op_481_cast_fp16")];
+            tensor<string, []> var_483_equation_0 = const()[name = tensor<string, []>("op_483_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_483_cast_fp16 = einsum(equation = var_483_equation_0, values = (var_289_cast_fp16, var_195_cast_fp16))[name = tensor<string, []>("op_483_cast_fp16")];
+            tensor<fp16, []> var_484_to_fp16 = const()[name = tensor<string, []>("op_484_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_485_cast_fp16 = mul(x = var_483_cast_fp16, y = var_484_to_fp16)[name = tensor<string, []>("op_485_cast_fp16")];
+            tensor<string, []> var_487_equation_0 = const()[name = tensor<string, []>("op_487_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_487_cast_fp16 = einsum(equation = var_487_equation_0, values = (var_289_cast_fp16, var_199_cast_fp16))[name = tensor<string, []>("op_487_cast_fp16")];
+            tensor<fp16, []> var_488_to_fp16 = const()[name = tensor<string, []>("op_488_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_489_cast_fp16 = mul(x = var_487_cast_fp16, y = var_488_to_fp16)[name = tensor<string, []>("op_489_cast_fp16")];
+            tensor<string, []> var_491_equation_0 = const()[name = tensor<string, []>("op_491_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_491_cast_fp16 = einsum(equation = var_491_equation_0, values = (var_301_cast_fp16, var_203_cast_fp16))[name = tensor<string, []>("op_491_cast_fp16")];
+            tensor<fp16, []> var_492_to_fp16 = const()[name = tensor<string, []>("op_492_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_493_cast_fp16 = mul(x = var_491_cast_fp16, y = var_492_to_fp16)[name = tensor<string, []>("op_493_cast_fp16")];
+            tensor<string, []> var_495_equation_0 = const()[name = tensor<string, []>("op_495_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_495_cast_fp16 = einsum(equation = var_495_equation_0, values = (var_301_cast_fp16, var_207_cast_fp16))[name = tensor<string, []>("op_495_cast_fp16")];
+            tensor<fp16, []> var_496_to_fp16 = const()[name = tensor<string, []>("op_496_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_497_cast_fp16 = mul(x = var_495_cast_fp16, y = var_496_to_fp16)[name = tensor<string, []>("op_497_cast_fp16")];
+            tensor<string, []> var_499_equation_0 = const()[name = tensor<string, []>("op_499_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_499_cast_fp16 = einsum(equation = var_499_equation_0, values = (var_301_cast_fp16, var_211_cast_fp16))[name = tensor<string, []>("op_499_cast_fp16")];
+            tensor<fp16, []> var_500_to_fp16 = const()[name = tensor<string, []>("op_500_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_501_cast_fp16 = mul(x = var_499_cast_fp16, y = var_500_to_fp16)[name = tensor<string, []>("op_501_cast_fp16")];
+            tensor<string, []> var_503_equation_0 = const()[name = tensor<string, []>("op_503_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_503_cast_fp16 = einsum(equation = var_503_equation_0, values = (var_313_cast_fp16, var_215_cast_fp16))[name = tensor<string, []>("op_503_cast_fp16")];
+            tensor<fp16, []> var_504_to_fp16 = const()[name = tensor<string, []>("op_504_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_505_cast_fp16 = mul(x = var_503_cast_fp16, y = var_504_to_fp16)[name = tensor<string, []>("op_505_cast_fp16")];
+            tensor<string, []> var_507_equation_0 = const()[name = tensor<string, []>("op_507_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_507_cast_fp16 = einsum(equation = var_507_equation_0, values = (var_313_cast_fp16, var_219_cast_fp16))[name = tensor<string, []>("op_507_cast_fp16")];
+            tensor<fp16, []> var_508_to_fp16 = const()[name = tensor<string, []>("op_508_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_509_cast_fp16 = mul(x = var_507_cast_fp16, y = var_508_to_fp16)[name = tensor<string, []>("op_509_cast_fp16")];
+            tensor<string, []> var_511_equation_0 = const()[name = tensor<string, []>("op_511_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_511_cast_fp16 = einsum(equation = var_511_equation_0, values = (var_313_cast_fp16, var_223_cast_fp16))[name = tensor<string, []>("op_511_cast_fp16")];
+            tensor<fp16, []> var_512_to_fp16 = const()[name = tensor<string, []>("op_512_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_513_cast_fp16 = mul(x = var_511_cast_fp16, y = var_512_to_fp16)[name = tensor<string, []>("op_513_cast_fp16")];
+            tensor<string, []> var_515_equation_0 = const()[name = tensor<string, []>("op_515_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_515_cast_fp16 = einsum(equation = var_515_equation_0, values = (var_325_cast_fp16, var_227_cast_fp16))[name = tensor<string, []>("op_515_cast_fp16")];
+            tensor<fp16, []> var_516_to_fp16 = const()[name = tensor<string, []>("op_516_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_517_cast_fp16 = mul(x = var_515_cast_fp16, y = var_516_to_fp16)[name = tensor<string, []>("op_517_cast_fp16")];
+            tensor<string, []> var_519_equation_0 = const()[name = tensor<string, []>("op_519_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_519_cast_fp16 = einsum(equation = var_519_equation_0, values = (var_325_cast_fp16, var_231_cast_fp16))[name = tensor<string, []>("op_519_cast_fp16")];
+            tensor<fp16, []> var_520_to_fp16 = const()[name = tensor<string, []>("op_520_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_521_cast_fp16 = mul(x = var_519_cast_fp16, y = var_520_to_fp16)[name = tensor<string, []>("op_521_cast_fp16")];
+            tensor<string, []> var_523_equation_0 = const()[name = tensor<string, []>("op_523_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_523_cast_fp16 = einsum(equation = var_523_equation_0, values = (var_325_cast_fp16, var_235_cast_fp16))[name = tensor<string, []>("op_523_cast_fp16")];
+            tensor<fp16, []> var_524_to_fp16 = const()[name = tensor<string, []>("op_524_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_525_cast_fp16 = mul(x = var_523_cast_fp16, y = var_524_to_fp16)[name = tensor<string, []>("op_525_cast_fp16")];
+            tensor<string, []> var_527_equation_0 = const()[name = tensor<string, []>("op_527_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_527_cast_fp16 = einsum(equation = var_527_equation_0, values = (var_337_cast_fp16, var_239_cast_fp16))[name = tensor<string, []>("op_527_cast_fp16")];
+            tensor<fp16, []> var_528_to_fp16 = const()[name = tensor<string, []>("op_528_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_529_cast_fp16 = mul(x = var_527_cast_fp16, y = var_528_to_fp16)[name = tensor<string, []>("op_529_cast_fp16")];
+            tensor<string, []> var_531_equation_0 = const()[name = tensor<string, []>("op_531_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_531_cast_fp16 = einsum(equation = var_531_equation_0, values = (var_337_cast_fp16, var_243_cast_fp16))[name = tensor<string, []>("op_531_cast_fp16")];
+            tensor<fp16, []> var_532_to_fp16 = const()[name = tensor<string, []>("op_532_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_533_cast_fp16 = mul(x = var_531_cast_fp16, y = var_532_to_fp16)[name = tensor<string, []>("op_533_cast_fp16")];
+            tensor<string, []> var_535_equation_0 = const()[name = tensor<string, []>("op_535_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_535_cast_fp16 = einsum(equation = var_535_equation_0, values = (var_337_cast_fp16, var_247_cast_fp16))[name = tensor<string, []>("op_535_cast_fp16")];
+            tensor<fp16, []> var_536_to_fp16 = const()[name = tensor<string, []>("op_536_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_537_cast_fp16 = mul(x = var_535_cast_fp16, y = var_536_to_fp16)[name = tensor<string, []>("op_537_cast_fp16")];
+            tensor<string, []> var_539_equation_0 = const()[name = tensor<string, []>("op_539_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_539_cast_fp16 = einsum(equation = var_539_equation_0, values = (var_349_cast_fp16, var_251_cast_fp16))[name = tensor<string, []>("op_539_cast_fp16")];
+            tensor<fp16, []> var_540_to_fp16 = const()[name = tensor<string, []>("op_540_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_541_cast_fp16 = mul(x = var_539_cast_fp16, y = var_540_to_fp16)[name = tensor<string, []>("op_541_cast_fp16")];
+            tensor<string, []> var_543_equation_0 = const()[name = tensor<string, []>("op_543_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_543_cast_fp16 = einsum(equation = var_543_equation_0, values = (var_349_cast_fp16, var_255_cast_fp16))[name = tensor<string, []>("op_543_cast_fp16")];
+            tensor<fp16, []> var_544_to_fp16 = const()[name = tensor<string, []>("op_544_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_545_cast_fp16 = mul(x = var_543_cast_fp16, y = var_544_to_fp16)[name = tensor<string, []>("op_545_cast_fp16")];
+            tensor<string, []> var_547_equation_0 = const()[name = tensor<string, []>("op_547_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_547_cast_fp16 = einsum(equation = var_547_equation_0, values = (var_349_cast_fp16, var_259_cast_fp16))[name = tensor<string, []>("op_547_cast_fp16")];
+            tensor<fp16, []> var_548_to_fp16 = const()[name = tensor<string, []>("op_548_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_549_cast_fp16 = mul(x = var_547_cast_fp16, y = var_548_to_fp16)[name = tensor<string, []>("op_549_cast_fp16")];
+            tensor<string, []> var_551_equation_0 = const()[name = tensor<string, []>("op_551_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_551_cast_fp16 = einsum(equation = var_551_equation_0, values = (var_361_cast_fp16, var_263_cast_fp16))[name = tensor<string, []>("op_551_cast_fp16")];
+            tensor<fp16, []> var_552_to_fp16 = const()[name = tensor<string, []>("op_552_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_553_cast_fp16 = mul(x = var_551_cast_fp16, y = var_552_to_fp16)[name = tensor<string, []>("op_553_cast_fp16")];
+            tensor<string, []> var_555_equation_0 = const()[name = tensor<string, []>("op_555_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_555_cast_fp16 = einsum(equation = var_555_equation_0, values = (var_361_cast_fp16, var_267_cast_fp16))[name = tensor<string, []>("op_555_cast_fp16")];
+            tensor<fp16, []> var_556_to_fp16 = const()[name = tensor<string, []>("op_556_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_557_cast_fp16 = mul(x = var_555_cast_fp16, y = var_556_to_fp16)[name = tensor<string, []>("op_557_cast_fp16")];
+            tensor<string, []> var_559_equation_0 = const()[name = tensor<string, []>("op_559_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_559_cast_fp16 = einsum(equation = var_559_equation_0, values = (var_361_cast_fp16, var_271_cast_fp16))[name = tensor<string, []>("op_559_cast_fp16")];
+            tensor<fp16, []> var_560_to_fp16 = const()[name = tensor<string, []>("op_560_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_561_cast_fp16 = mul(x = var_559_cast_fp16, y = var_560_to_fp16)[name = tensor<string, []>("op_561_cast_fp16")];
+            tensor<string, []> var_563_equation_0 = const()[name = tensor<string, []>("op_563_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_563_cast_fp16 = einsum(equation = var_563_equation_0, values = (var_373_cast_fp16, var_275_cast_fp16))[name = tensor<string, []>("op_563_cast_fp16")];
+            tensor<fp16, []> var_564_to_fp16 = const()[name = tensor<string, []>("op_564_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_565_cast_fp16 = mul(x = var_563_cast_fp16, y = var_564_to_fp16)[name = tensor<string, []>("op_565_cast_fp16")];
+            tensor<string, []> var_567_equation_0 = const()[name = tensor<string, []>("op_567_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_567_cast_fp16 = einsum(equation = var_567_equation_0, values = (var_373_cast_fp16, var_279_cast_fp16))[name = tensor<string, []>("op_567_cast_fp16")];
+            tensor<fp16, []> var_568_to_fp16 = const()[name = tensor<string, []>("op_568_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_569_cast_fp16 = mul(x = var_567_cast_fp16, y = var_568_to_fp16)[name = tensor<string, []>("op_569_cast_fp16")];
+            tensor<string, []> var_571_equation_0 = const()[name = tensor<string, []>("op_571_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_571_cast_fp16 = einsum(equation = var_571_equation_0, values = (var_373_cast_fp16, var_283_cast_fp16))[name = tensor<string, []>("op_571_cast_fp16")];
+            tensor<fp16, []> var_572_to_fp16 = const()[name = tensor<string, []>("op_572_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_573_cast_fp16 = mul(x = var_571_cast_fp16, y = var_572_to_fp16)[name = tensor<string, []>("op_573_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_1_cast_fp16 = add(x = var_481_cast_fp16, y = mask)[name = tensor<string, []>("aw_1_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_3_cast_fp16 = add(x = var_485_cast_fp16, y = mask)[name = tensor<string, []>("aw_3_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_5_cast_fp16 = add(x = var_489_cast_fp16, y = mask)[name = tensor<string, []>("aw_5_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_7_cast_fp16 = add(x = var_493_cast_fp16, y = mask)[name = tensor<string, []>("aw_7_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_9_cast_fp16 = add(x = var_497_cast_fp16, y = mask)[name = tensor<string, []>("aw_9_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_11_cast_fp16 = add(x = var_501_cast_fp16, y = mask)[name = tensor<string, []>("aw_11_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_13_cast_fp16 = add(x = var_505_cast_fp16, y = mask)[name = tensor<string, []>("aw_13_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_15_cast_fp16 = add(x = var_509_cast_fp16, y = mask)[name = tensor<string, []>("aw_15_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_17_cast_fp16 = add(x = var_513_cast_fp16, y = mask)[name = tensor<string, []>("aw_17_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_19_cast_fp16 = add(x = var_517_cast_fp16, y = mask)[name = tensor<string, []>("aw_19_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_21_cast_fp16 = add(x = var_521_cast_fp16, y = mask)[name = tensor<string, []>("aw_21_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_23_cast_fp16 = add(x = var_525_cast_fp16, y = mask)[name = tensor<string, []>("aw_23_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_25_cast_fp16 = add(x = var_529_cast_fp16, y = mask)[name = tensor<string, []>("aw_25_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_27_cast_fp16 = add(x = var_533_cast_fp16, y = mask)[name = tensor<string, []>("aw_27_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_29_cast_fp16 = add(x = var_537_cast_fp16, y = mask)[name = tensor<string, []>("aw_29_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_31_cast_fp16 = add(x = var_541_cast_fp16, y = mask)[name = tensor<string, []>("aw_31_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_33_cast_fp16 = add(x = var_545_cast_fp16, y = mask)[name = tensor<string, []>("aw_33_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_35_cast_fp16 = add(x = var_549_cast_fp16, y = mask)[name = tensor<string, []>("aw_35_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_37_cast_fp16 = add(x = var_553_cast_fp16, y = mask)[name = tensor<string, []>("aw_37_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_39_cast_fp16 = add(x = var_557_cast_fp16, y = mask)[name = tensor<string, []>("aw_39_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_41_cast_fp16 = add(x = var_561_cast_fp16, y = mask)[name = tensor<string, []>("aw_41_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_43_cast_fp16 = add(x = var_565_cast_fp16, y = mask)[name = tensor<string, []>("aw_43_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_45_cast_fp16 = add(x = var_569_cast_fp16, y = mask)[name = tensor<string, []>("aw_45_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_47_cast_fp16 = add(x = var_573_cast_fp16, y = mask)[name = tensor<string, []>("aw_47_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_598_cast_fp16 = softmax(axis = var_52, x = aw_1_cast_fp16)[name = tensor<string, []>("op_598_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_599_cast_fp16 = softmax(axis = var_52, x = aw_3_cast_fp16)[name = tensor<string, []>("op_599_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_600_cast_fp16 = softmax(axis = var_52, x = aw_5_cast_fp16)[name = tensor<string, []>("op_600_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_601_cast_fp16 = softmax(axis = var_52, x = aw_7_cast_fp16)[name = tensor<string, []>("op_601_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_602_cast_fp16 = softmax(axis = var_52, x = aw_9_cast_fp16)[name = tensor<string, []>("op_602_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_603_cast_fp16 = softmax(axis = var_52, x = aw_11_cast_fp16)[name = tensor<string, []>("op_603_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_604_cast_fp16 = softmax(axis = var_52, x = aw_13_cast_fp16)[name = tensor<string, []>("op_604_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_605_cast_fp16 = softmax(axis = var_52, x = aw_15_cast_fp16)[name = tensor<string, []>("op_605_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_606_cast_fp16 = softmax(axis = var_52, x = aw_17_cast_fp16)[name = tensor<string, []>("op_606_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_607_cast_fp16 = softmax(axis = var_52, x = aw_19_cast_fp16)[name = tensor<string, []>("op_607_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_608_cast_fp16 = softmax(axis = var_52, x = aw_21_cast_fp16)[name = tensor<string, []>("op_608_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_609_cast_fp16 = softmax(axis = var_52, x = aw_23_cast_fp16)[name = tensor<string, []>("op_609_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_610_cast_fp16 = softmax(axis = var_52, x = aw_25_cast_fp16)[name = tensor<string, []>("op_610_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_611_cast_fp16 = softmax(axis = var_52, x = aw_27_cast_fp16)[name = tensor<string, []>("op_611_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_612_cast_fp16 = softmax(axis = var_52, x = aw_29_cast_fp16)[name = tensor<string, []>("op_612_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_613_cast_fp16 = softmax(axis = var_52, x = aw_31_cast_fp16)[name = tensor<string, []>("op_613_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_614_cast_fp16 = softmax(axis = var_52, x = aw_33_cast_fp16)[name = tensor<string, []>("op_614_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_615_cast_fp16 = softmax(axis = var_52, x = aw_35_cast_fp16)[name = tensor<string, []>("op_615_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_616_cast_fp16 = softmax(axis = var_52, x = aw_37_cast_fp16)[name = tensor<string, []>("op_616_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_617_cast_fp16 = softmax(axis = var_52, x = aw_39_cast_fp16)[name = tensor<string, []>("op_617_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_618_cast_fp16 = softmax(axis = var_52, x = aw_41_cast_fp16)[name = tensor<string, []>("op_618_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_619_cast_fp16 = softmax(axis = var_52, x = aw_43_cast_fp16)[name = tensor<string, []>("op_619_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_620_cast_fp16 = softmax(axis = var_52, x = aw_45_cast_fp16)[name = tensor<string, []>("op_620_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_621_cast_fp16 = softmax(axis = var_52, x = aw_47_cast_fp16)[name = tensor<string, []>("op_621_cast_fp16")];
+            tensor<string, []> var_623_equation_0 = const()[name = tensor<string, []>("op_623_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_623_cast_fp16 = einsum(equation = var_623_equation_0, values = (var_383_cast_fp16, var_598_cast_fp16))[name = tensor<string, []>("op_623_cast_fp16")];
+            tensor<string, []> var_625_equation_0 = const()[name = tensor<string, []>("op_625_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_625_cast_fp16 = einsum(equation = var_625_equation_0, values = (var_383_cast_fp16, var_599_cast_fp16))[name = tensor<string, []>("op_625_cast_fp16")];
+            tensor<string, []> var_627_equation_0 = const()[name = tensor<string, []>("op_627_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_627_cast_fp16 = einsum(equation = var_627_equation_0, values = (var_383_cast_fp16, var_600_cast_fp16))[name = tensor<string, []>("op_627_cast_fp16")];
+            tensor<string, []> var_629_equation_0 = const()[name = tensor<string, []>("op_629_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_629_cast_fp16 = einsum(equation = var_629_equation_0, values = (var_395_cast_fp16, var_601_cast_fp16))[name = tensor<string, []>("op_629_cast_fp16")];
+            tensor<string, []> var_631_equation_0 = const()[name = tensor<string, []>("op_631_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_631_cast_fp16 = einsum(equation = var_631_equation_0, values = (var_395_cast_fp16, var_602_cast_fp16))[name = tensor<string, []>("op_631_cast_fp16")];
+            tensor<string, []> var_633_equation_0 = const()[name = tensor<string, []>("op_633_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_633_cast_fp16 = einsum(equation = var_633_equation_0, values = (var_395_cast_fp16, var_603_cast_fp16))[name = tensor<string, []>("op_633_cast_fp16")];
+            tensor<string, []> var_635_equation_0 = const()[name = tensor<string, []>("op_635_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_635_cast_fp16 = einsum(equation = var_635_equation_0, values = (var_407_cast_fp16, var_604_cast_fp16))[name = tensor<string, []>("op_635_cast_fp16")];
+            tensor<string, []> var_637_equation_0 = const()[name = tensor<string, []>("op_637_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_637_cast_fp16 = einsum(equation = var_637_equation_0, values = (var_407_cast_fp16, var_605_cast_fp16))[name = tensor<string, []>("op_637_cast_fp16")];
+            tensor<string, []> var_639_equation_0 = const()[name = tensor<string, []>("op_639_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_639_cast_fp16 = einsum(equation = var_639_equation_0, values = (var_407_cast_fp16, var_606_cast_fp16))[name = tensor<string, []>("op_639_cast_fp16")];
+            tensor<string, []> var_641_equation_0 = const()[name = tensor<string, []>("op_641_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_641_cast_fp16 = einsum(equation = var_641_equation_0, values = (var_419_cast_fp16, var_607_cast_fp16))[name = tensor<string, []>("op_641_cast_fp16")];
+            tensor<string, []> var_643_equation_0 = const()[name = tensor<string, []>("op_643_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_643_cast_fp16 = einsum(equation = var_643_equation_0, values = (var_419_cast_fp16, var_608_cast_fp16))[name = tensor<string, []>("op_643_cast_fp16")];
+            tensor<string, []> var_645_equation_0 = const()[name = tensor<string, []>("op_645_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_645_cast_fp16 = einsum(equation = var_645_equation_0, values = (var_419_cast_fp16, var_609_cast_fp16))[name = tensor<string, []>("op_645_cast_fp16")];
+            tensor<string, []> var_647_equation_0 = const()[name = tensor<string, []>("op_647_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_647_cast_fp16 = einsum(equation = var_647_equation_0, values = (var_431_cast_fp16, var_610_cast_fp16))[name = tensor<string, []>("op_647_cast_fp16")];
+            tensor<string, []> var_649_equation_0 = const()[name = tensor<string, []>("op_649_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_649_cast_fp16 = einsum(equation = var_649_equation_0, values = (var_431_cast_fp16, var_611_cast_fp16))[name = tensor<string, []>("op_649_cast_fp16")];
+            tensor<string, []> var_651_equation_0 = const()[name = tensor<string, []>("op_651_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_651_cast_fp16 = einsum(equation = var_651_equation_0, values = (var_431_cast_fp16, var_612_cast_fp16))[name = tensor<string, []>("op_651_cast_fp16")];
+            tensor<string, []> var_653_equation_0 = const()[name = tensor<string, []>("op_653_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_653_cast_fp16 = einsum(equation = var_653_equation_0, values = (var_443_cast_fp16, var_613_cast_fp16))[name = tensor<string, []>("op_653_cast_fp16")];
+            tensor<string, []> var_655_equation_0 = const()[name = tensor<string, []>("op_655_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_655_cast_fp16 = einsum(equation = var_655_equation_0, values = (var_443_cast_fp16, var_614_cast_fp16))[name = tensor<string, []>("op_655_cast_fp16")];
+            tensor<string, []> var_657_equation_0 = const()[name = tensor<string, []>("op_657_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_657_cast_fp16 = einsum(equation = var_657_equation_0, values = (var_443_cast_fp16, var_615_cast_fp16))[name = tensor<string, []>("op_657_cast_fp16")];
+            tensor<string, []> var_659_equation_0 = const()[name = tensor<string, []>("op_659_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_659_cast_fp16 = einsum(equation = var_659_equation_0, values = (var_455_cast_fp16, var_616_cast_fp16))[name = tensor<string, []>("op_659_cast_fp16")];
+            tensor<string, []> var_661_equation_0 = const()[name = tensor<string, []>("op_661_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_661_cast_fp16 = einsum(equation = var_661_equation_0, values = (var_455_cast_fp16, var_617_cast_fp16))[name = tensor<string, []>("op_661_cast_fp16")];
+            tensor<string, []> var_663_equation_0 = const()[name = tensor<string, []>("op_663_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_663_cast_fp16 = einsum(equation = var_663_equation_0, values = (var_455_cast_fp16, var_618_cast_fp16))[name = tensor<string, []>("op_663_cast_fp16")];
+            tensor<string, []> var_665_equation_0 = const()[name = tensor<string, []>("op_665_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_665_cast_fp16 = einsum(equation = var_665_equation_0, values = (var_467_cast_fp16, var_619_cast_fp16))[name = tensor<string, []>("op_665_cast_fp16")];
+            tensor<string, []> var_667_equation_0 = const()[name = tensor<string, []>("op_667_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_667_cast_fp16 = einsum(equation = var_667_equation_0, values = (var_467_cast_fp16, var_620_cast_fp16))[name = tensor<string, []>("op_667_cast_fp16")];
+            tensor<string, []> var_669_equation_0 = const()[name = tensor<string, []>("op_669_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_669_cast_fp16 = einsum(equation = var_669_equation_0, values = (var_467_cast_fp16, var_621_cast_fp16))[name = tensor<string, []>("op_669_cast_fp16")];
+            tensor<bool, []> x_11_interleave_0 = const()[name = tensor<string, []>("x_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_11_cast_fp16 = concat(axis = var_52, interleave = x_11_interleave_0, values = (var_623_cast_fp16, var_625_cast_fp16, var_627_cast_fp16, var_629_cast_fp16, var_631_cast_fp16, var_633_cast_fp16, var_635_cast_fp16, var_637_cast_fp16, var_639_cast_fp16, var_641_cast_fp16, var_643_cast_fp16, var_645_cast_fp16, var_647_cast_fp16, var_649_cast_fp16, var_651_cast_fp16, var_653_cast_fp16, var_655_cast_fp16, var_657_cast_fp16, var_659_cast_fp16, var_661_cast_fp16, var_663_cast_fp16, var_665_cast_fp16, var_667_cast_fp16, var_669_cast_fp16))[name = tensor<string, []>("x_11_cast_fp16")];
+            tensor<int32, [4]> var_674 = const()[name = tensor<string, []>("op_674"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_3_cast_fp16 = reshape(shape = var_674, x = x_11_cast_fp16)[name = tensor<string, []>("input_3_cast_fp16")];
+            tensor<int32, [2]> var_677 = const()[name = tensor<string, []>("op_677"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_679 = const()[name = tensor<string, []>("op_679"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_1_pad_type_0 = const()[name = tensor<string, []>("attention_output_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_1_pad_0 = const()[name = tensor<string, []>("attention_output_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31463936)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_1_cast_fp16 = conv(dilations = var_679, groups = var_52, pad = attention_output_1_pad_0, pad_type = attention_output_1_pad_type_0, strides = var_677, weight = blocks_0_attn_proj_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_13_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = tensor<string, []>("x_13_cast_fp16")];
+            tensor<bool, []> x_eps_3_interleave_0 = const()[name = tensor<string, []>("x_eps_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_3_to_fp16 = const()[name = tensor<string, []>("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338368)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_3_cast_fp16 = concat(axis = var_52, interleave = x_eps_3_interleave_0, values = (x_13_cast_fp16, eps_chan_3_to_fp16))[name = tensor<string, []>("x_eps_3_cast_fp16")];
+            tensor<int32, [1]> norm_x_3_axes_0 = const()[name = tensor<string, []>("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_55, x = x_eps_3_cast_fp16)[name = tensor<string, []>("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_7_cast_fp16 = real_div(x = x_13_cast_fp16, y = norm_x_3_cast_fp16)[name = tensor<string, []>("x_normed_7_cast_fp16")];
+            tensor<fp16, []> var_705_to_fp16 = const()[name = tensor<string, []>("op_705_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_705_to_fp16)[name = tensor<string, []>("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338560)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_5_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
+            tensor<int32, [2]> var_716 = const()[name = tensor<string, []>("op_716"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_718 = const()[name = tensor<string, []>("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_7_pad_type_0 = const()[name = tensor<string, []>("input_7_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_7_pad_0 = const()[name = tensor<string, []>("input_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50344768)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_7_cast_fp16 = conv(dilations = var_718, groups = var_52, pad = input_7_pad_0, pad_type = input_7_pad_type_0, strides = var_716, weight = blocks_0_mlp_fc_1_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
+            tensor<int32, [2]> var_722 = const()[name = tensor<string, []>("op_722"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_724 = const()[name = tensor<string, []>("op_724"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_1_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_1_pad_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(100676480)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_1_cast_fp16 = conv(dilations = var_724, groups = var_52, pad = x_fc_2_1_pad_0, pad_type = x_fc_2_1_pad_type_0, strides = var_722, weight = blocks_0_mlp_fc_2_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_727_cast_fp16 = silu(x = input_7_cast_fp16)[name = tensor<string, []>("op_727_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_9_cast_fp16 = mul(x = var_727_cast_fp16, y = x_fc_2_1_cast_fp16)[name = tensor<string, []>("input_9_cast_fp16")];
+            tensor<int32, [2]> var_730 = const()[name = tensor<string, []>("op_730"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_732 = const()[name = tensor<string, []>("op_732"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_734_pad_type_0 = const()[name = tensor<string, []>("op_734_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_734_pad_0 = const()[name = tensor<string, []>("op_734_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_0_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151008192)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_734_cast_fp16 = conv(dilations = var_732, groups = var_52, pad = var_734_pad_0, pad_type = var_734_pad_type_0, strides = var_730, weight = blocks_0_mlp_proj_weight_to_fp16, x = input_9_cast_fp16)[name = tensor<string, []>("op_734_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_17_cast_fp16 = add(x = var_734_cast_fp16, y = x_13_cast_fp16)[name = tensor<string, []>("x_17_cast_fp16")];
+            tensor<int32, []> var_740 = const()[name = tensor<string, []>("op_740"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_744 = const()[name = tensor<string, []>("op_744"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_746 = const()[name = tensor<string, []>("op_746"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_779 = const()[name = tensor<string, []>("op_779"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_782 = const()[name = tensor<string, []>("op_782"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_5_interleave_0 = const()[name = tensor<string, []>("x_eps_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_5_to_fp16 = const()[name = tensor<string, []>("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201339904)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_5_cast_fp16 = concat(axis = var_779, interleave = x_eps_5_interleave_0, values = (x_17_cast_fp16, eps_chan_5_to_fp16))[name = tensor<string, []>("x_eps_5_cast_fp16")];
+            tensor<int32, [1]> norm_x_5_axes_0 = const()[name = tensor<string, []>("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_782, x = x_eps_5_cast_fp16)[name = tensor<string, []>("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_13_cast_fp16 = real_div(x = x_17_cast_fp16, y = norm_x_5_cast_fp16)[name = tensor<string, []>("x_normed_13_cast_fp16")];
+            tensor<fp16, []> var_805_to_fp16 = const()[name = tensor<string, []>("op_805_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_805_to_fp16)[name = tensor<string, []>("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201340096)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_21_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = tensor<string, []>("x_21_cast_fp16")];
+            tensor<int32, [4]> var_829 = const()[name = tensor<string, []>("op_829"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_11_cast_fp16 = reshape(shape = var_829, x = x_21_cast_fp16)[name = tensor<string, []>("input_11_cast_fp16")];
+            tensor<int32, [2]> var_832 = const()[name = tensor<string, []>("op_832"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_834 = const()[name = tensor<string, []>("op_834"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_9_pad_type_0 = const()[name = tensor<string, []>("q_9_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_9_pad_0 = const()[name = tensor<string, []>("q_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201346304)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_9_cast_fp16 = conv(dilations = var_834, groups = var_779, pad = q_9_pad_0, pad_type = q_9_pad_type_0, strides = var_832, weight = blocks_1_attn_q_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("q_9_cast_fp16")];
+            tensor<int32, [2]> var_838 = const()[name = tensor<string, []>("op_838"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_840 = const()[name = tensor<string, []>("op_840"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_13_pad_type_0 = const()[name = tensor<string, []>("k_13_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_13_pad_0 = const()[name = tensor<string, []>("k_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(220220736)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_13_cast_fp16 = conv(dilations = var_840, groups = var_779, pad = k_13_pad_0, pad_type = k_13_pad_type_0, strides = var_838, weight = blocks_1_attn_k_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("k_13_cast_fp16")];
+            tensor<int32, [2]> var_844 = const()[name = tensor<string, []>("op_844"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_846 = const()[name = tensor<string, []>("op_846"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_11_pad_type_0 = const()[name = tensor<string, []>("v_11_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_11_pad_0 = const()[name = tensor<string, []>("v_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226512256)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_11_cast_fp16 = conv(dilations = var_846, groups = var_779, pad = v_11_pad_0, pad_type = v_11_pad_type_0, strides = var_844, weight = blocks_1_attn_v_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("v_11_cast_fp16")];
+            tensor<int32, [4]> var_849 = const()[name = tensor<string, []>("op_849"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_11_cast_fp16 = reshape(shape = var_849, x = q_9_cast_fp16)[name = tensor<string, []>("q_11_cast_fp16")];
+            tensor<int32, [4]> var_851 = const()[name = tensor<string, []>("op_851"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_15_cast_fp16 = reshape(shape = var_851, x = k_13_cast_fp16)[name = tensor<string, []>("k_15_cast_fp16")];
+            tensor<int32, [4]> var_865_begin_0 = const()[name = tensor<string, []>("op_865_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_865_end_0 = const()[name = tensor<string, []>("op_865_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_865_end_mask_0 = const()[name = tensor<string, []>("op_865_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_865_cast_fp16 = slice_by_index(begin = var_865_begin_0, end = var_865_end_0, end_mask = var_865_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_865_cast_fp16")];
+            tensor<int32, [4]> var_871_begin_0 = const()[name = tensor<string, []>("op_871_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_871_end_0 = const()[name = tensor<string, []>("op_871_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_871_end_mask_0 = const()[name = tensor<string, []>("op_871_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_871_cast_fp16 = slice_by_index(begin = var_871_begin_0, end = var_871_end_0, end_mask = var_871_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_871_cast_fp16")];
+            tensor<fp16, []> const_30_promoted_to_fp16 = const()[name = tensor<string, []>("const_30_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_873_cast_fp16 = mul(x = var_871_cast_fp16, y = const_30_promoted_to_fp16)[name = tensor<string, []>("op_873_cast_fp16")];
+            tensor<bool, []> rotated_5_interleave_0 = const()[name = tensor<string, []>("rotated_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_5_cast_fp16 = concat(axis = var_744, interleave = rotated_5_interleave_0, values = (var_873_cast_fp16, var_865_cast_fp16))[name = tensor<string, []>("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_876_cast_fp16 = mul(x = q_11_cast_fp16, y = cos)[name = tensor<string, []>("op_876_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_877_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = tensor<string, []>("op_877_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_5_cast_fp16 = add(x = var_876_cast_fp16, y = var_877_cast_fp16)[name = tensor<string, []>("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_890_begin_0 = const()[name = tensor<string, []>("op_890_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_890_end_0 = const()[name = tensor<string, []>("op_890_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_890_end_mask_0 = const()[name = tensor<string, []>("op_890_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_890_cast_fp16 = slice_by_index(begin = var_890_begin_0, end = var_890_end_0, end_mask = var_890_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_890_cast_fp16")];
+            tensor<int32, [4]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_896_cast_fp16 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_896_cast_fp16")];
+            tensor<fp16, []> const_32_promoted_to_fp16 = const()[name = tensor<string, []>("const_32_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_898_cast_fp16 = mul(x = var_896_cast_fp16, y = const_32_promoted_to_fp16)[name = tensor<string, []>("op_898_cast_fp16")];
+            tensor<bool, []> rotated_interleave_0 = const()[name = tensor<string, []>("rotated_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_cast_fp16 = concat(axis = var_744, interleave = rotated_interleave_0, values = (var_898_cast_fp16, var_890_cast_fp16))[name = tensor<string, []>("rotated_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_901_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = tensor<string, []>("op_901_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_902_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = tensor<string, []>("op_902_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_cast_fp16 = add(x = var_901_cast_fp16, y = var_902_cast_fp16)[name = tensor<string, []>("roped_cast_fp16")];
+            tensor<int32, [4]> var_905 = const()[name = tensor<string, []>("op_905"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_19_cast_fp16 = reshape(shape = var_905, x = roped_cast_fp16)[name = tensor<string, []>("k_19_cast_fp16")];
+            tensor<int32, [4]> var_907 = const()[name = tensor<string, []>("op_907"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_1 = reshape(shape = var_907, x = v_11_cast_fp16)[name = tensor<string, []>("new_v_cache_1_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_interleave_0 = const()[name = tensor<string, []>("k_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_1 = transpose(perm = k_21_perm_0, x = k_19_cast_fp16)[name = tensor<string, []>("transpose_0")];
+            tensor<fp16, [1, 512, 1, 1024]> k_cast_fp16 = concat(axis = var_746, interleave = k_interleave_0, values = (k_cache_1, new_k_cache_1))[name = tensor<string, []>("k_cast_fp16")];
+            tensor<bool, []> v_17_interleave_0 = const()[name = tensor<string, []>("v_17_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_17_cast_fp16 = concat(axis = var_740, interleave = v_17_interleave_0, values = (v_cache_1, new_v_cache_1))[name = tensor<string, []>("v_17_cast_fp16")];
+            tensor<int32, [4]> var_915 = const()[name = tensor<string, []>("op_915"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_cast_fp16 = reshape(shape = var_915, x = roped_5_cast_fp16)[name = tensor<string, []>("q_cast_fp16")];
+            tensor<int32, [4]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_920_cast_fp16 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_920_cast_fp16")];
+            tensor<int32, [4]> var_924_begin_0 = const()[name = tensor<string, []>("op_924_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_924_end_0 = const()[name = tensor<string, []>("op_924_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_924_end_mask_0 = const()[name = tensor<string, []>("op_924_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_924_cast_fp16 = slice_by_index(begin = var_924_begin_0, end = var_924_end_0, end_mask = var_924_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_924_cast_fp16")];
+            tensor<int32, [4]> var_928_begin_0 = const()[name = tensor<string, []>("op_928_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_928_end_0 = const()[name = tensor<string, []>("op_928_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_928_end_mask_0 = const()[name = tensor<string, []>("op_928_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_928_cast_fp16 = slice_by_index(begin = var_928_begin_0, end = var_928_end_0, end_mask = var_928_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_928_cast_fp16")];
+            tensor<int32, [4]> var_932_begin_0 = const()[name = tensor<string, []>("op_932_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_932_end_0 = const()[name = tensor<string, []>("op_932_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_932_end_mask_0 = const()[name = tensor<string, []>("op_932_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_932_cast_fp16 = slice_by_index(begin = var_932_begin_0, end = var_932_end_0, end_mask = var_932_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_932_cast_fp16")];
+            tensor<int32, [4]> var_936_begin_0 = const()[name = tensor<string, []>("op_936_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_936_end_0 = const()[name = tensor<string, []>("op_936_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_936_end_mask_0 = const()[name = tensor<string, []>("op_936_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_936_cast_fp16 = slice_by_index(begin = var_936_begin_0, end = var_936_end_0, end_mask = var_936_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_936_cast_fp16")];
+            tensor<int32, [4]> var_940_begin_0 = const()[name = tensor<string, []>("op_940_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_940_end_0 = const()[name = tensor<string, []>("op_940_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_940_end_mask_0 = const()[name = tensor<string, []>("op_940_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_940_cast_fp16 = slice_by_index(begin = var_940_begin_0, end = var_940_end_0, end_mask = var_940_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_940_cast_fp16")];
+            tensor<int32, [4]> var_944_begin_0 = const()[name = tensor<string, []>("op_944_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_944_end_0 = const()[name = tensor<string, []>("op_944_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_944_end_mask_0 = const()[name = tensor<string, []>("op_944_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_944_cast_fp16 = slice_by_index(begin = var_944_begin_0, end = var_944_end_0, end_mask = var_944_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_944_cast_fp16")];
+            tensor<int32, [4]> var_948_begin_0 = const()[name = tensor<string, []>("op_948_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_948_end_0 = const()[name = tensor<string, []>("op_948_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_948_end_mask_0 = const()[name = tensor<string, []>("op_948_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_948_cast_fp16 = slice_by_index(begin = var_948_begin_0, end = var_948_end_0, end_mask = var_948_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_948_cast_fp16")];
+            tensor<int32, [4]> var_952_begin_0 = const()[name = tensor<string, []>("op_952_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_952_end_0 = const()[name = tensor<string, []>("op_952_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_952_end_mask_0 = const()[name = tensor<string, []>("op_952_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_952_cast_fp16 = slice_by_index(begin = var_952_begin_0, end = var_952_end_0, end_mask = var_952_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_952_cast_fp16")];
+            tensor<int32, [4]> var_956_begin_0 = const()[name = tensor<string, []>("op_956_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_956_end_0 = const()[name = tensor<string, []>("op_956_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_956_end_mask_0 = const()[name = tensor<string, []>("op_956_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_956_cast_fp16 = slice_by_index(begin = var_956_begin_0, end = var_956_end_0, end_mask = var_956_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_956_cast_fp16")];
+            tensor<int32, [4]> var_960_begin_0 = const()[name = tensor<string, []>("op_960_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_960_end_0 = const()[name = tensor<string, []>("op_960_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_960_end_mask_0 = const()[name = tensor<string, []>("op_960_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_960_cast_fp16 = slice_by_index(begin = var_960_begin_0, end = var_960_end_0, end_mask = var_960_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_960_cast_fp16")];
+            tensor<int32, [4]> var_964_begin_0 = const()[name = tensor<string, []>("op_964_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_964_end_0 = const()[name = tensor<string, []>("op_964_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_964_end_mask_0 = const()[name = tensor<string, []>("op_964_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_964_cast_fp16 = slice_by_index(begin = var_964_begin_0, end = var_964_end_0, end_mask = var_964_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_964_cast_fp16")];
+            tensor<int32, [4]> var_968_begin_0 = const()[name = tensor<string, []>("op_968_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_968_end_0 = const()[name = tensor<string, []>("op_968_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_968_end_mask_0 = const()[name = tensor<string, []>("op_968_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_968_cast_fp16 = slice_by_index(begin = var_968_begin_0, end = var_968_end_0, end_mask = var_968_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_968_cast_fp16")];
+            tensor<int32, [4]> var_972_begin_0 = const()[name = tensor<string, []>("op_972_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_972_end_0 = const()[name = tensor<string, []>("op_972_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_972_end_mask_0 = const()[name = tensor<string, []>("op_972_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_972_cast_fp16 = slice_by_index(begin = var_972_begin_0, end = var_972_end_0, end_mask = var_972_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_972_cast_fp16")];
+            tensor<int32, [4]> var_976_begin_0 = const()[name = tensor<string, []>("op_976_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_976_end_0 = const()[name = tensor<string, []>("op_976_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_976_end_mask_0 = const()[name = tensor<string, []>("op_976_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_976_cast_fp16 = slice_by_index(begin = var_976_begin_0, end = var_976_end_0, end_mask = var_976_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_976_cast_fp16")];
+            tensor<int32, [4]> var_980_begin_0 = const()[name = tensor<string, []>("op_980_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_980_end_0 = const()[name = tensor<string, []>("op_980_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_980_end_mask_0 = const()[name = tensor<string, []>("op_980_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_980_cast_fp16 = slice_by_index(begin = var_980_begin_0, end = var_980_end_0, end_mask = var_980_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_980_cast_fp16")];
+            tensor<int32, [4]> var_984_begin_0 = const()[name = tensor<string, []>("op_984_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_984_end_0 = const()[name = tensor<string, []>("op_984_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_984_end_mask_0 = const()[name = tensor<string, []>("op_984_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_984_cast_fp16 = slice_by_index(begin = var_984_begin_0, end = var_984_end_0, end_mask = var_984_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_984_cast_fp16")];
+            tensor<int32, [4]> var_988_begin_0 = const()[name = tensor<string, []>("op_988_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_988_end_0 = const()[name = tensor<string, []>("op_988_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_988_end_mask_0 = const()[name = tensor<string, []>("op_988_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_988_cast_fp16 = slice_by_index(begin = var_988_begin_0, end = var_988_end_0, end_mask = var_988_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_988_cast_fp16")];
+            tensor<int32, [4]> var_992_begin_0 = const()[name = tensor<string, []>("op_992_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_992_end_0 = const()[name = tensor<string, []>("op_992_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_992_end_mask_0 = const()[name = tensor<string, []>("op_992_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_992_cast_fp16 = slice_by_index(begin = var_992_begin_0, end = var_992_end_0, end_mask = var_992_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_992_cast_fp16")];
+            tensor<int32, [4]> var_996_begin_0 = const()[name = tensor<string, []>("op_996_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_996_end_0 = const()[name = tensor<string, []>("op_996_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_996_end_mask_0 = const()[name = tensor<string, []>("op_996_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_996_cast_fp16 = slice_by_index(begin = var_996_begin_0, end = var_996_end_0, end_mask = var_996_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_996_cast_fp16")];
+            tensor<int32, [4]> var_1000_begin_0 = const()[name = tensor<string, []>("op_1000_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_1000_end_0 = const()[name = tensor<string, []>("op_1000_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_1000_end_mask_0 = const()[name = tensor<string, []>("op_1000_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1000_cast_fp16 = slice_by_index(begin = var_1000_begin_0, end = var_1000_end_0, end_mask = var_1000_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1000_cast_fp16")];
+            tensor<int32, [4]> var_1004_begin_0 = const()[name = tensor<string, []>("op_1004_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_1004_end_0 = const()[name = tensor<string, []>("op_1004_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_1004_end_mask_0 = const()[name = tensor<string, []>("op_1004_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1004_cast_fp16 = slice_by_index(begin = var_1004_begin_0, end = var_1004_end_0, end_mask = var_1004_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1004_cast_fp16")];
+            tensor<int32, [4]> var_1008_begin_0 = const()[name = tensor<string, []>("op_1008_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_1008_end_0 = const()[name = tensor<string, []>("op_1008_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_1008_end_mask_0 = const()[name = tensor<string, []>("op_1008_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1008_cast_fp16 = slice_by_index(begin = var_1008_begin_0, end = var_1008_end_0, end_mask = var_1008_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1008_cast_fp16")];
+            tensor<int32, [4]> var_1012_begin_0 = const()[name = tensor<string, []>("op_1012_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_1012_end_0 = const()[name = tensor<string, []>("op_1012_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_1012_end_mask_0 = const()[name = tensor<string, []>("op_1012_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1012_cast_fp16 = slice_by_index(begin = var_1012_begin_0, end = var_1012_end_0, end_mask = var_1012_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1012_cast_fp16")];
+            tensor<int32, [4]> var_1018_begin_0 = const()[name = tensor<string, []>("op_1018_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1018_end_0 = const()[name = tensor<string, []>("op_1018_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_1018_end_mask_0 = const()[name = tensor<string, []>("op_1018_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1018_cast_fp16 = slice_by_index(begin = var_1018_begin_0, end = var_1018_end_0, end_mask = var_1018_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1018_cast_fp16")];
+            tensor<int32, [4]> var_1030_begin_0 = const()[name = tensor<string, []>("op_1030_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_1030_end_0 = const()[name = tensor<string, []>("op_1030_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_1030_end_mask_0 = const()[name = tensor<string, []>("op_1030_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1030_cast_fp16 = slice_by_index(begin = var_1030_begin_0, end = var_1030_end_0, end_mask = var_1030_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1030_cast_fp16")];
+            tensor<int32, [4]> var_1042_begin_0 = const()[name = tensor<string, []>("op_1042_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_1042_end_0 = const()[name = tensor<string, []>("op_1042_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_1042_end_mask_0 = const()[name = tensor<string, []>("op_1042_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1042_cast_fp16 = slice_by_index(begin = var_1042_begin_0, end = var_1042_end_0, end_mask = var_1042_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1042_cast_fp16")];
+            tensor<int32, [4]> var_1054_begin_0 = const()[name = tensor<string, []>("op_1054_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_1054_end_0 = const()[name = tensor<string, []>("op_1054_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1054_end_mask_0 = const()[name = tensor<string, []>("op_1054_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1054_cast_fp16 = slice_by_index(begin = var_1054_begin_0, end = var_1054_end_0, end_mask = var_1054_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1054_cast_fp16")];
+            tensor<int32, [4]> var_1066_begin_0 = const()[name = tensor<string, []>("op_1066_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_1066_end_0 = const()[name = tensor<string, []>("op_1066_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_1066_end_mask_0 = const()[name = tensor<string, []>("op_1066_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1066_cast_fp16 = slice_by_index(begin = var_1066_begin_0, end = var_1066_end_0, end_mask = var_1066_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1066_cast_fp16")];
+            tensor<int32, [4]> var_1078_begin_0 = const()[name = tensor<string, []>("op_1078_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_1078_end_0 = const()[name = tensor<string, []>("op_1078_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_1078_end_mask_0 = const()[name = tensor<string, []>("op_1078_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1078_cast_fp16 = slice_by_index(begin = var_1078_begin_0, end = var_1078_end_0, end_mask = var_1078_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1078_cast_fp16")];
+            tensor<int32, [4]> var_1090_begin_0 = const()[name = tensor<string, []>("op_1090_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_1090_end_0 = const()[name = tensor<string, []>("op_1090_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_1090_end_mask_0 = const()[name = tensor<string, []>("op_1090_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1090_cast_fp16 = slice_by_index(begin = var_1090_begin_0, end = var_1090_end_0, end_mask = var_1090_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1090_cast_fp16")];
+            tensor<int32, [4]> var_1102_begin_0 = const()[name = tensor<string, []>("op_1102_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_1102_end_0 = const()[name = tensor<string, []>("op_1102_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_1102_end_mask_0 = const()[name = tensor<string, []>("op_1102_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1102_cast_fp16 = slice_by_index(begin = var_1102_begin_0, end = var_1102_end_0, end_mask = var_1102_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1102_cast_fp16")];
+            tensor<int32, [4]> var_1112_begin_0 = const()[name = tensor<string, []>("op_1112_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1112_end_0 = const()[name = tensor<string, []>("op_1112_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_1112_end_mask_0 = const()[name = tensor<string, []>("op_1112_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1112_cast_fp16 = slice_by_index(begin = var_1112_begin_0, end = var_1112_end_0, end_mask = var_1112_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1112_cast_fp16")];
+            tensor<int32, [4]> var_1124_begin_0 = const()[name = tensor<string, []>("op_1124_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_1124_end_0 = const()[name = tensor<string, []>("op_1124_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_1124_end_mask_0 = const()[name = tensor<string, []>("op_1124_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1124_cast_fp16 = slice_by_index(begin = var_1124_begin_0, end = var_1124_end_0, end_mask = var_1124_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1124_cast_fp16")];
+            tensor<int32, [4]> var_1136_begin_0 = const()[name = tensor<string, []>("op_1136_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_1136_end_0 = const()[name = tensor<string, []>("op_1136_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_1136_end_mask_0 = const()[name = tensor<string, []>("op_1136_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1136_cast_fp16 = slice_by_index(begin = var_1136_begin_0, end = var_1136_end_0, end_mask = var_1136_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1136_cast_fp16")];
+            tensor<int32, [4]> var_1148_begin_0 = const()[name = tensor<string, []>("op_1148_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_1148_end_0 = const()[name = tensor<string, []>("op_1148_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1148_end_mask_0 = const()[name = tensor<string, []>("op_1148_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1148_cast_fp16 = slice_by_index(begin = var_1148_begin_0, end = var_1148_end_0, end_mask = var_1148_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1148_cast_fp16")];
+            tensor<int32, [4]> var_1160_begin_0 = const()[name = tensor<string, []>("op_1160_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_1160_end_0 = const()[name = tensor<string, []>("op_1160_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_1160_end_mask_0 = const()[name = tensor<string, []>("op_1160_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1160_cast_fp16 = slice_by_index(begin = var_1160_begin_0, end = var_1160_end_0, end_mask = var_1160_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1160_cast_fp16")];
+            tensor<int32, [4]> var_1172_begin_0 = const()[name = tensor<string, []>("op_1172_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_1172_end_0 = const()[name = tensor<string, []>("op_1172_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_1172_end_mask_0 = const()[name = tensor<string, []>("op_1172_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1172_cast_fp16 = slice_by_index(begin = var_1172_begin_0, end = var_1172_end_0, end_mask = var_1172_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1172_cast_fp16")];
+            tensor<int32, [4]> var_1184_begin_0 = const()[name = tensor<string, []>("op_1184_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_1184_end_0 = const()[name = tensor<string, []>("op_1184_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_1184_end_mask_0 = const()[name = tensor<string, []>("op_1184_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1184_cast_fp16 = slice_by_index(begin = var_1184_begin_0, end = var_1184_end_0, end_mask = var_1184_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1184_cast_fp16")];
+            tensor<int32, [4]> var_1196_begin_0 = const()[name = tensor<string, []>("op_1196_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_1196_end_0 = const()[name = tensor<string, []>("op_1196_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_1196_end_mask_0 = const()[name = tensor<string, []>("op_1196_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1196_cast_fp16 = slice_by_index(begin = var_1196_begin_0, end = var_1196_end_0, end_mask = var_1196_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1196_cast_fp16")];
+            tensor<string, []> var_1208_equation_0 = const()[name = tensor<string, []>("op_1208_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1208_cast_fp16 = einsum(equation = var_1208_equation_0, values = (var_1018_cast_fp16, var_920_cast_fp16))[name = tensor<string, []>("op_1208_cast_fp16")];
+            tensor<fp16, []> var_1209_to_fp16 = const()[name = tensor<string, []>("op_1209_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1210_cast_fp16 = mul(x = var_1208_cast_fp16, y = var_1209_to_fp16)[name = tensor<string, []>("op_1210_cast_fp16")];
+            tensor<string, []> var_1212_equation_0 = const()[name = tensor<string, []>("op_1212_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1212_cast_fp16 = einsum(equation = var_1212_equation_0, values = (var_1018_cast_fp16, var_924_cast_fp16))[name = tensor<string, []>("op_1212_cast_fp16")];
+            tensor<fp16, []> var_1213_to_fp16 = const()[name = tensor<string, []>("op_1213_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1214_cast_fp16 = mul(x = var_1212_cast_fp16, y = var_1213_to_fp16)[name = tensor<string, []>("op_1214_cast_fp16")];
+            tensor<string, []> var_1216_equation_0 = const()[name = tensor<string, []>("op_1216_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1216_cast_fp16 = einsum(equation = var_1216_equation_0, values = (var_1018_cast_fp16, var_928_cast_fp16))[name = tensor<string, []>("op_1216_cast_fp16")];
+            tensor<fp16, []> var_1217_to_fp16 = const()[name = tensor<string, []>("op_1217_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1218_cast_fp16 = mul(x = var_1216_cast_fp16, y = var_1217_to_fp16)[name = tensor<string, []>("op_1218_cast_fp16")];
+            tensor<string, []> var_1220_equation_0 = const()[name = tensor<string, []>("op_1220_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1220_cast_fp16 = einsum(equation = var_1220_equation_0, values = (var_1030_cast_fp16, var_932_cast_fp16))[name = tensor<string, []>("op_1220_cast_fp16")];
+            tensor<fp16, []> var_1221_to_fp16 = const()[name = tensor<string, []>("op_1221_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1222_cast_fp16 = mul(x = var_1220_cast_fp16, y = var_1221_to_fp16)[name = tensor<string, []>("op_1222_cast_fp16")];
+            tensor<string, []> var_1224_equation_0 = const()[name = tensor<string, []>("op_1224_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1224_cast_fp16 = einsum(equation = var_1224_equation_0, values = (var_1030_cast_fp16, var_936_cast_fp16))[name = tensor<string, []>("op_1224_cast_fp16")];
+            tensor<fp16, []> var_1225_to_fp16 = const()[name = tensor<string, []>("op_1225_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1226_cast_fp16 = mul(x = var_1224_cast_fp16, y = var_1225_to_fp16)[name = tensor<string, []>("op_1226_cast_fp16")];
+            tensor<string, []> var_1228_equation_0 = const()[name = tensor<string, []>("op_1228_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1228_cast_fp16 = einsum(equation = var_1228_equation_0, values = (var_1030_cast_fp16, var_940_cast_fp16))[name = tensor<string, []>("op_1228_cast_fp16")];
+            tensor<fp16, []> var_1229_to_fp16 = const()[name = tensor<string, []>("op_1229_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1230_cast_fp16 = mul(x = var_1228_cast_fp16, y = var_1229_to_fp16)[name = tensor<string, []>("op_1230_cast_fp16")];
+            tensor<string, []> var_1232_equation_0 = const()[name = tensor<string, []>("op_1232_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1232_cast_fp16 = einsum(equation = var_1232_equation_0, values = (var_1042_cast_fp16, var_944_cast_fp16))[name = tensor<string, []>("op_1232_cast_fp16")];
+            tensor<fp16, []> var_1233_to_fp16 = const()[name = tensor<string, []>("op_1233_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1234_cast_fp16 = mul(x = var_1232_cast_fp16, y = var_1233_to_fp16)[name = tensor<string, []>("op_1234_cast_fp16")];
+            tensor<string, []> var_1236_equation_0 = const()[name = tensor<string, []>("op_1236_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1236_cast_fp16 = einsum(equation = var_1236_equation_0, values = (var_1042_cast_fp16, var_948_cast_fp16))[name = tensor<string, []>("op_1236_cast_fp16")];
+            tensor<fp16, []> var_1237_to_fp16 = const()[name = tensor<string, []>("op_1237_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1238_cast_fp16 = mul(x = var_1236_cast_fp16, y = var_1237_to_fp16)[name = tensor<string, []>("op_1238_cast_fp16")];
+            tensor<string, []> var_1240_equation_0 = const()[name = tensor<string, []>("op_1240_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1240_cast_fp16 = einsum(equation = var_1240_equation_0, values = (var_1042_cast_fp16, var_952_cast_fp16))[name = tensor<string, []>("op_1240_cast_fp16")];
+            tensor<fp16, []> var_1241_to_fp16 = const()[name = tensor<string, []>("op_1241_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1242_cast_fp16 = mul(x = var_1240_cast_fp16, y = var_1241_to_fp16)[name = tensor<string, []>("op_1242_cast_fp16")];
+            tensor<string, []> var_1244_equation_0 = const()[name = tensor<string, []>("op_1244_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1244_cast_fp16 = einsum(equation = var_1244_equation_0, values = (var_1054_cast_fp16, var_956_cast_fp16))[name = tensor<string, []>("op_1244_cast_fp16")];
+            tensor<fp16, []> var_1245_to_fp16 = const()[name = tensor<string, []>("op_1245_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1246_cast_fp16 = mul(x = var_1244_cast_fp16, y = var_1245_to_fp16)[name = tensor<string, []>("op_1246_cast_fp16")];
+            tensor<string, []> var_1248_equation_0 = const()[name = tensor<string, []>("op_1248_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1248_cast_fp16 = einsum(equation = var_1248_equation_0, values = (var_1054_cast_fp16, var_960_cast_fp16))[name = tensor<string, []>("op_1248_cast_fp16")];
+            tensor<fp16, []> var_1249_to_fp16 = const()[name = tensor<string, []>("op_1249_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1250_cast_fp16 = mul(x = var_1248_cast_fp16, y = var_1249_to_fp16)[name = tensor<string, []>("op_1250_cast_fp16")];
+            tensor<string, []> var_1252_equation_0 = const()[name = tensor<string, []>("op_1252_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1252_cast_fp16 = einsum(equation = var_1252_equation_0, values = (var_1054_cast_fp16, var_964_cast_fp16))[name = tensor<string, []>("op_1252_cast_fp16")];
+            tensor<fp16, []> var_1253_to_fp16 = const()[name = tensor<string, []>("op_1253_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1254_cast_fp16 = mul(x = var_1252_cast_fp16, y = var_1253_to_fp16)[name = tensor<string, []>("op_1254_cast_fp16")];
+            tensor<string, []> var_1256_equation_0 = const()[name = tensor<string, []>("op_1256_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1256_cast_fp16 = einsum(equation = var_1256_equation_0, values = (var_1066_cast_fp16, var_968_cast_fp16))[name = tensor<string, []>("op_1256_cast_fp16")];
+            tensor<fp16, []> var_1257_to_fp16 = const()[name = tensor<string, []>("op_1257_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1258_cast_fp16 = mul(x = var_1256_cast_fp16, y = var_1257_to_fp16)[name = tensor<string, []>("op_1258_cast_fp16")];
+            tensor<string, []> var_1260_equation_0 = const()[name = tensor<string, []>("op_1260_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1260_cast_fp16 = einsum(equation = var_1260_equation_0, values = (var_1066_cast_fp16, var_972_cast_fp16))[name = tensor<string, []>("op_1260_cast_fp16")];
+            tensor<fp16, []> var_1261_to_fp16 = const()[name = tensor<string, []>("op_1261_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1262_cast_fp16 = mul(x = var_1260_cast_fp16, y = var_1261_to_fp16)[name = tensor<string, []>("op_1262_cast_fp16")];
+            tensor<string, []> var_1264_equation_0 = const()[name = tensor<string, []>("op_1264_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1264_cast_fp16 = einsum(equation = var_1264_equation_0, values = (var_1066_cast_fp16, var_976_cast_fp16))[name = tensor<string, []>("op_1264_cast_fp16")];
+            tensor<fp16, []> var_1265_to_fp16 = const()[name = tensor<string, []>("op_1265_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1266_cast_fp16 = mul(x = var_1264_cast_fp16, y = var_1265_to_fp16)[name = tensor<string, []>("op_1266_cast_fp16")];
+            tensor<string, []> var_1268_equation_0 = const()[name = tensor<string, []>("op_1268_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1268_cast_fp16 = einsum(equation = var_1268_equation_0, values = (var_1078_cast_fp16, var_980_cast_fp16))[name = tensor<string, []>("op_1268_cast_fp16")];
+            tensor<fp16, []> var_1269_to_fp16 = const()[name = tensor<string, []>("op_1269_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1270_cast_fp16 = mul(x = var_1268_cast_fp16, y = var_1269_to_fp16)[name = tensor<string, []>("op_1270_cast_fp16")];
+            tensor<string, []> var_1272_equation_0 = const()[name = tensor<string, []>("op_1272_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1272_cast_fp16 = einsum(equation = var_1272_equation_0, values = (var_1078_cast_fp16, var_984_cast_fp16))[name = tensor<string, []>("op_1272_cast_fp16")];
+            tensor<fp16, []> var_1273_to_fp16 = const()[name = tensor<string, []>("op_1273_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1274_cast_fp16 = mul(x = var_1272_cast_fp16, y = var_1273_to_fp16)[name = tensor<string, []>("op_1274_cast_fp16")];
+            tensor<string, []> var_1276_equation_0 = const()[name = tensor<string, []>("op_1276_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1276_cast_fp16 = einsum(equation = var_1276_equation_0, values = (var_1078_cast_fp16, var_988_cast_fp16))[name = tensor<string, []>("op_1276_cast_fp16")];
+            tensor<fp16, []> var_1277_to_fp16 = const()[name = tensor<string, []>("op_1277_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1278_cast_fp16 = mul(x = var_1276_cast_fp16, y = var_1277_to_fp16)[name = tensor<string, []>("op_1278_cast_fp16")];
+            tensor<string, []> var_1280_equation_0 = const()[name = tensor<string, []>("op_1280_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1280_cast_fp16 = einsum(equation = var_1280_equation_0, values = (var_1090_cast_fp16, var_992_cast_fp16))[name = tensor<string, []>("op_1280_cast_fp16")];
+            tensor<fp16, []> var_1281_to_fp16 = const()[name = tensor<string, []>("op_1281_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1282_cast_fp16 = mul(x = var_1280_cast_fp16, y = var_1281_to_fp16)[name = tensor<string, []>("op_1282_cast_fp16")];
+            tensor<string, []> var_1284_equation_0 = const()[name = tensor<string, []>("op_1284_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1284_cast_fp16 = einsum(equation = var_1284_equation_0, values = (var_1090_cast_fp16, var_996_cast_fp16))[name = tensor<string, []>("op_1284_cast_fp16")];
+            tensor<fp16, []> var_1285_to_fp16 = const()[name = tensor<string, []>("op_1285_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1286_cast_fp16 = mul(x = var_1284_cast_fp16, y = var_1285_to_fp16)[name = tensor<string, []>("op_1286_cast_fp16")];
+            tensor<string, []> var_1288_equation_0 = const()[name = tensor<string, []>("op_1288_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1288_cast_fp16 = einsum(equation = var_1288_equation_0, values = (var_1090_cast_fp16, var_1000_cast_fp16))[name = tensor<string, []>("op_1288_cast_fp16")];
+            tensor<fp16, []> var_1289_to_fp16 = const()[name = tensor<string, []>("op_1289_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1290_cast_fp16 = mul(x = var_1288_cast_fp16, y = var_1289_to_fp16)[name = tensor<string, []>("op_1290_cast_fp16")];
+            tensor<string, []> var_1292_equation_0 = const()[name = tensor<string, []>("op_1292_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1292_cast_fp16 = einsum(equation = var_1292_equation_0, values = (var_1102_cast_fp16, var_1004_cast_fp16))[name = tensor<string, []>("op_1292_cast_fp16")];
+            tensor<fp16, []> var_1293_to_fp16 = const()[name = tensor<string, []>("op_1293_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1294_cast_fp16 = mul(x = var_1292_cast_fp16, y = var_1293_to_fp16)[name = tensor<string, []>("op_1294_cast_fp16")];
+            tensor<string, []> var_1296_equation_0 = const()[name = tensor<string, []>("op_1296_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1296_cast_fp16 = einsum(equation = var_1296_equation_0, values = (var_1102_cast_fp16, var_1008_cast_fp16))[name = tensor<string, []>("op_1296_cast_fp16")];
+            tensor<fp16, []> var_1297_to_fp16 = const()[name = tensor<string, []>("op_1297_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1298_cast_fp16 = mul(x = var_1296_cast_fp16, y = var_1297_to_fp16)[name = tensor<string, []>("op_1298_cast_fp16")];
+            tensor<string, []> var_1300_equation_0 = const()[name = tensor<string, []>("op_1300_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1300_cast_fp16 = einsum(equation = var_1300_equation_0, values = (var_1102_cast_fp16, var_1012_cast_fp16))[name = tensor<string, []>("op_1300_cast_fp16")];
+            tensor<fp16, []> var_1301_to_fp16 = const()[name = tensor<string, []>("op_1301_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1302_cast_fp16 = mul(x = var_1300_cast_fp16, y = var_1301_to_fp16)[name = tensor<string, []>("op_1302_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_49_cast_fp16 = add(x = var_1210_cast_fp16, y = mask)[name = tensor<string, []>("aw_49_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_51_cast_fp16 = add(x = var_1214_cast_fp16, y = mask)[name = tensor<string, []>("aw_51_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_53_cast_fp16 = add(x = var_1218_cast_fp16, y = mask)[name = tensor<string, []>("aw_53_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_55_cast_fp16 = add(x = var_1222_cast_fp16, y = mask)[name = tensor<string, []>("aw_55_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_57_cast_fp16 = add(x = var_1226_cast_fp16, y = mask)[name = tensor<string, []>("aw_57_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_59_cast_fp16 = add(x = var_1230_cast_fp16, y = mask)[name = tensor<string, []>("aw_59_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_61_cast_fp16 = add(x = var_1234_cast_fp16, y = mask)[name = tensor<string, []>("aw_61_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_63_cast_fp16 = add(x = var_1238_cast_fp16, y = mask)[name = tensor<string, []>("aw_63_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_65_cast_fp16 = add(x = var_1242_cast_fp16, y = mask)[name = tensor<string, []>("aw_65_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_67_cast_fp16 = add(x = var_1246_cast_fp16, y = mask)[name = tensor<string, []>("aw_67_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_69_cast_fp16 = add(x = var_1250_cast_fp16, y = mask)[name = tensor<string, []>("aw_69_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_71_cast_fp16 = add(x = var_1254_cast_fp16, y = mask)[name = tensor<string, []>("aw_71_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_73_cast_fp16 = add(x = var_1258_cast_fp16, y = mask)[name = tensor<string, []>("aw_73_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_75_cast_fp16 = add(x = var_1262_cast_fp16, y = mask)[name = tensor<string, []>("aw_75_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_77_cast_fp16 = add(x = var_1266_cast_fp16, y = mask)[name = tensor<string, []>("aw_77_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_79_cast_fp16 = add(x = var_1270_cast_fp16, y = mask)[name = tensor<string, []>("aw_79_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_81_cast_fp16 = add(x = var_1274_cast_fp16, y = mask)[name = tensor<string, []>("aw_81_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_83_cast_fp16 = add(x = var_1278_cast_fp16, y = mask)[name = tensor<string, []>("aw_83_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_85_cast_fp16 = add(x = var_1282_cast_fp16, y = mask)[name = tensor<string, []>("aw_85_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_87_cast_fp16 = add(x = var_1286_cast_fp16, y = mask)[name = tensor<string, []>("aw_87_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_89_cast_fp16 = add(x = var_1290_cast_fp16, y = mask)[name = tensor<string, []>("aw_89_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_91_cast_fp16 = add(x = var_1294_cast_fp16, y = mask)[name = tensor<string, []>("aw_91_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_93_cast_fp16 = add(x = var_1298_cast_fp16, y = mask)[name = tensor<string, []>("aw_93_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_cast_fp16 = add(x = var_1302_cast_fp16, y = mask)[name = tensor<string, []>("aw_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1327_cast_fp16 = softmax(axis = var_779, x = aw_49_cast_fp16)[name = tensor<string, []>("op_1327_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1328_cast_fp16 = softmax(axis = var_779, x = aw_51_cast_fp16)[name = tensor<string, []>("op_1328_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1329_cast_fp16 = softmax(axis = var_779, x = aw_53_cast_fp16)[name = tensor<string, []>("op_1329_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1330_cast_fp16 = softmax(axis = var_779, x = aw_55_cast_fp16)[name = tensor<string, []>("op_1330_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1331_cast_fp16 = softmax(axis = var_779, x = aw_57_cast_fp16)[name = tensor<string, []>("op_1331_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1332_cast_fp16 = softmax(axis = var_779, x = aw_59_cast_fp16)[name = tensor<string, []>("op_1332_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1333_cast_fp16 = softmax(axis = var_779, x = aw_61_cast_fp16)[name = tensor<string, []>("op_1333_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1334_cast_fp16 = softmax(axis = var_779, x = aw_63_cast_fp16)[name = tensor<string, []>("op_1334_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1335_cast_fp16 = softmax(axis = var_779, x = aw_65_cast_fp16)[name = tensor<string, []>("op_1335_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1336_cast_fp16 = softmax(axis = var_779, x = aw_67_cast_fp16)[name = tensor<string, []>("op_1336_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1337_cast_fp16 = softmax(axis = var_779, x = aw_69_cast_fp16)[name = tensor<string, []>("op_1337_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1338_cast_fp16 = softmax(axis = var_779, x = aw_71_cast_fp16)[name = tensor<string, []>("op_1338_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1339_cast_fp16 = softmax(axis = var_779, x = aw_73_cast_fp16)[name = tensor<string, []>("op_1339_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1340_cast_fp16 = softmax(axis = var_779, x = aw_75_cast_fp16)[name = tensor<string, []>("op_1340_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1341_cast_fp16 = softmax(axis = var_779, x = aw_77_cast_fp16)[name = tensor<string, []>("op_1341_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1342_cast_fp16 = softmax(axis = var_779, x = aw_79_cast_fp16)[name = tensor<string, []>("op_1342_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1343_cast_fp16 = softmax(axis = var_779, x = aw_81_cast_fp16)[name = tensor<string, []>("op_1343_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1344_cast_fp16 = softmax(axis = var_779, x = aw_83_cast_fp16)[name = tensor<string, []>("op_1344_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1345_cast_fp16 = softmax(axis = var_779, x = aw_85_cast_fp16)[name = tensor<string, []>("op_1345_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1346_cast_fp16 = softmax(axis = var_779, x = aw_87_cast_fp16)[name = tensor<string, []>("op_1346_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1347_cast_fp16 = softmax(axis = var_779, x = aw_89_cast_fp16)[name = tensor<string, []>("op_1347_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1348_cast_fp16 = softmax(axis = var_779, x = aw_91_cast_fp16)[name = tensor<string, []>("op_1348_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1349_cast_fp16 = softmax(axis = var_779, x = aw_93_cast_fp16)[name = tensor<string, []>("op_1349_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1350_cast_fp16 = softmax(axis = var_779, x = aw_cast_fp16)[name = tensor<string, []>("op_1350_cast_fp16")];
+            tensor<string, []> var_1352_equation_0 = const()[name = tensor<string, []>("op_1352_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1352_cast_fp16 = einsum(equation = var_1352_equation_0, values = (var_1112_cast_fp16, var_1327_cast_fp16))[name = tensor<string, []>("op_1352_cast_fp16")];
+            tensor<string, []> var_1354_equation_0 = const()[name = tensor<string, []>("op_1354_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1354_cast_fp16 = einsum(equation = var_1354_equation_0, values = (var_1112_cast_fp16, var_1328_cast_fp16))[name = tensor<string, []>("op_1354_cast_fp16")];
+            tensor<string, []> var_1356_equation_0 = const()[name = tensor<string, []>("op_1356_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1356_cast_fp16 = einsum(equation = var_1356_equation_0, values = (var_1112_cast_fp16, var_1329_cast_fp16))[name = tensor<string, []>("op_1356_cast_fp16")];
+            tensor<string, []> var_1358_equation_0 = const()[name = tensor<string, []>("op_1358_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1358_cast_fp16 = einsum(equation = var_1358_equation_0, values = (var_1124_cast_fp16, var_1330_cast_fp16))[name = tensor<string, []>("op_1358_cast_fp16")];
+            tensor<string, []> var_1360_equation_0 = const()[name = tensor<string, []>("op_1360_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1360_cast_fp16 = einsum(equation = var_1360_equation_0, values = (var_1124_cast_fp16, var_1331_cast_fp16))[name = tensor<string, []>("op_1360_cast_fp16")];
+            tensor<string, []> var_1362_equation_0 = const()[name = tensor<string, []>("op_1362_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1362_cast_fp16 = einsum(equation = var_1362_equation_0, values = (var_1124_cast_fp16, var_1332_cast_fp16))[name = tensor<string, []>("op_1362_cast_fp16")];
+            tensor<string, []> var_1364_equation_0 = const()[name = tensor<string, []>("op_1364_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1364_cast_fp16 = einsum(equation = var_1364_equation_0, values = (var_1136_cast_fp16, var_1333_cast_fp16))[name = tensor<string, []>("op_1364_cast_fp16")];
+            tensor<string, []> var_1366_equation_0 = const()[name = tensor<string, []>("op_1366_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1366_cast_fp16 = einsum(equation = var_1366_equation_0, values = (var_1136_cast_fp16, var_1334_cast_fp16))[name = tensor<string, []>("op_1366_cast_fp16")];
+            tensor<string, []> var_1368_equation_0 = const()[name = tensor<string, []>("op_1368_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1368_cast_fp16 = einsum(equation = var_1368_equation_0, values = (var_1136_cast_fp16, var_1335_cast_fp16))[name = tensor<string, []>("op_1368_cast_fp16")];
+            tensor<string, []> var_1370_equation_0 = const()[name = tensor<string, []>("op_1370_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1370_cast_fp16 = einsum(equation = var_1370_equation_0, values = (var_1148_cast_fp16, var_1336_cast_fp16))[name = tensor<string, []>("op_1370_cast_fp16")];
+            tensor<string, []> var_1372_equation_0 = const()[name = tensor<string, []>("op_1372_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1372_cast_fp16 = einsum(equation = var_1372_equation_0, values = (var_1148_cast_fp16, var_1337_cast_fp16))[name = tensor<string, []>("op_1372_cast_fp16")];
+            tensor<string, []> var_1374_equation_0 = const()[name = tensor<string, []>("op_1374_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1374_cast_fp16 = einsum(equation = var_1374_equation_0, values = (var_1148_cast_fp16, var_1338_cast_fp16))[name = tensor<string, []>("op_1374_cast_fp16")];
+            tensor<string, []> var_1376_equation_0 = const()[name = tensor<string, []>("op_1376_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1376_cast_fp16 = einsum(equation = var_1376_equation_0, values = (var_1160_cast_fp16, var_1339_cast_fp16))[name = tensor<string, []>("op_1376_cast_fp16")];
+            tensor<string, []> var_1378_equation_0 = const()[name = tensor<string, []>("op_1378_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1378_cast_fp16 = einsum(equation = var_1378_equation_0, values = (var_1160_cast_fp16, var_1340_cast_fp16))[name = tensor<string, []>("op_1378_cast_fp16")];
+            tensor<string, []> var_1380_equation_0 = const()[name = tensor<string, []>("op_1380_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1380_cast_fp16 = einsum(equation = var_1380_equation_0, values = (var_1160_cast_fp16, var_1341_cast_fp16))[name = tensor<string, []>("op_1380_cast_fp16")];
+            tensor<string, []> var_1382_equation_0 = const()[name = tensor<string, []>("op_1382_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1382_cast_fp16 = einsum(equation = var_1382_equation_0, values = (var_1172_cast_fp16, var_1342_cast_fp16))[name = tensor<string, []>("op_1382_cast_fp16")];
+            tensor<string, []> var_1384_equation_0 = const()[name = tensor<string, []>("op_1384_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1384_cast_fp16 = einsum(equation = var_1384_equation_0, values = (var_1172_cast_fp16, var_1343_cast_fp16))[name = tensor<string, []>("op_1384_cast_fp16")];
+            tensor<string, []> var_1386_equation_0 = const()[name = tensor<string, []>("op_1386_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1386_cast_fp16 = einsum(equation = var_1386_equation_0, values = (var_1172_cast_fp16, var_1344_cast_fp16))[name = tensor<string, []>("op_1386_cast_fp16")];
+            tensor<string, []> var_1388_equation_0 = const()[name = tensor<string, []>("op_1388_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1388_cast_fp16 = einsum(equation = var_1388_equation_0, values = (var_1184_cast_fp16, var_1345_cast_fp16))[name = tensor<string, []>("op_1388_cast_fp16")];
+            tensor<string, []> var_1390_equation_0 = const()[name = tensor<string, []>("op_1390_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1390_cast_fp16 = einsum(equation = var_1390_equation_0, values = (var_1184_cast_fp16, var_1346_cast_fp16))[name = tensor<string, []>("op_1390_cast_fp16")];
+            tensor<string, []> var_1392_equation_0 = const()[name = tensor<string, []>("op_1392_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1392_cast_fp16 = einsum(equation = var_1392_equation_0, values = (var_1184_cast_fp16, var_1347_cast_fp16))[name = tensor<string, []>("op_1392_cast_fp16")];
+            tensor<string, []> var_1394_equation_0 = const()[name = tensor<string, []>("op_1394_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1394_cast_fp16 = einsum(equation = var_1394_equation_0, values = (var_1196_cast_fp16, var_1348_cast_fp16))[name = tensor<string, []>("op_1394_cast_fp16")];
+            tensor<string, []> var_1396_equation_0 = const()[name = tensor<string, []>("op_1396_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1396_cast_fp16 = einsum(equation = var_1396_equation_0, values = (var_1196_cast_fp16, var_1349_cast_fp16))[name = tensor<string, []>("op_1396_cast_fp16")];
+            tensor<string, []> var_1398_equation_0 = const()[name = tensor<string, []>("op_1398_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1398_cast_fp16 = einsum(equation = var_1398_equation_0, values = (var_1196_cast_fp16, var_1350_cast_fp16))[name = tensor<string, []>("op_1398_cast_fp16")];
+            tensor<bool, []> x_27_interleave_0 = const()[name = tensor<string, []>("x_27_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_27_cast_fp16 = concat(axis = var_779, interleave = x_27_interleave_0, values = (var_1352_cast_fp16, var_1354_cast_fp16, var_1356_cast_fp16, var_1358_cast_fp16, var_1360_cast_fp16, var_1362_cast_fp16, var_1364_cast_fp16, var_1366_cast_fp16, var_1368_cast_fp16, var_1370_cast_fp16, var_1372_cast_fp16, var_1374_cast_fp16, var_1376_cast_fp16, var_1378_cast_fp16, var_1380_cast_fp16, var_1382_cast_fp16, var_1384_cast_fp16, var_1386_cast_fp16, var_1388_cast_fp16, var_1390_cast_fp16, var_1392_cast_fp16, var_1394_cast_fp16, var_1396_cast_fp16, var_1398_cast_fp16))[name = tensor<string, []>("x_27_cast_fp16")];
+            tensor<int32, [4]> var_1403 = const()[name = tensor<string, []>("op_1403"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_13_cast_fp16 = reshape(shape = var_1403, x = x_27_cast_fp16)[name = tensor<string, []>("input_13_cast_fp16")];
+            tensor<int32, [2]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1408 = const()[name = tensor<string, []>("op_1408"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_pad_type_0 = const()[name = tensor<string, []>("attention_output_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_pad_0 = const()[name = tensor<string, []>("attention_output_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(232803776)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_cast_fp16 = conv(dilations = var_1408, groups = var_779, pad = attention_output_pad_0, pad_type = attention_output_pad_type_0, strides = var_1406, weight = blocks_1_attn_proj_weight_to_fp16, x = input_13_cast_fp16)[name = tensor<string, []>("attention_output_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_29_cast_fp16 = add(x = attention_output_cast_fp16, y = x_17_cast_fp16)[name = tensor<string, []>("x_29_cast_fp16")];
+            tensor<bool, []> x_eps_interleave_0 = const()[name = tensor<string, []>("x_eps_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_to_fp16 = const()[name = tensor<string, []>("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678208)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_cast_fp16 = concat(axis = var_779, interleave = x_eps_interleave_0, values = (x_29_cast_fp16, eps_chan_to_fp16))[name = tensor<string, []>("x_eps_cast_fp16")];
+            tensor<int32, [1]> norm_x_axes_0 = const()[name = tensor<string, []>("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_782, x = x_eps_cast_fp16)[name = tensor<string, []>("norm_x_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_19_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_cast_fp16)[name = tensor<string, []>("x_normed_19_cast_fp16")];
+            tensor<fp16, []> var_1434_to_fp16 = const()[name = tensor<string, []>("op_1434_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_1434_to_fp16)[name = tensor<string, []>("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678400)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_15_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = tensor<string, []>("input_15_cast_fp16")];
+            tensor<int32, [2]> var_1445 = const()[name = tensor<string, []>("op_1445"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1447 = const()[name = tensor<string, []>("op_1447"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_17_pad_type_0 = const()[name = tensor<string, []>("input_17_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_17_pad_0 = const()[name = tensor<string, []>("input_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251684608)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_17_cast_fp16 = conv(dilations = var_1447, groups = var_779, pad = input_17_pad_0, pad_type = input_17_pad_type_0, strides = var_1445, weight = blocks_1_mlp_fc_1_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("input_17_cast_fp16")];
+            tensor<int32, [2]> var_1451 = const()[name = tensor<string, []>("op_1451"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1453 = const()[name = tensor<string, []>("op_1453"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_pad_0 = const()[name = tensor<string, []>("x_fc_2_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(302016320)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_cast_fp16 = conv(dilations = var_1453, groups = var_779, pad = x_fc_2_pad_0, pad_type = x_fc_2_pad_type_0, strides = var_1451, weight = blocks_1_mlp_fc_2_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_1456_cast_fp16 = silu(x = input_17_cast_fp16)[name = tensor<string, []>("op_1456_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_cast_fp16 = mul(x = var_1456_cast_fp16, y = x_fc_2_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
+            tensor<int32, [2]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1463_pad_type_0 = const()[name = tensor<string, []>("op_1463_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1463_pad_0 = const()[name = tensor<string, []>("op_1463_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_1_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(352348032)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_1463_cast_fp16 = conv(dilations = var_1461, groups = var_779, pad = var_1463_pad_0, pad_type = var_1463_pad_type_0, strides = var_1459, weight = blocks_1_mlp_proj_weight_to_fp16, x = input_cast_fp16)[name = tensor<string, []>("op_1463_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> new_x = add(x = var_1463_cast_fp16, y = x_29_cast_fp16)[name = tensor<string, []>("op_1464_cast_fp16")];
+        } -> (new_x, new_k_cache_0, new_v_cache_0, new_k_cache_1, new_v_cache_1);
+}
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk11.mlmodelc/weights/weight.bin b/Llama-3.2-3B-Instruct_chunk11.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d9c855b7b9002f9da80db355b0c7893d31cbd976
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk11.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d05d3ea64682e5a7113cac2eafb36ecb827788c42f94832abe42470a06e6bd90
+size 402679744
diff --git a/Llama-3.2-3B-Instruct_chunk12.mlmodelc/analytics/coremldata.bin b/Llama-3.2-3B-Instruct_chunk12.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a63af39cde8e590e41fffd270ab8aede737490d
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk12.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf21e446e7587de3fd840eae95f3e79729298df568725552f7ef5fd8f954e58c
+size 243
diff --git a/Llama-3.2-3B-Instruct_chunk12.mlmodelc/coremldata.bin b/Llama-3.2-3B-Instruct_chunk12.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ef844658693d8a7fc2951abf2761f8f5f9bc62c3
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk12.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8129d684aa1ea8b76708a186fe44f7ffc4aa08b4854907105fe41c0825e71875
+size 653
diff --git a/Llama-3.2-3B-Instruct_chunk12.mlmodelc/metadata.json b/Llama-3.2-3B-Instruct_chunk12.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..0764699f180c1456e847c7b2e0f5b8c1083d8acc
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk12.mlmodelc/metadata.json
@@ -0,0 +1,178 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "new_x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 7,
+    "mlProgramOperationTypeHistogram" : {
+      "Concat" : 14,
+      "Ios16.mul" : 70,
+      "SliceByIndex" : 88,
+      "Transpose" : 2,
+      "Ios16.einsum" : 96,
+      "Ios16.conv" : 14,
+      "Ios16.add" : 56,
+      "Ios16.realDiv" : 4,
+      "Ios16.softmax" : 48,
+      "Ios16.reduceL2Norm" : 4,
+      "Ios16.reshape" : 14,
+      "Ios16.silu" : 2
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "13.0",
+      "tvOS" : "16.0",
+      "visionOS" : "1.0",
+      "watchOS" : "9.0",
+      "iOS" : "16.0",
+      "macCatalyst" : "16.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.source" : "torch==2.1.0",
+      "com.github.apple.coremltools.version" : "8.0b1"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "cos",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "sin",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 512 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 512, 1, 64]",
+        "name" : "mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "Llama_3_2_3B_Instruct_2024_11_09_16_14_37_chunk12",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk12.mlmodelc/model.mil b/Llama-3.2-3B-Instruct_chunk12.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..78594b4291dc45ae43652f9a31200581b19ad3c6
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk12.mlmodelc/model.mil
@@ -0,0 +1,956 @@
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3304.5.2"}, {"coremlc-version", "3304.6.2"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
+{
+    func main<ios16>(tensor<fp16, [128, 64]> cos, tensor<fp16, [1, 448, 1, 1024]> k_cache_0, tensor<fp16, [1, 448, 1, 1024]> k_cache_1, tensor<fp16, [1, 512, 1, 64]> mask, tensor<fp16, [128, 64]> sin, tensor<fp16, [1, 1024, 1, 448]> v_cache_0, tensor<fp16, [1, 1024, 1, 448]> v_cache_1, tensor<fp16, [1, 3072, 8, 8]> x) [CoreML_InputDefaultValues = dict<tensor<string, []>, tensor<fp32, []>>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}})] {
+            tensor<int32, []> var_13 = const()[name = tensor<string, []>("op_13"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_19 = const()[name = tensor<string, []>("op_19"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_55 = const()[name = tensor<string, []>("op_55"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_1_interleave_0 = const()[name = tensor<string, []>("x_eps_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_1_to_fp16 = const()[name = tensor<string, []>("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_1_cast_fp16 = concat(axis = var_52, interleave = x_eps_1_interleave_0, values = (x, eps_chan_1_to_fp16))[name = tensor<string, []>("x_eps_1_cast_fp16")];
+            tensor<int32, [1]> norm_x_1_axes_0 = const()[name = tensor<string, []>("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_55, x = x_eps_1_cast_fp16)[name = tensor<string, []>("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = tensor<string, []>("x_normed_1_cast_fp16")];
+            tensor<fp16, []> var_79_to_fp16 = const()[name = tensor<string, []>("op_79_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_79_to_fp16)[name = tensor<string, []>("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(256)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = tensor<string, []>("x_5_cast_fp16")];
+            tensor<int32, [4]> var_100 = const()[name = tensor<string, []>("op_100"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_1_cast_fp16 = reshape(shape = var_100, x = x_5_cast_fp16)[name = tensor<string, []>("input_1_cast_fp16")];
+            tensor<int32, [2]> var_103 = const()[name = tensor<string, []>("op_103"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_105 = const()[name = tensor<string, []>("op_105"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_1_pad_type_0 = const()[name = tensor<string, []>("q_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_1_pad_0 = const()[name = tensor<string, []>("q_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6464)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_1_cast_fp16 = conv(dilations = var_105, groups = var_52, pad = q_1_pad_0, pad_type = q_1_pad_type_0, strides = var_103, weight = blocks_0_attn_q_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("q_1_cast_fp16")];
+            tensor<int32, [2]> var_109 = const()[name = tensor<string, []>("op_109"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_111 = const()[name = tensor<string, []>("op_111"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_1_pad_type_0 = const()[name = tensor<string, []>("k_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_1_pad_0 = const()[name = tensor<string, []>("k_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18880896)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_1_cast_fp16 = conv(dilations = var_111, groups = var_52, pad = k_1_pad_0, pad_type = k_1_pad_type_0, strides = var_109, weight = blocks_0_attn_k_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("k_1_cast_fp16")];
+            tensor<int32, [2]> var_115 = const()[name = tensor<string, []>("op_115"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_117 = const()[name = tensor<string, []>("op_117"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_1_pad_type_0 = const()[name = tensor<string, []>("v_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_1_pad_0 = const()[name = tensor<string, []>("v_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25172416)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_1_cast_fp16 = conv(dilations = var_117, groups = var_52, pad = v_1_pad_0, pad_type = v_1_pad_type_0, strides = var_115, weight = blocks_0_attn_v_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("v_1_cast_fp16")];
+            tensor<int32, [4]> var_120 = const()[name = tensor<string, []>("op_120"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_3_cast_fp16 = reshape(shape = var_120, x = q_1_cast_fp16)[name = tensor<string, []>("q_3_cast_fp16")];
+            tensor<int32, [4]> var_122 = const()[name = tensor<string, []>("op_122"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_3_cast_fp16 = reshape(shape = var_122, x = k_1_cast_fp16)[name = tensor<string, []>("k_3_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = tensor<string, []>("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = tensor<string, []>("op_136_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = tensor<string, []>("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = tensor<string, []>("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = tensor<string, []>("op_142_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = tensor<string, []>("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_142_cast_fp16")];
+            tensor<fp16, []> const_10_promoted_to_fp16 = const()[name = tensor<string, []>("const_10_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_10_promoted_to_fp16)[name = tensor<string, []>("op_144_cast_fp16")];
+            tensor<bool, []> rotated_1_interleave_0 = const()[name = tensor<string, []>("rotated_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_1_cast_fp16 = concat(axis = var_17, interleave = rotated_1_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = tensor<string, []>("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_147_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = tensor<string, []>("op_147_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_148_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = tensor<string, []>("op_148_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_1_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = tensor<string, []>("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_161_begin_0 = const()[name = tensor<string, []>("op_161_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_161_end_0 = const()[name = tensor<string, []>("op_161_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_161_end_mask_0 = const()[name = tensor<string, []>("op_161_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_161_cast_fp16 = slice_by_index(begin = var_161_begin_0, end = var_161_end_0, end_mask = var_161_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_161_cast_fp16")];
+            tensor<int32, [4]> var_167_begin_0 = const()[name = tensor<string, []>("op_167_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_167_end_0 = const()[name = tensor<string, []>("op_167_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_167_end_mask_0 = const()[name = tensor<string, []>("op_167_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_167_cast_fp16 = slice_by_index(begin = var_167_begin_0, end = var_167_end_0, end_mask = var_167_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_167_cast_fp16")];
+            tensor<fp16, []> const_12_promoted_to_fp16 = const()[name = tensor<string, []>("const_12_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_169_cast_fp16 = mul(x = var_167_cast_fp16, y = const_12_promoted_to_fp16)[name = tensor<string, []>("op_169_cast_fp16")];
+            tensor<bool, []> rotated_3_interleave_0 = const()[name = tensor<string, []>("rotated_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_3_cast_fp16 = concat(axis = var_17, interleave = rotated_3_interleave_0, values = (var_169_cast_fp16, var_161_cast_fp16))[name = tensor<string, []>("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_172_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = tensor<string, []>("op_172_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_173_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = tensor<string, []>("op_173_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_3_cast_fp16 = add(x = var_172_cast_fp16, y = var_173_cast_fp16)[name = tensor<string, []>("roped_3_cast_fp16")];
+            tensor<int32, [4]> var_176 = const()[name = tensor<string, []>("op_176"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_7_cast_fp16 = reshape(shape = var_176, x = roped_3_cast_fp16)[name = tensor<string, []>("k_7_cast_fp16")];
+            tensor<int32, [4]> var_178 = const()[name = tensor<string, []>("op_178"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_0 = reshape(shape = var_178, x = v_1_cast_fp16)[name = tensor<string, []>("new_v_cache_0_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_11_interleave_0 = const()[name = tensor<string, []>("k_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_0 = transpose(perm = k_9_perm_0, x = k_7_cast_fp16)[name = tensor<string, []>("transpose_1")];
+            tensor<fp16, [1, 512, 1, 1024]> k_11_cast_fp16 = concat(axis = var_19, interleave = k_11_interleave_0, values = (k_cache_0, new_k_cache_0))[name = tensor<string, []>("k_11_cast_fp16")];
+            tensor<bool, []> v_7_interleave_0 = const()[name = tensor<string, []>("v_7_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_7_cast_fp16 = concat(axis = var_13, interleave = v_7_interleave_0, values = (v_cache_0, new_v_cache_0))[name = tensor<string, []>("v_7_cast_fp16")];
+            tensor<int32, [4]> var_186 = const()[name = tensor<string, []>("op_186"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_7_cast_fp16 = reshape(shape = var_186, x = roped_1_cast_fp16)[name = tensor<string, []>("q_7_cast_fp16")];
+            tensor<int32, [4]> var_191_begin_0 = const()[name = tensor<string, []>("op_191_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_191_end_0 = const()[name = tensor<string, []>("op_191_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_191_end_mask_0 = const()[name = tensor<string, []>("op_191_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_191_cast_fp16 = slice_by_index(begin = var_191_begin_0, end = var_191_end_0, end_mask = var_191_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_191_cast_fp16")];
+            tensor<int32, [4]> var_195_begin_0 = const()[name = tensor<string, []>("op_195_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_195_end_0 = const()[name = tensor<string, []>("op_195_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_195_end_mask_0 = const()[name = tensor<string, []>("op_195_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_195_cast_fp16 = slice_by_index(begin = var_195_begin_0, end = var_195_end_0, end_mask = var_195_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_195_cast_fp16")];
+            tensor<int32, [4]> var_199_begin_0 = const()[name = tensor<string, []>("op_199_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_199_end_0 = const()[name = tensor<string, []>("op_199_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_199_end_mask_0 = const()[name = tensor<string, []>("op_199_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_199_cast_fp16 = slice_by_index(begin = var_199_begin_0, end = var_199_end_0, end_mask = var_199_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_199_cast_fp16")];
+            tensor<int32, [4]> var_203_begin_0 = const()[name = tensor<string, []>("op_203_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_203_end_0 = const()[name = tensor<string, []>("op_203_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_203_end_mask_0 = const()[name = tensor<string, []>("op_203_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_203_cast_fp16 = slice_by_index(begin = var_203_begin_0, end = var_203_end_0, end_mask = var_203_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_203_cast_fp16")];
+            tensor<int32, [4]> var_207_begin_0 = const()[name = tensor<string, []>("op_207_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_207_end_0 = const()[name = tensor<string, []>("op_207_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_207_end_mask_0 = const()[name = tensor<string, []>("op_207_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_207_cast_fp16 = slice_by_index(begin = var_207_begin_0, end = var_207_end_0, end_mask = var_207_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_207_cast_fp16")];
+            tensor<int32, [4]> var_211_begin_0 = const()[name = tensor<string, []>("op_211_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_211_end_0 = const()[name = tensor<string, []>("op_211_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_211_end_mask_0 = const()[name = tensor<string, []>("op_211_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_211_cast_fp16 = slice_by_index(begin = var_211_begin_0, end = var_211_end_0, end_mask = var_211_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_211_cast_fp16")];
+            tensor<int32, [4]> var_215_begin_0 = const()[name = tensor<string, []>("op_215_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_215_end_0 = const()[name = tensor<string, []>("op_215_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_215_end_mask_0 = const()[name = tensor<string, []>("op_215_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_215_cast_fp16 = slice_by_index(begin = var_215_begin_0, end = var_215_end_0, end_mask = var_215_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_215_cast_fp16")];
+            tensor<int32, [4]> var_219_begin_0 = const()[name = tensor<string, []>("op_219_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_219_end_0 = const()[name = tensor<string, []>("op_219_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_219_end_mask_0 = const()[name = tensor<string, []>("op_219_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_219_cast_fp16 = slice_by_index(begin = var_219_begin_0, end = var_219_end_0, end_mask = var_219_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_219_cast_fp16")];
+            tensor<int32, [4]> var_223_begin_0 = const()[name = tensor<string, []>("op_223_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_223_end_0 = const()[name = tensor<string, []>("op_223_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_223_end_mask_0 = const()[name = tensor<string, []>("op_223_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_223_cast_fp16 = slice_by_index(begin = var_223_begin_0, end = var_223_end_0, end_mask = var_223_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_223_cast_fp16")];
+            tensor<int32, [4]> var_227_begin_0 = const()[name = tensor<string, []>("op_227_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_227_end_0 = const()[name = tensor<string, []>("op_227_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_227_end_mask_0 = const()[name = tensor<string, []>("op_227_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_227_cast_fp16 = slice_by_index(begin = var_227_begin_0, end = var_227_end_0, end_mask = var_227_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_227_cast_fp16")];
+            tensor<int32, [4]> var_231_begin_0 = const()[name = tensor<string, []>("op_231_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_231_end_0 = const()[name = tensor<string, []>("op_231_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_231_end_mask_0 = const()[name = tensor<string, []>("op_231_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_231_cast_fp16 = slice_by_index(begin = var_231_begin_0, end = var_231_end_0, end_mask = var_231_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_231_cast_fp16")];
+            tensor<int32, [4]> var_235_begin_0 = const()[name = tensor<string, []>("op_235_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_235_end_0 = const()[name = tensor<string, []>("op_235_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_235_end_mask_0 = const()[name = tensor<string, []>("op_235_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_235_cast_fp16 = slice_by_index(begin = var_235_begin_0, end = var_235_end_0, end_mask = var_235_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_235_cast_fp16")];
+            tensor<int32, [4]> var_239_begin_0 = const()[name = tensor<string, []>("op_239_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_239_end_0 = const()[name = tensor<string, []>("op_239_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_239_end_mask_0 = const()[name = tensor<string, []>("op_239_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_239_cast_fp16 = slice_by_index(begin = var_239_begin_0, end = var_239_end_0, end_mask = var_239_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_239_cast_fp16")];
+            tensor<int32, [4]> var_243_begin_0 = const()[name = tensor<string, []>("op_243_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_243_end_0 = const()[name = tensor<string, []>("op_243_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_243_end_mask_0 = const()[name = tensor<string, []>("op_243_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_243_cast_fp16 = slice_by_index(begin = var_243_begin_0, end = var_243_end_0, end_mask = var_243_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_243_cast_fp16")];
+            tensor<int32, [4]> var_247_begin_0 = const()[name = tensor<string, []>("op_247_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_247_end_0 = const()[name = tensor<string, []>("op_247_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_247_end_mask_0 = const()[name = tensor<string, []>("op_247_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_247_cast_fp16 = slice_by_index(begin = var_247_begin_0, end = var_247_end_0, end_mask = var_247_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_247_cast_fp16")];
+            tensor<int32, [4]> var_251_begin_0 = const()[name = tensor<string, []>("op_251_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_251_end_0 = const()[name = tensor<string, []>("op_251_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_251_end_mask_0 = const()[name = tensor<string, []>("op_251_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_251_cast_fp16 = slice_by_index(begin = var_251_begin_0, end = var_251_end_0, end_mask = var_251_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_251_cast_fp16")];
+            tensor<int32, [4]> var_255_begin_0 = const()[name = tensor<string, []>("op_255_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_255_end_0 = const()[name = tensor<string, []>("op_255_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_255_end_mask_0 = const()[name = tensor<string, []>("op_255_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_255_cast_fp16 = slice_by_index(begin = var_255_begin_0, end = var_255_end_0, end_mask = var_255_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_255_cast_fp16")];
+            tensor<int32, [4]> var_259_begin_0 = const()[name = tensor<string, []>("op_259_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_259_end_0 = const()[name = tensor<string, []>("op_259_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_259_end_mask_0 = const()[name = tensor<string, []>("op_259_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_259_cast_fp16 = slice_by_index(begin = var_259_begin_0, end = var_259_end_0, end_mask = var_259_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_259_cast_fp16")];
+            tensor<int32, [4]> var_263_begin_0 = const()[name = tensor<string, []>("op_263_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_263_end_0 = const()[name = tensor<string, []>("op_263_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_263_end_mask_0 = const()[name = tensor<string, []>("op_263_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_263_cast_fp16 = slice_by_index(begin = var_263_begin_0, end = var_263_end_0, end_mask = var_263_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_263_cast_fp16")];
+            tensor<int32, [4]> var_267_begin_0 = const()[name = tensor<string, []>("op_267_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_267_end_0 = const()[name = tensor<string, []>("op_267_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_267_end_mask_0 = const()[name = tensor<string, []>("op_267_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_267_cast_fp16 = slice_by_index(begin = var_267_begin_0, end = var_267_end_0, end_mask = var_267_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_267_cast_fp16")];
+            tensor<int32, [4]> var_271_begin_0 = const()[name = tensor<string, []>("op_271_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_271_end_0 = const()[name = tensor<string, []>("op_271_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_271_end_mask_0 = const()[name = tensor<string, []>("op_271_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_271_cast_fp16 = slice_by_index(begin = var_271_begin_0, end = var_271_end_0, end_mask = var_271_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_271_cast_fp16")];
+            tensor<int32, [4]> var_275_begin_0 = const()[name = tensor<string, []>("op_275_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_275_end_0 = const()[name = tensor<string, []>("op_275_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_275_end_mask_0 = const()[name = tensor<string, []>("op_275_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_275_cast_fp16 = slice_by_index(begin = var_275_begin_0, end = var_275_end_0, end_mask = var_275_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_275_cast_fp16")];
+            tensor<int32, [4]> var_279_begin_0 = const()[name = tensor<string, []>("op_279_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_279_end_0 = const()[name = tensor<string, []>("op_279_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_279_end_mask_0 = const()[name = tensor<string, []>("op_279_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_279_cast_fp16 = slice_by_index(begin = var_279_begin_0, end = var_279_end_0, end_mask = var_279_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_279_cast_fp16")];
+            tensor<int32, [4]> var_283_begin_0 = const()[name = tensor<string, []>("op_283_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_283_end_0 = const()[name = tensor<string, []>("op_283_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_283_end_mask_0 = const()[name = tensor<string, []>("op_283_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_283_cast_fp16 = slice_by_index(begin = var_283_begin_0, end = var_283_end_0, end_mask = var_283_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_283_cast_fp16")];
+            tensor<int32, [4]> var_289_begin_0 = const()[name = tensor<string, []>("op_289_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_289_end_0 = const()[name = tensor<string, []>("op_289_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_289_end_mask_0 = const()[name = tensor<string, []>("op_289_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_289_cast_fp16 = slice_by_index(begin = var_289_begin_0, end = var_289_end_0, end_mask = var_289_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_289_cast_fp16")];
+            tensor<int32, [4]> var_301_begin_0 = const()[name = tensor<string, []>("op_301_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_301_end_0 = const()[name = tensor<string, []>("op_301_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_301_end_mask_0 = const()[name = tensor<string, []>("op_301_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_301_cast_fp16 = slice_by_index(begin = var_301_begin_0, end = var_301_end_0, end_mask = var_301_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_301_cast_fp16")];
+            tensor<int32, [4]> var_313_begin_0 = const()[name = tensor<string, []>("op_313_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_313_end_0 = const()[name = tensor<string, []>("op_313_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_313_end_mask_0 = const()[name = tensor<string, []>("op_313_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_313_cast_fp16 = slice_by_index(begin = var_313_begin_0, end = var_313_end_0, end_mask = var_313_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_313_cast_fp16")];
+            tensor<int32, [4]> var_325_begin_0 = const()[name = tensor<string, []>("op_325_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_325_end_0 = const()[name = tensor<string, []>("op_325_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_325_end_mask_0 = const()[name = tensor<string, []>("op_325_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_325_cast_fp16 = slice_by_index(begin = var_325_begin_0, end = var_325_end_0, end_mask = var_325_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_325_cast_fp16")];
+            tensor<int32, [4]> var_337_begin_0 = const()[name = tensor<string, []>("op_337_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_337_end_0 = const()[name = tensor<string, []>("op_337_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_337_end_mask_0 = const()[name = tensor<string, []>("op_337_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_337_cast_fp16 = slice_by_index(begin = var_337_begin_0, end = var_337_end_0, end_mask = var_337_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_337_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = tensor<string, []>("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = tensor<string, []>("op_349_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = tensor<string, []>("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_349_cast_fp16")];
+            tensor<int32, [4]> var_361_begin_0 = const()[name = tensor<string, []>("op_361_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_361_end_0 = const()[name = tensor<string, []>("op_361_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_361_end_mask_0 = const()[name = tensor<string, []>("op_361_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_361_cast_fp16 = slice_by_index(begin = var_361_begin_0, end = var_361_end_0, end_mask = var_361_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_361_cast_fp16")];
+            tensor<int32, [4]> var_373_begin_0 = const()[name = tensor<string, []>("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_373_end_0 = const()[name = tensor<string, []>("op_373_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_373_end_mask_0 = const()[name = tensor<string, []>("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_373_cast_fp16")];
+            tensor<int32, [4]> var_383_begin_0 = const()[name = tensor<string, []>("op_383_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_383_end_0 = const()[name = tensor<string, []>("op_383_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_383_end_mask_0 = const()[name = tensor<string, []>("op_383_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_383_cast_fp16 = slice_by_index(begin = var_383_begin_0, end = var_383_end_0, end_mask = var_383_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_383_cast_fp16")];
+            tensor<int32, [4]> var_395_begin_0 = const()[name = tensor<string, []>("op_395_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_395_end_0 = const()[name = tensor<string, []>("op_395_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_395_end_mask_0 = const()[name = tensor<string, []>("op_395_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_395_cast_fp16 = slice_by_index(begin = var_395_begin_0, end = var_395_end_0, end_mask = var_395_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_395_cast_fp16")];
+            tensor<int32, [4]> var_407_begin_0 = const()[name = tensor<string, []>("op_407_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_407_end_0 = const()[name = tensor<string, []>("op_407_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_407_end_mask_0 = const()[name = tensor<string, []>("op_407_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_407_cast_fp16 = slice_by_index(begin = var_407_begin_0, end = var_407_end_0, end_mask = var_407_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_407_cast_fp16")];
+            tensor<int32, [4]> var_419_begin_0 = const()[name = tensor<string, []>("op_419_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_419_end_0 = const()[name = tensor<string, []>("op_419_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_419_end_mask_0 = const()[name = tensor<string, []>("op_419_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_419_cast_fp16 = slice_by_index(begin = var_419_begin_0, end = var_419_end_0, end_mask = var_419_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_419_cast_fp16")];
+            tensor<int32, [4]> var_431_begin_0 = const()[name = tensor<string, []>("op_431_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_431_end_0 = const()[name = tensor<string, []>("op_431_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_431_end_mask_0 = const()[name = tensor<string, []>("op_431_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_431_cast_fp16 = slice_by_index(begin = var_431_begin_0, end = var_431_end_0, end_mask = var_431_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_431_cast_fp16")];
+            tensor<int32, [4]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_443_cast_fp16 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_443_cast_fp16")];
+            tensor<int32, [4]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_455_cast_fp16 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_455_cast_fp16")];
+            tensor<int32, [4]> var_467_begin_0 = const()[name = tensor<string, []>("op_467_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_467_end_0 = const()[name = tensor<string, []>("op_467_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_467_end_mask_0 = const()[name = tensor<string, []>("op_467_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_467_cast_fp16 = slice_by_index(begin = var_467_begin_0, end = var_467_end_0, end_mask = var_467_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_467_cast_fp16")];
+            tensor<string, []> var_479_equation_0 = const()[name = tensor<string, []>("op_479_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_479_cast_fp16 = einsum(equation = var_479_equation_0, values = (var_289_cast_fp16, var_191_cast_fp16))[name = tensor<string, []>("op_479_cast_fp16")];
+            tensor<fp16, []> var_480_to_fp16 = const()[name = tensor<string, []>("op_480_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_481_cast_fp16 = mul(x = var_479_cast_fp16, y = var_480_to_fp16)[name = tensor<string, []>("op_481_cast_fp16")];
+            tensor<string, []> var_483_equation_0 = const()[name = tensor<string, []>("op_483_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_483_cast_fp16 = einsum(equation = var_483_equation_0, values = (var_289_cast_fp16, var_195_cast_fp16))[name = tensor<string, []>("op_483_cast_fp16")];
+            tensor<fp16, []> var_484_to_fp16 = const()[name = tensor<string, []>("op_484_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_485_cast_fp16 = mul(x = var_483_cast_fp16, y = var_484_to_fp16)[name = tensor<string, []>("op_485_cast_fp16")];
+            tensor<string, []> var_487_equation_0 = const()[name = tensor<string, []>("op_487_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_487_cast_fp16 = einsum(equation = var_487_equation_0, values = (var_289_cast_fp16, var_199_cast_fp16))[name = tensor<string, []>("op_487_cast_fp16")];
+            tensor<fp16, []> var_488_to_fp16 = const()[name = tensor<string, []>("op_488_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_489_cast_fp16 = mul(x = var_487_cast_fp16, y = var_488_to_fp16)[name = tensor<string, []>("op_489_cast_fp16")];
+            tensor<string, []> var_491_equation_0 = const()[name = tensor<string, []>("op_491_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_491_cast_fp16 = einsum(equation = var_491_equation_0, values = (var_301_cast_fp16, var_203_cast_fp16))[name = tensor<string, []>("op_491_cast_fp16")];
+            tensor<fp16, []> var_492_to_fp16 = const()[name = tensor<string, []>("op_492_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_493_cast_fp16 = mul(x = var_491_cast_fp16, y = var_492_to_fp16)[name = tensor<string, []>("op_493_cast_fp16")];
+            tensor<string, []> var_495_equation_0 = const()[name = tensor<string, []>("op_495_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_495_cast_fp16 = einsum(equation = var_495_equation_0, values = (var_301_cast_fp16, var_207_cast_fp16))[name = tensor<string, []>("op_495_cast_fp16")];
+            tensor<fp16, []> var_496_to_fp16 = const()[name = tensor<string, []>("op_496_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_497_cast_fp16 = mul(x = var_495_cast_fp16, y = var_496_to_fp16)[name = tensor<string, []>("op_497_cast_fp16")];
+            tensor<string, []> var_499_equation_0 = const()[name = tensor<string, []>("op_499_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_499_cast_fp16 = einsum(equation = var_499_equation_0, values = (var_301_cast_fp16, var_211_cast_fp16))[name = tensor<string, []>("op_499_cast_fp16")];
+            tensor<fp16, []> var_500_to_fp16 = const()[name = tensor<string, []>("op_500_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_501_cast_fp16 = mul(x = var_499_cast_fp16, y = var_500_to_fp16)[name = tensor<string, []>("op_501_cast_fp16")];
+            tensor<string, []> var_503_equation_0 = const()[name = tensor<string, []>("op_503_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_503_cast_fp16 = einsum(equation = var_503_equation_0, values = (var_313_cast_fp16, var_215_cast_fp16))[name = tensor<string, []>("op_503_cast_fp16")];
+            tensor<fp16, []> var_504_to_fp16 = const()[name = tensor<string, []>("op_504_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_505_cast_fp16 = mul(x = var_503_cast_fp16, y = var_504_to_fp16)[name = tensor<string, []>("op_505_cast_fp16")];
+            tensor<string, []> var_507_equation_0 = const()[name = tensor<string, []>("op_507_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_507_cast_fp16 = einsum(equation = var_507_equation_0, values = (var_313_cast_fp16, var_219_cast_fp16))[name = tensor<string, []>("op_507_cast_fp16")];
+            tensor<fp16, []> var_508_to_fp16 = const()[name = tensor<string, []>("op_508_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_509_cast_fp16 = mul(x = var_507_cast_fp16, y = var_508_to_fp16)[name = tensor<string, []>("op_509_cast_fp16")];
+            tensor<string, []> var_511_equation_0 = const()[name = tensor<string, []>("op_511_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_511_cast_fp16 = einsum(equation = var_511_equation_0, values = (var_313_cast_fp16, var_223_cast_fp16))[name = tensor<string, []>("op_511_cast_fp16")];
+            tensor<fp16, []> var_512_to_fp16 = const()[name = tensor<string, []>("op_512_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_513_cast_fp16 = mul(x = var_511_cast_fp16, y = var_512_to_fp16)[name = tensor<string, []>("op_513_cast_fp16")];
+            tensor<string, []> var_515_equation_0 = const()[name = tensor<string, []>("op_515_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_515_cast_fp16 = einsum(equation = var_515_equation_0, values = (var_325_cast_fp16, var_227_cast_fp16))[name = tensor<string, []>("op_515_cast_fp16")];
+            tensor<fp16, []> var_516_to_fp16 = const()[name = tensor<string, []>("op_516_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_517_cast_fp16 = mul(x = var_515_cast_fp16, y = var_516_to_fp16)[name = tensor<string, []>("op_517_cast_fp16")];
+            tensor<string, []> var_519_equation_0 = const()[name = tensor<string, []>("op_519_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_519_cast_fp16 = einsum(equation = var_519_equation_0, values = (var_325_cast_fp16, var_231_cast_fp16))[name = tensor<string, []>("op_519_cast_fp16")];
+            tensor<fp16, []> var_520_to_fp16 = const()[name = tensor<string, []>("op_520_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_521_cast_fp16 = mul(x = var_519_cast_fp16, y = var_520_to_fp16)[name = tensor<string, []>("op_521_cast_fp16")];
+            tensor<string, []> var_523_equation_0 = const()[name = tensor<string, []>("op_523_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_523_cast_fp16 = einsum(equation = var_523_equation_0, values = (var_325_cast_fp16, var_235_cast_fp16))[name = tensor<string, []>("op_523_cast_fp16")];
+            tensor<fp16, []> var_524_to_fp16 = const()[name = tensor<string, []>("op_524_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_525_cast_fp16 = mul(x = var_523_cast_fp16, y = var_524_to_fp16)[name = tensor<string, []>("op_525_cast_fp16")];
+            tensor<string, []> var_527_equation_0 = const()[name = tensor<string, []>("op_527_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_527_cast_fp16 = einsum(equation = var_527_equation_0, values = (var_337_cast_fp16, var_239_cast_fp16))[name = tensor<string, []>("op_527_cast_fp16")];
+            tensor<fp16, []> var_528_to_fp16 = const()[name = tensor<string, []>("op_528_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_529_cast_fp16 = mul(x = var_527_cast_fp16, y = var_528_to_fp16)[name = tensor<string, []>("op_529_cast_fp16")];
+            tensor<string, []> var_531_equation_0 = const()[name = tensor<string, []>("op_531_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_531_cast_fp16 = einsum(equation = var_531_equation_0, values = (var_337_cast_fp16, var_243_cast_fp16))[name = tensor<string, []>("op_531_cast_fp16")];
+            tensor<fp16, []> var_532_to_fp16 = const()[name = tensor<string, []>("op_532_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_533_cast_fp16 = mul(x = var_531_cast_fp16, y = var_532_to_fp16)[name = tensor<string, []>("op_533_cast_fp16")];
+            tensor<string, []> var_535_equation_0 = const()[name = tensor<string, []>("op_535_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_535_cast_fp16 = einsum(equation = var_535_equation_0, values = (var_337_cast_fp16, var_247_cast_fp16))[name = tensor<string, []>("op_535_cast_fp16")];
+            tensor<fp16, []> var_536_to_fp16 = const()[name = tensor<string, []>("op_536_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_537_cast_fp16 = mul(x = var_535_cast_fp16, y = var_536_to_fp16)[name = tensor<string, []>("op_537_cast_fp16")];
+            tensor<string, []> var_539_equation_0 = const()[name = tensor<string, []>("op_539_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_539_cast_fp16 = einsum(equation = var_539_equation_0, values = (var_349_cast_fp16, var_251_cast_fp16))[name = tensor<string, []>("op_539_cast_fp16")];
+            tensor<fp16, []> var_540_to_fp16 = const()[name = tensor<string, []>("op_540_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_541_cast_fp16 = mul(x = var_539_cast_fp16, y = var_540_to_fp16)[name = tensor<string, []>("op_541_cast_fp16")];
+            tensor<string, []> var_543_equation_0 = const()[name = tensor<string, []>("op_543_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_543_cast_fp16 = einsum(equation = var_543_equation_0, values = (var_349_cast_fp16, var_255_cast_fp16))[name = tensor<string, []>("op_543_cast_fp16")];
+            tensor<fp16, []> var_544_to_fp16 = const()[name = tensor<string, []>("op_544_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_545_cast_fp16 = mul(x = var_543_cast_fp16, y = var_544_to_fp16)[name = tensor<string, []>("op_545_cast_fp16")];
+            tensor<string, []> var_547_equation_0 = const()[name = tensor<string, []>("op_547_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_547_cast_fp16 = einsum(equation = var_547_equation_0, values = (var_349_cast_fp16, var_259_cast_fp16))[name = tensor<string, []>("op_547_cast_fp16")];
+            tensor<fp16, []> var_548_to_fp16 = const()[name = tensor<string, []>("op_548_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_549_cast_fp16 = mul(x = var_547_cast_fp16, y = var_548_to_fp16)[name = tensor<string, []>("op_549_cast_fp16")];
+            tensor<string, []> var_551_equation_0 = const()[name = tensor<string, []>("op_551_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_551_cast_fp16 = einsum(equation = var_551_equation_0, values = (var_361_cast_fp16, var_263_cast_fp16))[name = tensor<string, []>("op_551_cast_fp16")];
+            tensor<fp16, []> var_552_to_fp16 = const()[name = tensor<string, []>("op_552_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_553_cast_fp16 = mul(x = var_551_cast_fp16, y = var_552_to_fp16)[name = tensor<string, []>("op_553_cast_fp16")];
+            tensor<string, []> var_555_equation_0 = const()[name = tensor<string, []>("op_555_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_555_cast_fp16 = einsum(equation = var_555_equation_0, values = (var_361_cast_fp16, var_267_cast_fp16))[name = tensor<string, []>("op_555_cast_fp16")];
+            tensor<fp16, []> var_556_to_fp16 = const()[name = tensor<string, []>("op_556_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_557_cast_fp16 = mul(x = var_555_cast_fp16, y = var_556_to_fp16)[name = tensor<string, []>("op_557_cast_fp16")];
+            tensor<string, []> var_559_equation_0 = const()[name = tensor<string, []>("op_559_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_559_cast_fp16 = einsum(equation = var_559_equation_0, values = (var_361_cast_fp16, var_271_cast_fp16))[name = tensor<string, []>("op_559_cast_fp16")];
+            tensor<fp16, []> var_560_to_fp16 = const()[name = tensor<string, []>("op_560_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_561_cast_fp16 = mul(x = var_559_cast_fp16, y = var_560_to_fp16)[name = tensor<string, []>("op_561_cast_fp16")];
+            tensor<string, []> var_563_equation_0 = const()[name = tensor<string, []>("op_563_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_563_cast_fp16 = einsum(equation = var_563_equation_0, values = (var_373_cast_fp16, var_275_cast_fp16))[name = tensor<string, []>("op_563_cast_fp16")];
+            tensor<fp16, []> var_564_to_fp16 = const()[name = tensor<string, []>("op_564_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_565_cast_fp16 = mul(x = var_563_cast_fp16, y = var_564_to_fp16)[name = tensor<string, []>("op_565_cast_fp16")];
+            tensor<string, []> var_567_equation_0 = const()[name = tensor<string, []>("op_567_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_567_cast_fp16 = einsum(equation = var_567_equation_0, values = (var_373_cast_fp16, var_279_cast_fp16))[name = tensor<string, []>("op_567_cast_fp16")];
+            tensor<fp16, []> var_568_to_fp16 = const()[name = tensor<string, []>("op_568_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_569_cast_fp16 = mul(x = var_567_cast_fp16, y = var_568_to_fp16)[name = tensor<string, []>("op_569_cast_fp16")];
+            tensor<string, []> var_571_equation_0 = const()[name = tensor<string, []>("op_571_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_571_cast_fp16 = einsum(equation = var_571_equation_0, values = (var_373_cast_fp16, var_283_cast_fp16))[name = tensor<string, []>("op_571_cast_fp16")];
+            tensor<fp16, []> var_572_to_fp16 = const()[name = tensor<string, []>("op_572_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_573_cast_fp16 = mul(x = var_571_cast_fp16, y = var_572_to_fp16)[name = tensor<string, []>("op_573_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_1_cast_fp16 = add(x = var_481_cast_fp16, y = mask)[name = tensor<string, []>("aw_1_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_3_cast_fp16 = add(x = var_485_cast_fp16, y = mask)[name = tensor<string, []>("aw_3_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_5_cast_fp16 = add(x = var_489_cast_fp16, y = mask)[name = tensor<string, []>("aw_5_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_7_cast_fp16 = add(x = var_493_cast_fp16, y = mask)[name = tensor<string, []>("aw_7_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_9_cast_fp16 = add(x = var_497_cast_fp16, y = mask)[name = tensor<string, []>("aw_9_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_11_cast_fp16 = add(x = var_501_cast_fp16, y = mask)[name = tensor<string, []>("aw_11_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_13_cast_fp16 = add(x = var_505_cast_fp16, y = mask)[name = tensor<string, []>("aw_13_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_15_cast_fp16 = add(x = var_509_cast_fp16, y = mask)[name = tensor<string, []>("aw_15_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_17_cast_fp16 = add(x = var_513_cast_fp16, y = mask)[name = tensor<string, []>("aw_17_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_19_cast_fp16 = add(x = var_517_cast_fp16, y = mask)[name = tensor<string, []>("aw_19_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_21_cast_fp16 = add(x = var_521_cast_fp16, y = mask)[name = tensor<string, []>("aw_21_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_23_cast_fp16 = add(x = var_525_cast_fp16, y = mask)[name = tensor<string, []>("aw_23_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_25_cast_fp16 = add(x = var_529_cast_fp16, y = mask)[name = tensor<string, []>("aw_25_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_27_cast_fp16 = add(x = var_533_cast_fp16, y = mask)[name = tensor<string, []>("aw_27_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_29_cast_fp16 = add(x = var_537_cast_fp16, y = mask)[name = tensor<string, []>("aw_29_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_31_cast_fp16 = add(x = var_541_cast_fp16, y = mask)[name = tensor<string, []>("aw_31_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_33_cast_fp16 = add(x = var_545_cast_fp16, y = mask)[name = tensor<string, []>("aw_33_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_35_cast_fp16 = add(x = var_549_cast_fp16, y = mask)[name = tensor<string, []>("aw_35_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_37_cast_fp16 = add(x = var_553_cast_fp16, y = mask)[name = tensor<string, []>("aw_37_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_39_cast_fp16 = add(x = var_557_cast_fp16, y = mask)[name = tensor<string, []>("aw_39_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_41_cast_fp16 = add(x = var_561_cast_fp16, y = mask)[name = tensor<string, []>("aw_41_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_43_cast_fp16 = add(x = var_565_cast_fp16, y = mask)[name = tensor<string, []>("aw_43_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_45_cast_fp16 = add(x = var_569_cast_fp16, y = mask)[name = tensor<string, []>("aw_45_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_47_cast_fp16 = add(x = var_573_cast_fp16, y = mask)[name = tensor<string, []>("aw_47_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_598_cast_fp16 = softmax(axis = var_52, x = aw_1_cast_fp16)[name = tensor<string, []>("op_598_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_599_cast_fp16 = softmax(axis = var_52, x = aw_3_cast_fp16)[name = tensor<string, []>("op_599_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_600_cast_fp16 = softmax(axis = var_52, x = aw_5_cast_fp16)[name = tensor<string, []>("op_600_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_601_cast_fp16 = softmax(axis = var_52, x = aw_7_cast_fp16)[name = tensor<string, []>("op_601_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_602_cast_fp16 = softmax(axis = var_52, x = aw_9_cast_fp16)[name = tensor<string, []>("op_602_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_603_cast_fp16 = softmax(axis = var_52, x = aw_11_cast_fp16)[name = tensor<string, []>("op_603_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_604_cast_fp16 = softmax(axis = var_52, x = aw_13_cast_fp16)[name = tensor<string, []>("op_604_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_605_cast_fp16 = softmax(axis = var_52, x = aw_15_cast_fp16)[name = tensor<string, []>("op_605_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_606_cast_fp16 = softmax(axis = var_52, x = aw_17_cast_fp16)[name = tensor<string, []>("op_606_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_607_cast_fp16 = softmax(axis = var_52, x = aw_19_cast_fp16)[name = tensor<string, []>("op_607_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_608_cast_fp16 = softmax(axis = var_52, x = aw_21_cast_fp16)[name = tensor<string, []>("op_608_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_609_cast_fp16 = softmax(axis = var_52, x = aw_23_cast_fp16)[name = tensor<string, []>("op_609_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_610_cast_fp16 = softmax(axis = var_52, x = aw_25_cast_fp16)[name = tensor<string, []>("op_610_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_611_cast_fp16 = softmax(axis = var_52, x = aw_27_cast_fp16)[name = tensor<string, []>("op_611_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_612_cast_fp16 = softmax(axis = var_52, x = aw_29_cast_fp16)[name = tensor<string, []>("op_612_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_613_cast_fp16 = softmax(axis = var_52, x = aw_31_cast_fp16)[name = tensor<string, []>("op_613_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_614_cast_fp16 = softmax(axis = var_52, x = aw_33_cast_fp16)[name = tensor<string, []>("op_614_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_615_cast_fp16 = softmax(axis = var_52, x = aw_35_cast_fp16)[name = tensor<string, []>("op_615_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_616_cast_fp16 = softmax(axis = var_52, x = aw_37_cast_fp16)[name = tensor<string, []>("op_616_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_617_cast_fp16 = softmax(axis = var_52, x = aw_39_cast_fp16)[name = tensor<string, []>("op_617_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_618_cast_fp16 = softmax(axis = var_52, x = aw_41_cast_fp16)[name = tensor<string, []>("op_618_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_619_cast_fp16 = softmax(axis = var_52, x = aw_43_cast_fp16)[name = tensor<string, []>("op_619_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_620_cast_fp16 = softmax(axis = var_52, x = aw_45_cast_fp16)[name = tensor<string, []>("op_620_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_621_cast_fp16 = softmax(axis = var_52, x = aw_47_cast_fp16)[name = tensor<string, []>("op_621_cast_fp16")];
+            tensor<string, []> var_623_equation_0 = const()[name = tensor<string, []>("op_623_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_623_cast_fp16 = einsum(equation = var_623_equation_0, values = (var_383_cast_fp16, var_598_cast_fp16))[name = tensor<string, []>("op_623_cast_fp16")];
+            tensor<string, []> var_625_equation_0 = const()[name = tensor<string, []>("op_625_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_625_cast_fp16 = einsum(equation = var_625_equation_0, values = (var_383_cast_fp16, var_599_cast_fp16))[name = tensor<string, []>("op_625_cast_fp16")];
+            tensor<string, []> var_627_equation_0 = const()[name = tensor<string, []>("op_627_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_627_cast_fp16 = einsum(equation = var_627_equation_0, values = (var_383_cast_fp16, var_600_cast_fp16))[name = tensor<string, []>("op_627_cast_fp16")];
+            tensor<string, []> var_629_equation_0 = const()[name = tensor<string, []>("op_629_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_629_cast_fp16 = einsum(equation = var_629_equation_0, values = (var_395_cast_fp16, var_601_cast_fp16))[name = tensor<string, []>("op_629_cast_fp16")];
+            tensor<string, []> var_631_equation_0 = const()[name = tensor<string, []>("op_631_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_631_cast_fp16 = einsum(equation = var_631_equation_0, values = (var_395_cast_fp16, var_602_cast_fp16))[name = tensor<string, []>("op_631_cast_fp16")];
+            tensor<string, []> var_633_equation_0 = const()[name = tensor<string, []>("op_633_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_633_cast_fp16 = einsum(equation = var_633_equation_0, values = (var_395_cast_fp16, var_603_cast_fp16))[name = tensor<string, []>("op_633_cast_fp16")];
+            tensor<string, []> var_635_equation_0 = const()[name = tensor<string, []>("op_635_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_635_cast_fp16 = einsum(equation = var_635_equation_0, values = (var_407_cast_fp16, var_604_cast_fp16))[name = tensor<string, []>("op_635_cast_fp16")];
+            tensor<string, []> var_637_equation_0 = const()[name = tensor<string, []>("op_637_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_637_cast_fp16 = einsum(equation = var_637_equation_0, values = (var_407_cast_fp16, var_605_cast_fp16))[name = tensor<string, []>("op_637_cast_fp16")];
+            tensor<string, []> var_639_equation_0 = const()[name = tensor<string, []>("op_639_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_639_cast_fp16 = einsum(equation = var_639_equation_0, values = (var_407_cast_fp16, var_606_cast_fp16))[name = tensor<string, []>("op_639_cast_fp16")];
+            tensor<string, []> var_641_equation_0 = const()[name = tensor<string, []>("op_641_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_641_cast_fp16 = einsum(equation = var_641_equation_0, values = (var_419_cast_fp16, var_607_cast_fp16))[name = tensor<string, []>("op_641_cast_fp16")];
+            tensor<string, []> var_643_equation_0 = const()[name = tensor<string, []>("op_643_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_643_cast_fp16 = einsum(equation = var_643_equation_0, values = (var_419_cast_fp16, var_608_cast_fp16))[name = tensor<string, []>("op_643_cast_fp16")];
+            tensor<string, []> var_645_equation_0 = const()[name = tensor<string, []>("op_645_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_645_cast_fp16 = einsum(equation = var_645_equation_0, values = (var_419_cast_fp16, var_609_cast_fp16))[name = tensor<string, []>("op_645_cast_fp16")];
+            tensor<string, []> var_647_equation_0 = const()[name = tensor<string, []>("op_647_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_647_cast_fp16 = einsum(equation = var_647_equation_0, values = (var_431_cast_fp16, var_610_cast_fp16))[name = tensor<string, []>("op_647_cast_fp16")];
+            tensor<string, []> var_649_equation_0 = const()[name = tensor<string, []>("op_649_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_649_cast_fp16 = einsum(equation = var_649_equation_0, values = (var_431_cast_fp16, var_611_cast_fp16))[name = tensor<string, []>("op_649_cast_fp16")];
+            tensor<string, []> var_651_equation_0 = const()[name = tensor<string, []>("op_651_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_651_cast_fp16 = einsum(equation = var_651_equation_0, values = (var_431_cast_fp16, var_612_cast_fp16))[name = tensor<string, []>("op_651_cast_fp16")];
+            tensor<string, []> var_653_equation_0 = const()[name = tensor<string, []>("op_653_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_653_cast_fp16 = einsum(equation = var_653_equation_0, values = (var_443_cast_fp16, var_613_cast_fp16))[name = tensor<string, []>("op_653_cast_fp16")];
+            tensor<string, []> var_655_equation_0 = const()[name = tensor<string, []>("op_655_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_655_cast_fp16 = einsum(equation = var_655_equation_0, values = (var_443_cast_fp16, var_614_cast_fp16))[name = tensor<string, []>("op_655_cast_fp16")];
+            tensor<string, []> var_657_equation_0 = const()[name = tensor<string, []>("op_657_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_657_cast_fp16 = einsum(equation = var_657_equation_0, values = (var_443_cast_fp16, var_615_cast_fp16))[name = tensor<string, []>("op_657_cast_fp16")];
+            tensor<string, []> var_659_equation_0 = const()[name = tensor<string, []>("op_659_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_659_cast_fp16 = einsum(equation = var_659_equation_0, values = (var_455_cast_fp16, var_616_cast_fp16))[name = tensor<string, []>("op_659_cast_fp16")];
+            tensor<string, []> var_661_equation_0 = const()[name = tensor<string, []>("op_661_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_661_cast_fp16 = einsum(equation = var_661_equation_0, values = (var_455_cast_fp16, var_617_cast_fp16))[name = tensor<string, []>("op_661_cast_fp16")];
+            tensor<string, []> var_663_equation_0 = const()[name = tensor<string, []>("op_663_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_663_cast_fp16 = einsum(equation = var_663_equation_0, values = (var_455_cast_fp16, var_618_cast_fp16))[name = tensor<string, []>("op_663_cast_fp16")];
+            tensor<string, []> var_665_equation_0 = const()[name = tensor<string, []>("op_665_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_665_cast_fp16 = einsum(equation = var_665_equation_0, values = (var_467_cast_fp16, var_619_cast_fp16))[name = tensor<string, []>("op_665_cast_fp16")];
+            tensor<string, []> var_667_equation_0 = const()[name = tensor<string, []>("op_667_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_667_cast_fp16 = einsum(equation = var_667_equation_0, values = (var_467_cast_fp16, var_620_cast_fp16))[name = tensor<string, []>("op_667_cast_fp16")];
+            tensor<string, []> var_669_equation_0 = const()[name = tensor<string, []>("op_669_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_669_cast_fp16 = einsum(equation = var_669_equation_0, values = (var_467_cast_fp16, var_621_cast_fp16))[name = tensor<string, []>("op_669_cast_fp16")];
+            tensor<bool, []> x_11_interleave_0 = const()[name = tensor<string, []>("x_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_11_cast_fp16 = concat(axis = var_52, interleave = x_11_interleave_0, values = (var_623_cast_fp16, var_625_cast_fp16, var_627_cast_fp16, var_629_cast_fp16, var_631_cast_fp16, var_633_cast_fp16, var_635_cast_fp16, var_637_cast_fp16, var_639_cast_fp16, var_641_cast_fp16, var_643_cast_fp16, var_645_cast_fp16, var_647_cast_fp16, var_649_cast_fp16, var_651_cast_fp16, var_653_cast_fp16, var_655_cast_fp16, var_657_cast_fp16, var_659_cast_fp16, var_661_cast_fp16, var_663_cast_fp16, var_665_cast_fp16, var_667_cast_fp16, var_669_cast_fp16))[name = tensor<string, []>("x_11_cast_fp16")];
+            tensor<int32, [4]> var_674 = const()[name = tensor<string, []>("op_674"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_3_cast_fp16 = reshape(shape = var_674, x = x_11_cast_fp16)[name = tensor<string, []>("input_3_cast_fp16")];
+            tensor<int32, [2]> var_677 = const()[name = tensor<string, []>("op_677"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_679 = const()[name = tensor<string, []>("op_679"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_1_pad_type_0 = const()[name = tensor<string, []>("attention_output_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_1_pad_0 = const()[name = tensor<string, []>("attention_output_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31463936)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_1_cast_fp16 = conv(dilations = var_679, groups = var_52, pad = attention_output_1_pad_0, pad_type = attention_output_1_pad_type_0, strides = var_677, weight = blocks_0_attn_proj_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_13_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = tensor<string, []>("x_13_cast_fp16")];
+            tensor<bool, []> x_eps_3_interleave_0 = const()[name = tensor<string, []>("x_eps_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_3_to_fp16 = const()[name = tensor<string, []>("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338368)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_3_cast_fp16 = concat(axis = var_52, interleave = x_eps_3_interleave_0, values = (x_13_cast_fp16, eps_chan_3_to_fp16))[name = tensor<string, []>("x_eps_3_cast_fp16")];
+            tensor<int32, [1]> norm_x_3_axes_0 = const()[name = tensor<string, []>("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_55, x = x_eps_3_cast_fp16)[name = tensor<string, []>("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_7_cast_fp16 = real_div(x = x_13_cast_fp16, y = norm_x_3_cast_fp16)[name = tensor<string, []>("x_normed_7_cast_fp16")];
+            tensor<fp16, []> var_705_to_fp16 = const()[name = tensor<string, []>("op_705_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_705_to_fp16)[name = tensor<string, []>("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338560)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_5_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
+            tensor<int32, [2]> var_716 = const()[name = tensor<string, []>("op_716"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_718 = const()[name = tensor<string, []>("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_7_pad_type_0 = const()[name = tensor<string, []>("input_7_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_7_pad_0 = const()[name = tensor<string, []>("input_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50344768)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_7_cast_fp16 = conv(dilations = var_718, groups = var_52, pad = input_7_pad_0, pad_type = input_7_pad_type_0, strides = var_716, weight = blocks_0_mlp_fc_1_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
+            tensor<int32, [2]> var_722 = const()[name = tensor<string, []>("op_722"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_724 = const()[name = tensor<string, []>("op_724"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_1_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_1_pad_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(100676480)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_1_cast_fp16 = conv(dilations = var_724, groups = var_52, pad = x_fc_2_1_pad_0, pad_type = x_fc_2_1_pad_type_0, strides = var_722, weight = blocks_0_mlp_fc_2_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_727_cast_fp16 = silu(x = input_7_cast_fp16)[name = tensor<string, []>("op_727_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_9_cast_fp16 = mul(x = var_727_cast_fp16, y = x_fc_2_1_cast_fp16)[name = tensor<string, []>("input_9_cast_fp16")];
+            tensor<int32, [2]> var_730 = const()[name = tensor<string, []>("op_730"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_732 = const()[name = tensor<string, []>("op_732"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_734_pad_type_0 = const()[name = tensor<string, []>("op_734_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_734_pad_0 = const()[name = tensor<string, []>("op_734_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_0_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151008192)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_734_cast_fp16 = conv(dilations = var_732, groups = var_52, pad = var_734_pad_0, pad_type = var_734_pad_type_0, strides = var_730, weight = blocks_0_mlp_proj_weight_to_fp16, x = input_9_cast_fp16)[name = tensor<string, []>("op_734_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_17_cast_fp16 = add(x = var_734_cast_fp16, y = x_13_cast_fp16)[name = tensor<string, []>("x_17_cast_fp16")];
+            tensor<int32, []> var_740 = const()[name = tensor<string, []>("op_740"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_744 = const()[name = tensor<string, []>("op_744"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_746 = const()[name = tensor<string, []>("op_746"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_779 = const()[name = tensor<string, []>("op_779"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_782 = const()[name = tensor<string, []>("op_782"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_5_interleave_0 = const()[name = tensor<string, []>("x_eps_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_5_to_fp16 = const()[name = tensor<string, []>("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201339904)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_5_cast_fp16 = concat(axis = var_779, interleave = x_eps_5_interleave_0, values = (x_17_cast_fp16, eps_chan_5_to_fp16))[name = tensor<string, []>("x_eps_5_cast_fp16")];
+            tensor<int32, [1]> norm_x_5_axes_0 = const()[name = tensor<string, []>("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_782, x = x_eps_5_cast_fp16)[name = tensor<string, []>("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_13_cast_fp16 = real_div(x = x_17_cast_fp16, y = norm_x_5_cast_fp16)[name = tensor<string, []>("x_normed_13_cast_fp16")];
+            tensor<fp16, []> var_805_to_fp16 = const()[name = tensor<string, []>("op_805_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_805_to_fp16)[name = tensor<string, []>("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201340096)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_21_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = tensor<string, []>("x_21_cast_fp16")];
+            tensor<int32, [4]> var_829 = const()[name = tensor<string, []>("op_829"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_11_cast_fp16 = reshape(shape = var_829, x = x_21_cast_fp16)[name = tensor<string, []>("input_11_cast_fp16")];
+            tensor<int32, [2]> var_832 = const()[name = tensor<string, []>("op_832"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_834 = const()[name = tensor<string, []>("op_834"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_9_pad_type_0 = const()[name = tensor<string, []>("q_9_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_9_pad_0 = const()[name = tensor<string, []>("q_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201346304)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_9_cast_fp16 = conv(dilations = var_834, groups = var_779, pad = q_9_pad_0, pad_type = q_9_pad_type_0, strides = var_832, weight = blocks_1_attn_q_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("q_9_cast_fp16")];
+            tensor<int32, [2]> var_838 = const()[name = tensor<string, []>("op_838"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_840 = const()[name = tensor<string, []>("op_840"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_13_pad_type_0 = const()[name = tensor<string, []>("k_13_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_13_pad_0 = const()[name = tensor<string, []>("k_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(220220736)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_13_cast_fp16 = conv(dilations = var_840, groups = var_779, pad = k_13_pad_0, pad_type = k_13_pad_type_0, strides = var_838, weight = blocks_1_attn_k_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("k_13_cast_fp16")];
+            tensor<int32, [2]> var_844 = const()[name = tensor<string, []>("op_844"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_846 = const()[name = tensor<string, []>("op_846"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_11_pad_type_0 = const()[name = tensor<string, []>("v_11_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_11_pad_0 = const()[name = tensor<string, []>("v_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226512256)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_11_cast_fp16 = conv(dilations = var_846, groups = var_779, pad = v_11_pad_0, pad_type = v_11_pad_type_0, strides = var_844, weight = blocks_1_attn_v_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("v_11_cast_fp16")];
+            tensor<int32, [4]> var_849 = const()[name = tensor<string, []>("op_849"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_11_cast_fp16 = reshape(shape = var_849, x = q_9_cast_fp16)[name = tensor<string, []>("q_11_cast_fp16")];
+            tensor<int32, [4]> var_851 = const()[name = tensor<string, []>("op_851"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_15_cast_fp16 = reshape(shape = var_851, x = k_13_cast_fp16)[name = tensor<string, []>("k_15_cast_fp16")];
+            tensor<int32, [4]> var_865_begin_0 = const()[name = tensor<string, []>("op_865_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_865_end_0 = const()[name = tensor<string, []>("op_865_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_865_end_mask_0 = const()[name = tensor<string, []>("op_865_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_865_cast_fp16 = slice_by_index(begin = var_865_begin_0, end = var_865_end_0, end_mask = var_865_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_865_cast_fp16")];
+            tensor<int32, [4]> var_871_begin_0 = const()[name = tensor<string, []>("op_871_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_871_end_0 = const()[name = tensor<string, []>("op_871_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_871_end_mask_0 = const()[name = tensor<string, []>("op_871_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_871_cast_fp16 = slice_by_index(begin = var_871_begin_0, end = var_871_end_0, end_mask = var_871_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_871_cast_fp16")];
+            tensor<fp16, []> const_30_promoted_to_fp16 = const()[name = tensor<string, []>("const_30_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_873_cast_fp16 = mul(x = var_871_cast_fp16, y = const_30_promoted_to_fp16)[name = tensor<string, []>("op_873_cast_fp16")];
+            tensor<bool, []> rotated_5_interleave_0 = const()[name = tensor<string, []>("rotated_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_5_cast_fp16 = concat(axis = var_744, interleave = rotated_5_interleave_0, values = (var_873_cast_fp16, var_865_cast_fp16))[name = tensor<string, []>("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_876_cast_fp16 = mul(x = q_11_cast_fp16, y = cos)[name = tensor<string, []>("op_876_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_877_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = tensor<string, []>("op_877_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_5_cast_fp16 = add(x = var_876_cast_fp16, y = var_877_cast_fp16)[name = tensor<string, []>("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_890_begin_0 = const()[name = tensor<string, []>("op_890_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_890_end_0 = const()[name = tensor<string, []>("op_890_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_890_end_mask_0 = const()[name = tensor<string, []>("op_890_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_890_cast_fp16 = slice_by_index(begin = var_890_begin_0, end = var_890_end_0, end_mask = var_890_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_890_cast_fp16")];
+            tensor<int32, [4]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_896_cast_fp16 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_896_cast_fp16")];
+            tensor<fp16, []> const_32_promoted_to_fp16 = const()[name = tensor<string, []>("const_32_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_898_cast_fp16 = mul(x = var_896_cast_fp16, y = const_32_promoted_to_fp16)[name = tensor<string, []>("op_898_cast_fp16")];
+            tensor<bool, []> rotated_interleave_0 = const()[name = tensor<string, []>("rotated_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_cast_fp16 = concat(axis = var_744, interleave = rotated_interleave_0, values = (var_898_cast_fp16, var_890_cast_fp16))[name = tensor<string, []>("rotated_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_901_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = tensor<string, []>("op_901_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_902_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = tensor<string, []>("op_902_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_cast_fp16 = add(x = var_901_cast_fp16, y = var_902_cast_fp16)[name = tensor<string, []>("roped_cast_fp16")];
+            tensor<int32, [4]> var_905 = const()[name = tensor<string, []>("op_905"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_19_cast_fp16 = reshape(shape = var_905, x = roped_cast_fp16)[name = tensor<string, []>("k_19_cast_fp16")];
+            tensor<int32, [4]> var_907 = const()[name = tensor<string, []>("op_907"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_1 = reshape(shape = var_907, x = v_11_cast_fp16)[name = tensor<string, []>("new_v_cache_1_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_interleave_0 = const()[name = tensor<string, []>("k_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_1 = transpose(perm = k_21_perm_0, x = k_19_cast_fp16)[name = tensor<string, []>("transpose_0")];
+            tensor<fp16, [1, 512, 1, 1024]> k_cast_fp16 = concat(axis = var_746, interleave = k_interleave_0, values = (k_cache_1, new_k_cache_1))[name = tensor<string, []>("k_cast_fp16")];
+            tensor<bool, []> v_17_interleave_0 = const()[name = tensor<string, []>("v_17_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_17_cast_fp16 = concat(axis = var_740, interleave = v_17_interleave_0, values = (v_cache_1, new_v_cache_1))[name = tensor<string, []>("v_17_cast_fp16")];
+            tensor<int32, [4]> var_915 = const()[name = tensor<string, []>("op_915"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_cast_fp16 = reshape(shape = var_915, x = roped_5_cast_fp16)[name = tensor<string, []>("q_cast_fp16")];
+            tensor<int32, [4]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_920_cast_fp16 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_920_cast_fp16")];
+            tensor<int32, [4]> var_924_begin_0 = const()[name = tensor<string, []>("op_924_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_924_end_0 = const()[name = tensor<string, []>("op_924_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_924_end_mask_0 = const()[name = tensor<string, []>("op_924_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_924_cast_fp16 = slice_by_index(begin = var_924_begin_0, end = var_924_end_0, end_mask = var_924_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_924_cast_fp16")];
+            tensor<int32, [4]> var_928_begin_0 = const()[name = tensor<string, []>("op_928_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_928_end_0 = const()[name = tensor<string, []>("op_928_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_928_end_mask_0 = const()[name = tensor<string, []>("op_928_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_928_cast_fp16 = slice_by_index(begin = var_928_begin_0, end = var_928_end_0, end_mask = var_928_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_928_cast_fp16")];
+            tensor<int32, [4]> var_932_begin_0 = const()[name = tensor<string, []>("op_932_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_932_end_0 = const()[name = tensor<string, []>("op_932_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_932_end_mask_0 = const()[name = tensor<string, []>("op_932_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_932_cast_fp16 = slice_by_index(begin = var_932_begin_0, end = var_932_end_0, end_mask = var_932_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_932_cast_fp16")];
+            tensor<int32, [4]> var_936_begin_0 = const()[name = tensor<string, []>("op_936_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_936_end_0 = const()[name = tensor<string, []>("op_936_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_936_end_mask_0 = const()[name = tensor<string, []>("op_936_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_936_cast_fp16 = slice_by_index(begin = var_936_begin_0, end = var_936_end_0, end_mask = var_936_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_936_cast_fp16")];
+            tensor<int32, [4]> var_940_begin_0 = const()[name = tensor<string, []>("op_940_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_940_end_0 = const()[name = tensor<string, []>("op_940_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_940_end_mask_0 = const()[name = tensor<string, []>("op_940_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_940_cast_fp16 = slice_by_index(begin = var_940_begin_0, end = var_940_end_0, end_mask = var_940_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_940_cast_fp16")];
+            tensor<int32, [4]> var_944_begin_0 = const()[name = tensor<string, []>("op_944_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_944_end_0 = const()[name = tensor<string, []>("op_944_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_944_end_mask_0 = const()[name = tensor<string, []>("op_944_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_944_cast_fp16 = slice_by_index(begin = var_944_begin_0, end = var_944_end_0, end_mask = var_944_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_944_cast_fp16")];
+            tensor<int32, [4]> var_948_begin_0 = const()[name = tensor<string, []>("op_948_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_948_end_0 = const()[name = tensor<string, []>("op_948_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_948_end_mask_0 = const()[name = tensor<string, []>("op_948_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_948_cast_fp16 = slice_by_index(begin = var_948_begin_0, end = var_948_end_0, end_mask = var_948_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_948_cast_fp16")];
+            tensor<int32, [4]> var_952_begin_0 = const()[name = tensor<string, []>("op_952_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_952_end_0 = const()[name = tensor<string, []>("op_952_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_952_end_mask_0 = const()[name = tensor<string, []>("op_952_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_952_cast_fp16 = slice_by_index(begin = var_952_begin_0, end = var_952_end_0, end_mask = var_952_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_952_cast_fp16")];
+            tensor<int32, [4]> var_956_begin_0 = const()[name = tensor<string, []>("op_956_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_956_end_0 = const()[name = tensor<string, []>("op_956_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_956_end_mask_0 = const()[name = tensor<string, []>("op_956_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_956_cast_fp16 = slice_by_index(begin = var_956_begin_0, end = var_956_end_0, end_mask = var_956_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_956_cast_fp16")];
+            tensor<int32, [4]> var_960_begin_0 = const()[name = tensor<string, []>("op_960_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_960_end_0 = const()[name = tensor<string, []>("op_960_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_960_end_mask_0 = const()[name = tensor<string, []>("op_960_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_960_cast_fp16 = slice_by_index(begin = var_960_begin_0, end = var_960_end_0, end_mask = var_960_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_960_cast_fp16")];
+            tensor<int32, [4]> var_964_begin_0 = const()[name = tensor<string, []>("op_964_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_964_end_0 = const()[name = tensor<string, []>("op_964_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_964_end_mask_0 = const()[name = tensor<string, []>("op_964_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_964_cast_fp16 = slice_by_index(begin = var_964_begin_0, end = var_964_end_0, end_mask = var_964_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_964_cast_fp16")];
+            tensor<int32, [4]> var_968_begin_0 = const()[name = tensor<string, []>("op_968_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_968_end_0 = const()[name = tensor<string, []>("op_968_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_968_end_mask_0 = const()[name = tensor<string, []>("op_968_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_968_cast_fp16 = slice_by_index(begin = var_968_begin_0, end = var_968_end_0, end_mask = var_968_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_968_cast_fp16")];
+            tensor<int32, [4]> var_972_begin_0 = const()[name = tensor<string, []>("op_972_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_972_end_0 = const()[name = tensor<string, []>("op_972_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_972_end_mask_0 = const()[name = tensor<string, []>("op_972_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_972_cast_fp16 = slice_by_index(begin = var_972_begin_0, end = var_972_end_0, end_mask = var_972_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_972_cast_fp16")];
+            tensor<int32, [4]> var_976_begin_0 = const()[name = tensor<string, []>("op_976_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_976_end_0 = const()[name = tensor<string, []>("op_976_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_976_end_mask_0 = const()[name = tensor<string, []>("op_976_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_976_cast_fp16 = slice_by_index(begin = var_976_begin_0, end = var_976_end_0, end_mask = var_976_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_976_cast_fp16")];
+            tensor<int32, [4]> var_980_begin_0 = const()[name = tensor<string, []>("op_980_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_980_end_0 = const()[name = tensor<string, []>("op_980_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_980_end_mask_0 = const()[name = tensor<string, []>("op_980_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_980_cast_fp16 = slice_by_index(begin = var_980_begin_0, end = var_980_end_0, end_mask = var_980_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_980_cast_fp16")];
+            tensor<int32, [4]> var_984_begin_0 = const()[name = tensor<string, []>("op_984_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_984_end_0 = const()[name = tensor<string, []>("op_984_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_984_end_mask_0 = const()[name = tensor<string, []>("op_984_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_984_cast_fp16 = slice_by_index(begin = var_984_begin_0, end = var_984_end_0, end_mask = var_984_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_984_cast_fp16")];
+            tensor<int32, [4]> var_988_begin_0 = const()[name = tensor<string, []>("op_988_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_988_end_0 = const()[name = tensor<string, []>("op_988_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_988_end_mask_0 = const()[name = tensor<string, []>("op_988_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_988_cast_fp16 = slice_by_index(begin = var_988_begin_0, end = var_988_end_0, end_mask = var_988_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_988_cast_fp16")];
+            tensor<int32, [4]> var_992_begin_0 = const()[name = tensor<string, []>("op_992_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_992_end_0 = const()[name = tensor<string, []>("op_992_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_992_end_mask_0 = const()[name = tensor<string, []>("op_992_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_992_cast_fp16 = slice_by_index(begin = var_992_begin_0, end = var_992_end_0, end_mask = var_992_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_992_cast_fp16")];
+            tensor<int32, [4]> var_996_begin_0 = const()[name = tensor<string, []>("op_996_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_996_end_0 = const()[name = tensor<string, []>("op_996_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_996_end_mask_0 = const()[name = tensor<string, []>("op_996_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_996_cast_fp16 = slice_by_index(begin = var_996_begin_0, end = var_996_end_0, end_mask = var_996_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_996_cast_fp16")];
+            tensor<int32, [4]> var_1000_begin_0 = const()[name = tensor<string, []>("op_1000_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_1000_end_0 = const()[name = tensor<string, []>("op_1000_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_1000_end_mask_0 = const()[name = tensor<string, []>("op_1000_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1000_cast_fp16 = slice_by_index(begin = var_1000_begin_0, end = var_1000_end_0, end_mask = var_1000_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1000_cast_fp16")];
+            tensor<int32, [4]> var_1004_begin_0 = const()[name = tensor<string, []>("op_1004_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_1004_end_0 = const()[name = tensor<string, []>("op_1004_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_1004_end_mask_0 = const()[name = tensor<string, []>("op_1004_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1004_cast_fp16 = slice_by_index(begin = var_1004_begin_0, end = var_1004_end_0, end_mask = var_1004_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1004_cast_fp16")];
+            tensor<int32, [4]> var_1008_begin_0 = const()[name = tensor<string, []>("op_1008_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_1008_end_0 = const()[name = tensor<string, []>("op_1008_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_1008_end_mask_0 = const()[name = tensor<string, []>("op_1008_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1008_cast_fp16 = slice_by_index(begin = var_1008_begin_0, end = var_1008_end_0, end_mask = var_1008_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1008_cast_fp16")];
+            tensor<int32, [4]> var_1012_begin_0 = const()[name = tensor<string, []>("op_1012_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_1012_end_0 = const()[name = tensor<string, []>("op_1012_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_1012_end_mask_0 = const()[name = tensor<string, []>("op_1012_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1012_cast_fp16 = slice_by_index(begin = var_1012_begin_0, end = var_1012_end_0, end_mask = var_1012_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1012_cast_fp16")];
+            tensor<int32, [4]> var_1018_begin_0 = const()[name = tensor<string, []>("op_1018_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1018_end_0 = const()[name = tensor<string, []>("op_1018_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_1018_end_mask_0 = const()[name = tensor<string, []>("op_1018_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1018_cast_fp16 = slice_by_index(begin = var_1018_begin_0, end = var_1018_end_0, end_mask = var_1018_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1018_cast_fp16")];
+            tensor<int32, [4]> var_1030_begin_0 = const()[name = tensor<string, []>("op_1030_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_1030_end_0 = const()[name = tensor<string, []>("op_1030_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_1030_end_mask_0 = const()[name = tensor<string, []>("op_1030_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1030_cast_fp16 = slice_by_index(begin = var_1030_begin_0, end = var_1030_end_0, end_mask = var_1030_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1030_cast_fp16")];
+            tensor<int32, [4]> var_1042_begin_0 = const()[name = tensor<string, []>("op_1042_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_1042_end_0 = const()[name = tensor<string, []>("op_1042_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_1042_end_mask_0 = const()[name = tensor<string, []>("op_1042_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1042_cast_fp16 = slice_by_index(begin = var_1042_begin_0, end = var_1042_end_0, end_mask = var_1042_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1042_cast_fp16")];
+            tensor<int32, [4]> var_1054_begin_0 = const()[name = tensor<string, []>("op_1054_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_1054_end_0 = const()[name = tensor<string, []>("op_1054_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1054_end_mask_0 = const()[name = tensor<string, []>("op_1054_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1054_cast_fp16 = slice_by_index(begin = var_1054_begin_0, end = var_1054_end_0, end_mask = var_1054_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1054_cast_fp16")];
+            tensor<int32, [4]> var_1066_begin_0 = const()[name = tensor<string, []>("op_1066_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_1066_end_0 = const()[name = tensor<string, []>("op_1066_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_1066_end_mask_0 = const()[name = tensor<string, []>("op_1066_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1066_cast_fp16 = slice_by_index(begin = var_1066_begin_0, end = var_1066_end_0, end_mask = var_1066_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1066_cast_fp16")];
+            tensor<int32, [4]> var_1078_begin_0 = const()[name = tensor<string, []>("op_1078_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_1078_end_0 = const()[name = tensor<string, []>("op_1078_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_1078_end_mask_0 = const()[name = tensor<string, []>("op_1078_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1078_cast_fp16 = slice_by_index(begin = var_1078_begin_0, end = var_1078_end_0, end_mask = var_1078_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1078_cast_fp16")];
+            tensor<int32, [4]> var_1090_begin_0 = const()[name = tensor<string, []>("op_1090_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_1090_end_0 = const()[name = tensor<string, []>("op_1090_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_1090_end_mask_0 = const()[name = tensor<string, []>("op_1090_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1090_cast_fp16 = slice_by_index(begin = var_1090_begin_0, end = var_1090_end_0, end_mask = var_1090_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1090_cast_fp16")];
+            tensor<int32, [4]> var_1102_begin_0 = const()[name = tensor<string, []>("op_1102_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_1102_end_0 = const()[name = tensor<string, []>("op_1102_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_1102_end_mask_0 = const()[name = tensor<string, []>("op_1102_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1102_cast_fp16 = slice_by_index(begin = var_1102_begin_0, end = var_1102_end_0, end_mask = var_1102_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1102_cast_fp16")];
+            tensor<int32, [4]> var_1112_begin_0 = const()[name = tensor<string, []>("op_1112_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1112_end_0 = const()[name = tensor<string, []>("op_1112_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_1112_end_mask_0 = const()[name = tensor<string, []>("op_1112_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1112_cast_fp16 = slice_by_index(begin = var_1112_begin_0, end = var_1112_end_0, end_mask = var_1112_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1112_cast_fp16")];
+            tensor<int32, [4]> var_1124_begin_0 = const()[name = tensor<string, []>("op_1124_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_1124_end_0 = const()[name = tensor<string, []>("op_1124_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_1124_end_mask_0 = const()[name = tensor<string, []>("op_1124_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1124_cast_fp16 = slice_by_index(begin = var_1124_begin_0, end = var_1124_end_0, end_mask = var_1124_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1124_cast_fp16")];
+            tensor<int32, [4]> var_1136_begin_0 = const()[name = tensor<string, []>("op_1136_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_1136_end_0 = const()[name = tensor<string, []>("op_1136_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_1136_end_mask_0 = const()[name = tensor<string, []>("op_1136_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1136_cast_fp16 = slice_by_index(begin = var_1136_begin_0, end = var_1136_end_0, end_mask = var_1136_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1136_cast_fp16")];
+            tensor<int32, [4]> var_1148_begin_0 = const()[name = tensor<string, []>("op_1148_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_1148_end_0 = const()[name = tensor<string, []>("op_1148_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1148_end_mask_0 = const()[name = tensor<string, []>("op_1148_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1148_cast_fp16 = slice_by_index(begin = var_1148_begin_0, end = var_1148_end_0, end_mask = var_1148_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1148_cast_fp16")];
+            tensor<int32, [4]> var_1160_begin_0 = const()[name = tensor<string, []>("op_1160_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_1160_end_0 = const()[name = tensor<string, []>("op_1160_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_1160_end_mask_0 = const()[name = tensor<string, []>("op_1160_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1160_cast_fp16 = slice_by_index(begin = var_1160_begin_0, end = var_1160_end_0, end_mask = var_1160_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1160_cast_fp16")];
+            tensor<int32, [4]> var_1172_begin_0 = const()[name = tensor<string, []>("op_1172_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_1172_end_0 = const()[name = tensor<string, []>("op_1172_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_1172_end_mask_0 = const()[name = tensor<string, []>("op_1172_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1172_cast_fp16 = slice_by_index(begin = var_1172_begin_0, end = var_1172_end_0, end_mask = var_1172_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1172_cast_fp16")];
+            tensor<int32, [4]> var_1184_begin_0 = const()[name = tensor<string, []>("op_1184_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_1184_end_0 = const()[name = tensor<string, []>("op_1184_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_1184_end_mask_0 = const()[name = tensor<string, []>("op_1184_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1184_cast_fp16 = slice_by_index(begin = var_1184_begin_0, end = var_1184_end_0, end_mask = var_1184_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1184_cast_fp16")];
+            tensor<int32, [4]> var_1196_begin_0 = const()[name = tensor<string, []>("op_1196_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_1196_end_0 = const()[name = tensor<string, []>("op_1196_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_1196_end_mask_0 = const()[name = tensor<string, []>("op_1196_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1196_cast_fp16 = slice_by_index(begin = var_1196_begin_0, end = var_1196_end_0, end_mask = var_1196_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1196_cast_fp16")];
+            tensor<string, []> var_1208_equation_0 = const()[name = tensor<string, []>("op_1208_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1208_cast_fp16 = einsum(equation = var_1208_equation_0, values = (var_1018_cast_fp16, var_920_cast_fp16))[name = tensor<string, []>("op_1208_cast_fp16")];
+            tensor<fp16, []> var_1209_to_fp16 = const()[name = tensor<string, []>("op_1209_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1210_cast_fp16 = mul(x = var_1208_cast_fp16, y = var_1209_to_fp16)[name = tensor<string, []>("op_1210_cast_fp16")];
+            tensor<string, []> var_1212_equation_0 = const()[name = tensor<string, []>("op_1212_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1212_cast_fp16 = einsum(equation = var_1212_equation_0, values = (var_1018_cast_fp16, var_924_cast_fp16))[name = tensor<string, []>("op_1212_cast_fp16")];
+            tensor<fp16, []> var_1213_to_fp16 = const()[name = tensor<string, []>("op_1213_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1214_cast_fp16 = mul(x = var_1212_cast_fp16, y = var_1213_to_fp16)[name = tensor<string, []>("op_1214_cast_fp16")];
+            tensor<string, []> var_1216_equation_0 = const()[name = tensor<string, []>("op_1216_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1216_cast_fp16 = einsum(equation = var_1216_equation_0, values = (var_1018_cast_fp16, var_928_cast_fp16))[name = tensor<string, []>("op_1216_cast_fp16")];
+            tensor<fp16, []> var_1217_to_fp16 = const()[name = tensor<string, []>("op_1217_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1218_cast_fp16 = mul(x = var_1216_cast_fp16, y = var_1217_to_fp16)[name = tensor<string, []>("op_1218_cast_fp16")];
+            tensor<string, []> var_1220_equation_0 = const()[name = tensor<string, []>("op_1220_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1220_cast_fp16 = einsum(equation = var_1220_equation_0, values = (var_1030_cast_fp16, var_932_cast_fp16))[name = tensor<string, []>("op_1220_cast_fp16")];
+            tensor<fp16, []> var_1221_to_fp16 = const()[name = tensor<string, []>("op_1221_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1222_cast_fp16 = mul(x = var_1220_cast_fp16, y = var_1221_to_fp16)[name = tensor<string, []>("op_1222_cast_fp16")];
+            tensor<string, []> var_1224_equation_0 = const()[name = tensor<string, []>("op_1224_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1224_cast_fp16 = einsum(equation = var_1224_equation_0, values = (var_1030_cast_fp16, var_936_cast_fp16))[name = tensor<string, []>("op_1224_cast_fp16")];
+            tensor<fp16, []> var_1225_to_fp16 = const()[name = tensor<string, []>("op_1225_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1226_cast_fp16 = mul(x = var_1224_cast_fp16, y = var_1225_to_fp16)[name = tensor<string, []>("op_1226_cast_fp16")];
+            tensor<string, []> var_1228_equation_0 = const()[name = tensor<string, []>("op_1228_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1228_cast_fp16 = einsum(equation = var_1228_equation_0, values = (var_1030_cast_fp16, var_940_cast_fp16))[name = tensor<string, []>("op_1228_cast_fp16")];
+            tensor<fp16, []> var_1229_to_fp16 = const()[name = tensor<string, []>("op_1229_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1230_cast_fp16 = mul(x = var_1228_cast_fp16, y = var_1229_to_fp16)[name = tensor<string, []>("op_1230_cast_fp16")];
+            tensor<string, []> var_1232_equation_0 = const()[name = tensor<string, []>("op_1232_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1232_cast_fp16 = einsum(equation = var_1232_equation_0, values = (var_1042_cast_fp16, var_944_cast_fp16))[name = tensor<string, []>("op_1232_cast_fp16")];
+            tensor<fp16, []> var_1233_to_fp16 = const()[name = tensor<string, []>("op_1233_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1234_cast_fp16 = mul(x = var_1232_cast_fp16, y = var_1233_to_fp16)[name = tensor<string, []>("op_1234_cast_fp16")];
+            tensor<string, []> var_1236_equation_0 = const()[name = tensor<string, []>("op_1236_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1236_cast_fp16 = einsum(equation = var_1236_equation_0, values = (var_1042_cast_fp16, var_948_cast_fp16))[name = tensor<string, []>("op_1236_cast_fp16")];
+            tensor<fp16, []> var_1237_to_fp16 = const()[name = tensor<string, []>("op_1237_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1238_cast_fp16 = mul(x = var_1236_cast_fp16, y = var_1237_to_fp16)[name = tensor<string, []>("op_1238_cast_fp16")];
+            tensor<string, []> var_1240_equation_0 = const()[name = tensor<string, []>("op_1240_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1240_cast_fp16 = einsum(equation = var_1240_equation_0, values = (var_1042_cast_fp16, var_952_cast_fp16))[name = tensor<string, []>("op_1240_cast_fp16")];
+            tensor<fp16, []> var_1241_to_fp16 = const()[name = tensor<string, []>("op_1241_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1242_cast_fp16 = mul(x = var_1240_cast_fp16, y = var_1241_to_fp16)[name = tensor<string, []>("op_1242_cast_fp16")];
+            tensor<string, []> var_1244_equation_0 = const()[name = tensor<string, []>("op_1244_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1244_cast_fp16 = einsum(equation = var_1244_equation_0, values = (var_1054_cast_fp16, var_956_cast_fp16))[name = tensor<string, []>("op_1244_cast_fp16")];
+            tensor<fp16, []> var_1245_to_fp16 = const()[name = tensor<string, []>("op_1245_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1246_cast_fp16 = mul(x = var_1244_cast_fp16, y = var_1245_to_fp16)[name = tensor<string, []>("op_1246_cast_fp16")];
+            tensor<string, []> var_1248_equation_0 = const()[name = tensor<string, []>("op_1248_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1248_cast_fp16 = einsum(equation = var_1248_equation_0, values = (var_1054_cast_fp16, var_960_cast_fp16))[name = tensor<string, []>("op_1248_cast_fp16")];
+            tensor<fp16, []> var_1249_to_fp16 = const()[name = tensor<string, []>("op_1249_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1250_cast_fp16 = mul(x = var_1248_cast_fp16, y = var_1249_to_fp16)[name = tensor<string, []>("op_1250_cast_fp16")];
+            tensor<string, []> var_1252_equation_0 = const()[name = tensor<string, []>("op_1252_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1252_cast_fp16 = einsum(equation = var_1252_equation_0, values = (var_1054_cast_fp16, var_964_cast_fp16))[name = tensor<string, []>("op_1252_cast_fp16")];
+            tensor<fp16, []> var_1253_to_fp16 = const()[name = tensor<string, []>("op_1253_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1254_cast_fp16 = mul(x = var_1252_cast_fp16, y = var_1253_to_fp16)[name = tensor<string, []>("op_1254_cast_fp16")];
+            tensor<string, []> var_1256_equation_0 = const()[name = tensor<string, []>("op_1256_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1256_cast_fp16 = einsum(equation = var_1256_equation_0, values = (var_1066_cast_fp16, var_968_cast_fp16))[name = tensor<string, []>("op_1256_cast_fp16")];
+            tensor<fp16, []> var_1257_to_fp16 = const()[name = tensor<string, []>("op_1257_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1258_cast_fp16 = mul(x = var_1256_cast_fp16, y = var_1257_to_fp16)[name = tensor<string, []>("op_1258_cast_fp16")];
+            tensor<string, []> var_1260_equation_0 = const()[name = tensor<string, []>("op_1260_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1260_cast_fp16 = einsum(equation = var_1260_equation_0, values = (var_1066_cast_fp16, var_972_cast_fp16))[name = tensor<string, []>("op_1260_cast_fp16")];
+            tensor<fp16, []> var_1261_to_fp16 = const()[name = tensor<string, []>("op_1261_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1262_cast_fp16 = mul(x = var_1260_cast_fp16, y = var_1261_to_fp16)[name = tensor<string, []>("op_1262_cast_fp16")];
+            tensor<string, []> var_1264_equation_0 = const()[name = tensor<string, []>("op_1264_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1264_cast_fp16 = einsum(equation = var_1264_equation_0, values = (var_1066_cast_fp16, var_976_cast_fp16))[name = tensor<string, []>("op_1264_cast_fp16")];
+            tensor<fp16, []> var_1265_to_fp16 = const()[name = tensor<string, []>("op_1265_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1266_cast_fp16 = mul(x = var_1264_cast_fp16, y = var_1265_to_fp16)[name = tensor<string, []>("op_1266_cast_fp16")];
+            tensor<string, []> var_1268_equation_0 = const()[name = tensor<string, []>("op_1268_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1268_cast_fp16 = einsum(equation = var_1268_equation_0, values = (var_1078_cast_fp16, var_980_cast_fp16))[name = tensor<string, []>("op_1268_cast_fp16")];
+            tensor<fp16, []> var_1269_to_fp16 = const()[name = tensor<string, []>("op_1269_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1270_cast_fp16 = mul(x = var_1268_cast_fp16, y = var_1269_to_fp16)[name = tensor<string, []>("op_1270_cast_fp16")];
+            tensor<string, []> var_1272_equation_0 = const()[name = tensor<string, []>("op_1272_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1272_cast_fp16 = einsum(equation = var_1272_equation_0, values = (var_1078_cast_fp16, var_984_cast_fp16))[name = tensor<string, []>("op_1272_cast_fp16")];
+            tensor<fp16, []> var_1273_to_fp16 = const()[name = tensor<string, []>("op_1273_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1274_cast_fp16 = mul(x = var_1272_cast_fp16, y = var_1273_to_fp16)[name = tensor<string, []>("op_1274_cast_fp16")];
+            tensor<string, []> var_1276_equation_0 = const()[name = tensor<string, []>("op_1276_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1276_cast_fp16 = einsum(equation = var_1276_equation_0, values = (var_1078_cast_fp16, var_988_cast_fp16))[name = tensor<string, []>("op_1276_cast_fp16")];
+            tensor<fp16, []> var_1277_to_fp16 = const()[name = tensor<string, []>("op_1277_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1278_cast_fp16 = mul(x = var_1276_cast_fp16, y = var_1277_to_fp16)[name = tensor<string, []>("op_1278_cast_fp16")];
+            tensor<string, []> var_1280_equation_0 = const()[name = tensor<string, []>("op_1280_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1280_cast_fp16 = einsum(equation = var_1280_equation_0, values = (var_1090_cast_fp16, var_992_cast_fp16))[name = tensor<string, []>("op_1280_cast_fp16")];
+            tensor<fp16, []> var_1281_to_fp16 = const()[name = tensor<string, []>("op_1281_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1282_cast_fp16 = mul(x = var_1280_cast_fp16, y = var_1281_to_fp16)[name = tensor<string, []>("op_1282_cast_fp16")];
+            tensor<string, []> var_1284_equation_0 = const()[name = tensor<string, []>("op_1284_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1284_cast_fp16 = einsum(equation = var_1284_equation_0, values = (var_1090_cast_fp16, var_996_cast_fp16))[name = tensor<string, []>("op_1284_cast_fp16")];
+            tensor<fp16, []> var_1285_to_fp16 = const()[name = tensor<string, []>("op_1285_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1286_cast_fp16 = mul(x = var_1284_cast_fp16, y = var_1285_to_fp16)[name = tensor<string, []>("op_1286_cast_fp16")];
+            tensor<string, []> var_1288_equation_0 = const()[name = tensor<string, []>("op_1288_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1288_cast_fp16 = einsum(equation = var_1288_equation_0, values = (var_1090_cast_fp16, var_1000_cast_fp16))[name = tensor<string, []>("op_1288_cast_fp16")];
+            tensor<fp16, []> var_1289_to_fp16 = const()[name = tensor<string, []>("op_1289_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1290_cast_fp16 = mul(x = var_1288_cast_fp16, y = var_1289_to_fp16)[name = tensor<string, []>("op_1290_cast_fp16")];
+            tensor<string, []> var_1292_equation_0 = const()[name = tensor<string, []>("op_1292_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1292_cast_fp16 = einsum(equation = var_1292_equation_0, values = (var_1102_cast_fp16, var_1004_cast_fp16))[name = tensor<string, []>("op_1292_cast_fp16")];
+            tensor<fp16, []> var_1293_to_fp16 = const()[name = tensor<string, []>("op_1293_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1294_cast_fp16 = mul(x = var_1292_cast_fp16, y = var_1293_to_fp16)[name = tensor<string, []>("op_1294_cast_fp16")];
+            tensor<string, []> var_1296_equation_0 = const()[name = tensor<string, []>("op_1296_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1296_cast_fp16 = einsum(equation = var_1296_equation_0, values = (var_1102_cast_fp16, var_1008_cast_fp16))[name = tensor<string, []>("op_1296_cast_fp16")];
+            tensor<fp16, []> var_1297_to_fp16 = const()[name = tensor<string, []>("op_1297_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1298_cast_fp16 = mul(x = var_1296_cast_fp16, y = var_1297_to_fp16)[name = tensor<string, []>("op_1298_cast_fp16")];
+            tensor<string, []> var_1300_equation_0 = const()[name = tensor<string, []>("op_1300_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1300_cast_fp16 = einsum(equation = var_1300_equation_0, values = (var_1102_cast_fp16, var_1012_cast_fp16))[name = tensor<string, []>("op_1300_cast_fp16")];
+            tensor<fp16, []> var_1301_to_fp16 = const()[name = tensor<string, []>("op_1301_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1302_cast_fp16 = mul(x = var_1300_cast_fp16, y = var_1301_to_fp16)[name = tensor<string, []>("op_1302_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_49_cast_fp16 = add(x = var_1210_cast_fp16, y = mask)[name = tensor<string, []>("aw_49_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_51_cast_fp16 = add(x = var_1214_cast_fp16, y = mask)[name = tensor<string, []>("aw_51_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_53_cast_fp16 = add(x = var_1218_cast_fp16, y = mask)[name = tensor<string, []>("aw_53_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_55_cast_fp16 = add(x = var_1222_cast_fp16, y = mask)[name = tensor<string, []>("aw_55_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_57_cast_fp16 = add(x = var_1226_cast_fp16, y = mask)[name = tensor<string, []>("aw_57_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_59_cast_fp16 = add(x = var_1230_cast_fp16, y = mask)[name = tensor<string, []>("aw_59_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_61_cast_fp16 = add(x = var_1234_cast_fp16, y = mask)[name = tensor<string, []>("aw_61_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_63_cast_fp16 = add(x = var_1238_cast_fp16, y = mask)[name = tensor<string, []>("aw_63_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_65_cast_fp16 = add(x = var_1242_cast_fp16, y = mask)[name = tensor<string, []>("aw_65_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_67_cast_fp16 = add(x = var_1246_cast_fp16, y = mask)[name = tensor<string, []>("aw_67_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_69_cast_fp16 = add(x = var_1250_cast_fp16, y = mask)[name = tensor<string, []>("aw_69_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_71_cast_fp16 = add(x = var_1254_cast_fp16, y = mask)[name = tensor<string, []>("aw_71_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_73_cast_fp16 = add(x = var_1258_cast_fp16, y = mask)[name = tensor<string, []>("aw_73_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_75_cast_fp16 = add(x = var_1262_cast_fp16, y = mask)[name = tensor<string, []>("aw_75_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_77_cast_fp16 = add(x = var_1266_cast_fp16, y = mask)[name = tensor<string, []>("aw_77_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_79_cast_fp16 = add(x = var_1270_cast_fp16, y = mask)[name = tensor<string, []>("aw_79_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_81_cast_fp16 = add(x = var_1274_cast_fp16, y = mask)[name = tensor<string, []>("aw_81_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_83_cast_fp16 = add(x = var_1278_cast_fp16, y = mask)[name = tensor<string, []>("aw_83_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_85_cast_fp16 = add(x = var_1282_cast_fp16, y = mask)[name = tensor<string, []>("aw_85_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_87_cast_fp16 = add(x = var_1286_cast_fp16, y = mask)[name = tensor<string, []>("aw_87_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_89_cast_fp16 = add(x = var_1290_cast_fp16, y = mask)[name = tensor<string, []>("aw_89_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_91_cast_fp16 = add(x = var_1294_cast_fp16, y = mask)[name = tensor<string, []>("aw_91_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_93_cast_fp16 = add(x = var_1298_cast_fp16, y = mask)[name = tensor<string, []>("aw_93_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_cast_fp16 = add(x = var_1302_cast_fp16, y = mask)[name = tensor<string, []>("aw_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1327_cast_fp16 = softmax(axis = var_779, x = aw_49_cast_fp16)[name = tensor<string, []>("op_1327_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1328_cast_fp16 = softmax(axis = var_779, x = aw_51_cast_fp16)[name = tensor<string, []>("op_1328_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1329_cast_fp16 = softmax(axis = var_779, x = aw_53_cast_fp16)[name = tensor<string, []>("op_1329_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1330_cast_fp16 = softmax(axis = var_779, x = aw_55_cast_fp16)[name = tensor<string, []>("op_1330_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1331_cast_fp16 = softmax(axis = var_779, x = aw_57_cast_fp16)[name = tensor<string, []>("op_1331_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1332_cast_fp16 = softmax(axis = var_779, x = aw_59_cast_fp16)[name = tensor<string, []>("op_1332_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1333_cast_fp16 = softmax(axis = var_779, x = aw_61_cast_fp16)[name = tensor<string, []>("op_1333_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1334_cast_fp16 = softmax(axis = var_779, x = aw_63_cast_fp16)[name = tensor<string, []>("op_1334_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1335_cast_fp16 = softmax(axis = var_779, x = aw_65_cast_fp16)[name = tensor<string, []>("op_1335_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1336_cast_fp16 = softmax(axis = var_779, x = aw_67_cast_fp16)[name = tensor<string, []>("op_1336_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1337_cast_fp16 = softmax(axis = var_779, x = aw_69_cast_fp16)[name = tensor<string, []>("op_1337_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1338_cast_fp16 = softmax(axis = var_779, x = aw_71_cast_fp16)[name = tensor<string, []>("op_1338_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1339_cast_fp16 = softmax(axis = var_779, x = aw_73_cast_fp16)[name = tensor<string, []>("op_1339_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1340_cast_fp16 = softmax(axis = var_779, x = aw_75_cast_fp16)[name = tensor<string, []>("op_1340_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1341_cast_fp16 = softmax(axis = var_779, x = aw_77_cast_fp16)[name = tensor<string, []>("op_1341_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1342_cast_fp16 = softmax(axis = var_779, x = aw_79_cast_fp16)[name = tensor<string, []>("op_1342_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1343_cast_fp16 = softmax(axis = var_779, x = aw_81_cast_fp16)[name = tensor<string, []>("op_1343_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1344_cast_fp16 = softmax(axis = var_779, x = aw_83_cast_fp16)[name = tensor<string, []>("op_1344_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1345_cast_fp16 = softmax(axis = var_779, x = aw_85_cast_fp16)[name = tensor<string, []>("op_1345_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1346_cast_fp16 = softmax(axis = var_779, x = aw_87_cast_fp16)[name = tensor<string, []>("op_1346_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1347_cast_fp16 = softmax(axis = var_779, x = aw_89_cast_fp16)[name = tensor<string, []>("op_1347_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1348_cast_fp16 = softmax(axis = var_779, x = aw_91_cast_fp16)[name = tensor<string, []>("op_1348_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1349_cast_fp16 = softmax(axis = var_779, x = aw_93_cast_fp16)[name = tensor<string, []>("op_1349_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1350_cast_fp16 = softmax(axis = var_779, x = aw_cast_fp16)[name = tensor<string, []>("op_1350_cast_fp16")];
+            tensor<string, []> var_1352_equation_0 = const()[name = tensor<string, []>("op_1352_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1352_cast_fp16 = einsum(equation = var_1352_equation_0, values = (var_1112_cast_fp16, var_1327_cast_fp16))[name = tensor<string, []>("op_1352_cast_fp16")];
+            tensor<string, []> var_1354_equation_0 = const()[name = tensor<string, []>("op_1354_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1354_cast_fp16 = einsum(equation = var_1354_equation_0, values = (var_1112_cast_fp16, var_1328_cast_fp16))[name = tensor<string, []>("op_1354_cast_fp16")];
+            tensor<string, []> var_1356_equation_0 = const()[name = tensor<string, []>("op_1356_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1356_cast_fp16 = einsum(equation = var_1356_equation_0, values = (var_1112_cast_fp16, var_1329_cast_fp16))[name = tensor<string, []>("op_1356_cast_fp16")];
+            tensor<string, []> var_1358_equation_0 = const()[name = tensor<string, []>("op_1358_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1358_cast_fp16 = einsum(equation = var_1358_equation_0, values = (var_1124_cast_fp16, var_1330_cast_fp16))[name = tensor<string, []>("op_1358_cast_fp16")];
+            tensor<string, []> var_1360_equation_0 = const()[name = tensor<string, []>("op_1360_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1360_cast_fp16 = einsum(equation = var_1360_equation_0, values = (var_1124_cast_fp16, var_1331_cast_fp16))[name = tensor<string, []>("op_1360_cast_fp16")];
+            tensor<string, []> var_1362_equation_0 = const()[name = tensor<string, []>("op_1362_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1362_cast_fp16 = einsum(equation = var_1362_equation_0, values = (var_1124_cast_fp16, var_1332_cast_fp16))[name = tensor<string, []>("op_1362_cast_fp16")];
+            tensor<string, []> var_1364_equation_0 = const()[name = tensor<string, []>("op_1364_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1364_cast_fp16 = einsum(equation = var_1364_equation_0, values = (var_1136_cast_fp16, var_1333_cast_fp16))[name = tensor<string, []>("op_1364_cast_fp16")];
+            tensor<string, []> var_1366_equation_0 = const()[name = tensor<string, []>("op_1366_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1366_cast_fp16 = einsum(equation = var_1366_equation_0, values = (var_1136_cast_fp16, var_1334_cast_fp16))[name = tensor<string, []>("op_1366_cast_fp16")];
+            tensor<string, []> var_1368_equation_0 = const()[name = tensor<string, []>("op_1368_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1368_cast_fp16 = einsum(equation = var_1368_equation_0, values = (var_1136_cast_fp16, var_1335_cast_fp16))[name = tensor<string, []>("op_1368_cast_fp16")];
+            tensor<string, []> var_1370_equation_0 = const()[name = tensor<string, []>("op_1370_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1370_cast_fp16 = einsum(equation = var_1370_equation_0, values = (var_1148_cast_fp16, var_1336_cast_fp16))[name = tensor<string, []>("op_1370_cast_fp16")];
+            tensor<string, []> var_1372_equation_0 = const()[name = tensor<string, []>("op_1372_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1372_cast_fp16 = einsum(equation = var_1372_equation_0, values = (var_1148_cast_fp16, var_1337_cast_fp16))[name = tensor<string, []>("op_1372_cast_fp16")];
+            tensor<string, []> var_1374_equation_0 = const()[name = tensor<string, []>("op_1374_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1374_cast_fp16 = einsum(equation = var_1374_equation_0, values = (var_1148_cast_fp16, var_1338_cast_fp16))[name = tensor<string, []>("op_1374_cast_fp16")];
+            tensor<string, []> var_1376_equation_0 = const()[name = tensor<string, []>("op_1376_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1376_cast_fp16 = einsum(equation = var_1376_equation_0, values = (var_1160_cast_fp16, var_1339_cast_fp16))[name = tensor<string, []>("op_1376_cast_fp16")];
+            tensor<string, []> var_1378_equation_0 = const()[name = tensor<string, []>("op_1378_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1378_cast_fp16 = einsum(equation = var_1378_equation_0, values = (var_1160_cast_fp16, var_1340_cast_fp16))[name = tensor<string, []>("op_1378_cast_fp16")];
+            tensor<string, []> var_1380_equation_0 = const()[name = tensor<string, []>("op_1380_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1380_cast_fp16 = einsum(equation = var_1380_equation_0, values = (var_1160_cast_fp16, var_1341_cast_fp16))[name = tensor<string, []>("op_1380_cast_fp16")];
+            tensor<string, []> var_1382_equation_0 = const()[name = tensor<string, []>("op_1382_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1382_cast_fp16 = einsum(equation = var_1382_equation_0, values = (var_1172_cast_fp16, var_1342_cast_fp16))[name = tensor<string, []>("op_1382_cast_fp16")];
+            tensor<string, []> var_1384_equation_0 = const()[name = tensor<string, []>("op_1384_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1384_cast_fp16 = einsum(equation = var_1384_equation_0, values = (var_1172_cast_fp16, var_1343_cast_fp16))[name = tensor<string, []>("op_1384_cast_fp16")];
+            tensor<string, []> var_1386_equation_0 = const()[name = tensor<string, []>("op_1386_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1386_cast_fp16 = einsum(equation = var_1386_equation_0, values = (var_1172_cast_fp16, var_1344_cast_fp16))[name = tensor<string, []>("op_1386_cast_fp16")];
+            tensor<string, []> var_1388_equation_0 = const()[name = tensor<string, []>("op_1388_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1388_cast_fp16 = einsum(equation = var_1388_equation_0, values = (var_1184_cast_fp16, var_1345_cast_fp16))[name = tensor<string, []>("op_1388_cast_fp16")];
+            tensor<string, []> var_1390_equation_0 = const()[name = tensor<string, []>("op_1390_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1390_cast_fp16 = einsum(equation = var_1390_equation_0, values = (var_1184_cast_fp16, var_1346_cast_fp16))[name = tensor<string, []>("op_1390_cast_fp16")];
+            tensor<string, []> var_1392_equation_0 = const()[name = tensor<string, []>("op_1392_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1392_cast_fp16 = einsum(equation = var_1392_equation_0, values = (var_1184_cast_fp16, var_1347_cast_fp16))[name = tensor<string, []>("op_1392_cast_fp16")];
+            tensor<string, []> var_1394_equation_0 = const()[name = tensor<string, []>("op_1394_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1394_cast_fp16 = einsum(equation = var_1394_equation_0, values = (var_1196_cast_fp16, var_1348_cast_fp16))[name = tensor<string, []>("op_1394_cast_fp16")];
+            tensor<string, []> var_1396_equation_0 = const()[name = tensor<string, []>("op_1396_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1396_cast_fp16 = einsum(equation = var_1396_equation_0, values = (var_1196_cast_fp16, var_1349_cast_fp16))[name = tensor<string, []>("op_1396_cast_fp16")];
+            tensor<string, []> var_1398_equation_0 = const()[name = tensor<string, []>("op_1398_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1398_cast_fp16 = einsum(equation = var_1398_equation_0, values = (var_1196_cast_fp16, var_1350_cast_fp16))[name = tensor<string, []>("op_1398_cast_fp16")];
+            tensor<bool, []> x_27_interleave_0 = const()[name = tensor<string, []>("x_27_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_27_cast_fp16 = concat(axis = var_779, interleave = x_27_interleave_0, values = (var_1352_cast_fp16, var_1354_cast_fp16, var_1356_cast_fp16, var_1358_cast_fp16, var_1360_cast_fp16, var_1362_cast_fp16, var_1364_cast_fp16, var_1366_cast_fp16, var_1368_cast_fp16, var_1370_cast_fp16, var_1372_cast_fp16, var_1374_cast_fp16, var_1376_cast_fp16, var_1378_cast_fp16, var_1380_cast_fp16, var_1382_cast_fp16, var_1384_cast_fp16, var_1386_cast_fp16, var_1388_cast_fp16, var_1390_cast_fp16, var_1392_cast_fp16, var_1394_cast_fp16, var_1396_cast_fp16, var_1398_cast_fp16))[name = tensor<string, []>("x_27_cast_fp16")];
+            tensor<int32, [4]> var_1403 = const()[name = tensor<string, []>("op_1403"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_13_cast_fp16 = reshape(shape = var_1403, x = x_27_cast_fp16)[name = tensor<string, []>("input_13_cast_fp16")];
+            tensor<int32, [2]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1408 = const()[name = tensor<string, []>("op_1408"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_pad_type_0 = const()[name = tensor<string, []>("attention_output_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_pad_0 = const()[name = tensor<string, []>("attention_output_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(232803776)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_cast_fp16 = conv(dilations = var_1408, groups = var_779, pad = attention_output_pad_0, pad_type = attention_output_pad_type_0, strides = var_1406, weight = blocks_1_attn_proj_weight_to_fp16, x = input_13_cast_fp16)[name = tensor<string, []>("attention_output_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_29_cast_fp16 = add(x = attention_output_cast_fp16, y = x_17_cast_fp16)[name = tensor<string, []>("x_29_cast_fp16")];
+            tensor<bool, []> x_eps_interleave_0 = const()[name = tensor<string, []>("x_eps_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_to_fp16 = const()[name = tensor<string, []>("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678208)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_cast_fp16 = concat(axis = var_779, interleave = x_eps_interleave_0, values = (x_29_cast_fp16, eps_chan_to_fp16))[name = tensor<string, []>("x_eps_cast_fp16")];
+            tensor<int32, [1]> norm_x_axes_0 = const()[name = tensor<string, []>("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_782, x = x_eps_cast_fp16)[name = tensor<string, []>("norm_x_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_19_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_cast_fp16)[name = tensor<string, []>("x_normed_19_cast_fp16")];
+            tensor<fp16, []> var_1434_to_fp16 = const()[name = tensor<string, []>("op_1434_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_1434_to_fp16)[name = tensor<string, []>("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678400)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_15_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = tensor<string, []>("input_15_cast_fp16")];
+            tensor<int32, [2]> var_1445 = const()[name = tensor<string, []>("op_1445"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1447 = const()[name = tensor<string, []>("op_1447"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_17_pad_type_0 = const()[name = tensor<string, []>("input_17_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_17_pad_0 = const()[name = tensor<string, []>("input_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251684608)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_17_cast_fp16 = conv(dilations = var_1447, groups = var_779, pad = input_17_pad_0, pad_type = input_17_pad_type_0, strides = var_1445, weight = blocks_1_mlp_fc_1_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("input_17_cast_fp16")];
+            tensor<int32, [2]> var_1451 = const()[name = tensor<string, []>("op_1451"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1453 = const()[name = tensor<string, []>("op_1453"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_pad_0 = const()[name = tensor<string, []>("x_fc_2_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(302016320)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_cast_fp16 = conv(dilations = var_1453, groups = var_779, pad = x_fc_2_pad_0, pad_type = x_fc_2_pad_type_0, strides = var_1451, weight = blocks_1_mlp_fc_2_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_1456_cast_fp16 = silu(x = input_17_cast_fp16)[name = tensor<string, []>("op_1456_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_cast_fp16 = mul(x = var_1456_cast_fp16, y = x_fc_2_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
+            tensor<int32, [2]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1463_pad_type_0 = const()[name = tensor<string, []>("op_1463_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1463_pad_0 = const()[name = tensor<string, []>("op_1463_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_1_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(352348032)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_1463_cast_fp16 = conv(dilations = var_1461, groups = var_779, pad = var_1463_pad_0, pad_type = var_1463_pad_type_0, strides = var_1459, weight = blocks_1_mlp_proj_weight_to_fp16, x = input_cast_fp16)[name = tensor<string, []>("op_1463_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> new_x = add(x = var_1463_cast_fp16, y = x_29_cast_fp16)[name = tensor<string, []>("op_1464_cast_fp16")];
+        } -> (new_x, new_k_cache_0, new_v_cache_0, new_k_cache_1, new_v_cache_1);
+}
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk12.mlmodelc/weights/weight.bin b/Llama-3.2-3B-Instruct_chunk12.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..19e4bcbb29ffc6423e122439ad9ab823c1e478e2
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk12.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b78ea2d679f5c65ebc0a051180df02861df9dfad72fdbd6d7da795e6effcbd4
+size 402679744
diff --git a/Llama-3.2-3B-Instruct_chunk13.mlmodelc/analytics/coremldata.bin b/Llama-3.2-3B-Instruct_chunk13.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a63af39cde8e590e41fffd270ab8aede737490d
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk13.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf21e446e7587de3fd840eae95f3e79729298df568725552f7ef5fd8f954e58c
+size 243
diff --git a/Llama-3.2-3B-Instruct_chunk13.mlmodelc/coremldata.bin b/Llama-3.2-3B-Instruct_chunk13.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3fed05170d981b8582c9421ec7550f748512caf2
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk13.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55b45f96f9ba201e16f197a78412041f41d2ac869df9ad95ef03af7662e7d940
+size 653
diff --git a/Llama-3.2-3B-Instruct_chunk13.mlmodelc/metadata.json b/Llama-3.2-3B-Instruct_chunk13.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..042152024e0a7d922ca0457b0fcb16c0a03410bb
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk13.mlmodelc/metadata.json
@@ -0,0 +1,178 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "new_x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 7,
+    "mlProgramOperationTypeHistogram" : {
+      "Concat" : 14,
+      "Ios16.mul" : 70,
+      "SliceByIndex" : 88,
+      "Transpose" : 2,
+      "Ios16.einsum" : 96,
+      "Ios16.conv" : 14,
+      "Ios16.add" : 56,
+      "Ios16.realDiv" : 4,
+      "Ios16.softmax" : 48,
+      "Ios16.reduceL2Norm" : 4,
+      "Ios16.reshape" : 14,
+      "Ios16.silu" : 2
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "13.0",
+      "tvOS" : "16.0",
+      "visionOS" : "1.0",
+      "watchOS" : "9.0",
+      "iOS" : "16.0",
+      "macCatalyst" : "16.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.source" : "torch==2.1.0",
+      "com.github.apple.coremltools.version" : "8.0b1"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "cos",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "sin",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 512 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 512, 1, 64]",
+        "name" : "mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "Llama_3_2_3B_Instruct_2024_11_09_16_14_37_chunk13",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk13.mlmodelc/model.mil b/Llama-3.2-3B-Instruct_chunk13.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..78594b4291dc45ae43652f9a31200581b19ad3c6
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk13.mlmodelc/model.mil
@@ -0,0 +1,956 @@
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3304.5.2"}, {"coremlc-version", "3304.6.2"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
+{
+    func main<ios16>(tensor<fp16, [128, 64]> cos, tensor<fp16, [1, 448, 1, 1024]> k_cache_0, tensor<fp16, [1, 448, 1, 1024]> k_cache_1, tensor<fp16, [1, 512, 1, 64]> mask, tensor<fp16, [128, 64]> sin, tensor<fp16, [1, 1024, 1, 448]> v_cache_0, tensor<fp16, [1, 1024, 1, 448]> v_cache_1, tensor<fp16, [1, 3072, 8, 8]> x) [CoreML_InputDefaultValues = dict<tensor<string, []>, tensor<fp32, []>>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}})] {
+            tensor<int32, []> var_13 = const()[name = tensor<string, []>("op_13"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_19 = const()[name = tensor<string, []>("op_19"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_55 = const()[name = tensor<string, []>("op_55"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_1_interleave_0 = const()[name = tensor<string, []>("x_eps_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_1_to_fp16 = const()[name = tensor<string, []>("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_1_cast_fp16 = concat(axis = var_52, interleave = x_eps_1_interleave_0, values = (x, eps_chan_1_to_fp16))[name = tensor<string, []>("x_eps_1_cast_fp16")];
+            tensor<int32, [1]> norm_x_1_axes_0 = const()[name = tensor<string, []>("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_55, x = x_eps_1_cast_fp16)[name = tensor<string, []>("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = tensor<string, []>("x_normed_1_cast_fp16")];
+            tensor<fp16, []> var_79_to_fp16 = const()[name = tensor<string, []>("op_79_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_79_to_fp16)[name = tensor<string, []>("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(256)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = tensor<string, []>("x_5_cast_fp16")];
+            tensor<int32, [4]> var_100 = const()[name = tensor<string, []>("op_100"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_1_cast_fp16 = reshape(shape = var_100, x = x_5_cast_fp16)[name = tensor<string, []>("input_1_cast_fp16")];
+            tensor<int32, [2]> var_103 = const()[name = tensor<string, []>("op_103"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_105 = const()[name = tensor<string, []>("op_105"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_1_pad_type_0 = const()[name = tensor<string, []>("q_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_1_pad_0 = const()[name = tensor<string, []>("q_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6464)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_1_cast_fp16 = conv(dilations = var_105, groups = var_52, pad = q_1_pad_0, pad_type = q_1_pad_type_0, strides = var_103, weight = blocks_0_attn_q_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("q_1_cast_fp16")];
+            tensor<int32, [2]> var_109 = const()[name = tensor<string, []>("op_109"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_111 = const()[name = tensor<string, []>("op_111"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_1_pad_type_0 = const()[name = tensor<string, []>("k_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_1_pad_0 = const()[name = tensor<string, []>("k_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18880896)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_1_cast_fp16 = conv(dilations = var_111, groups = var_52, pad = k_1_pad_0, pad_type = k_1_pad_type_0, strides = var_109, weight = blocks_0_attn_k_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("k_1_cast_fp16")];
+            tensor<int32, [2]> var_115 = const()[name = tensor<string, []>("op_115"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_117 = const()[name = tensor<string, []>("op_117"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_1_pad_type_0 = const()[name = tensor<string, []>("v_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_1_pad_0 = const()[name = tensor<string, []>("v_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25172416)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_1_cast_fp16 = conv(dilations = var_117, groups = var_52, pad = v_1_pad_0, pad_type = v_1_pad_type_0, strides = var_115, weight = blocks_0_attn_v_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("v_1_cast_fp16")];
+            tensor<int32, [4]> var_120 = const()[name = tensor<string, []>("op_120"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_3_cast_fp16 = reshape(shape = var_120, x = q_1_cast_fp16)[name = tensor<string, []>("q_3_cast_fp16")];
+            tensor<int32, [4]> var_122 = const()[name = tensor<string, []>("op_122"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_3_cast_fp16 = reshape(shape = var_122, x = k_1_cast_fp16)[name = tensor<string, []>("k_3_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = tensor<string, []>("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = tensor<string, []>("op_136_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = tensor<string, []>("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = tensor<string, []>("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = tensor<string, []>("op_142_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = tensor<string, []>("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_142_cast_fp16")];
+            tensor<fp16, []> const_10_promoted_to_fp16 = const()[name = tensor<string, []>("const_10_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_10_promoted_to_fp16)[name = tensor<string, []>("op_144_cast_fp16")];
+            tensor<bool, []> rotated_1_interleave_0 = const()[name = tensor<string, []>("rotated_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_1_cast_fp16 = concat(axis = var_17, interleave = rotated_1_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = tensor<string, []>("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_147_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = tensor<string, []>("op_147_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_148_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = tensor<string, []>("op_148_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_1_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = tensor<string, []>("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_161_begin_0 = const()[name = tensor<string, []>("op_161_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_161_end_0 = const()[name = tensor<string, []>("op_161_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_161_end_mask_0 = const()[name = tensor<string, []>("op_161_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_161_cast_fp16 = slice_by_index(begin = var_161_begin_0, end = var_161_end_0, end_mask = var_161_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_161_cast_fp16")];
+            tensor<int32, [4]> var_167_begin_0 = const()[name = tensor<string, []>("op_167_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_167_end_0 = const()[name = tensor<string, []>("op_167_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_167_end_mask_0 = const()[name = tensor<string, []>("op_167_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_167_cast_fp16 = slice_by_index(begin = var_167_begin_0, end = var_167_end_0, end_mask = var_167_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_167_cast_fp16")];
+            tensor<fp16, []> const_12_promoted_to_fp16 = const()[name = tensor<string, []>("const_12_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_169_cast_fp16 = mul(x = var_167_cast_fp16, y = const_12_promoted_to_fp16)[name = tensor<string, []>("op_169_cast_fp16")];
+            tensor<bool, []> rotated_3_interleave_0 = const()[name = tensor<string, []>("rotated_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_3_cast_fp16 = concat(axis = var_17, interleave = rotated_3_interleave_0, values = (var_169_cast_fp16, var_161_cast_fp16))[name = tensor<string, []>("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_172_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = tensor<string, []>("op_172_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_173_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = tensor<string, []>("op_173_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_3_cast_fp16 = add(x = var_172_cast_fp16, y = var_173_cast_fp16)[name = tensor<string, []>("roped_3_cast_fp16")];
+            tensor<int32, [4]> var_176 = const()[name = tensor<string, []>("op_176"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_7_cast_fp16 = reshape(shape = var_176, x = roped_3_cast_fp16)[name = tensor<string, []>("k_7_cast_fp16")];
+            tensor<int32, [4]> var_178 = const()[name = tensor<string, []>("op_178"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_0 = reshape(shape = var_178, x = v_1_cast_fp16)[name = tensor<string, []>("new_v_cache_0_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_11_interleave_0 = const()[name = tensor<string, []>("k_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_0 = transpose(perm = k_9_perm_0, x = k_7_cast_fp16)[name = tensor<string, []>("transpose_1")];
+            tensor<fp16, [1, 512, 1, 1024]> k_11_cast_fp16 = concat(axis = var_19, interleave = k_11_interleave_0, values = (k_cache_0, new_k_cache_0))[name = tensor<string, []>("k_11_cast_fp16")];
+            tensor<bool, []> v_7_interleave_0 = const()[name = tensor<string, []>("v_7_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_7_cast_fp16 = concat(axis = var_13, interleave = v_7_interleave_0, values = (v_cache_0, new_v_cache_0))[name = tensor<string, []>("v_7_cast_fp16")];
+            tensor<int32, [4]> var_186 = const()[name = tensor<string, []>("op_186"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_7_cast_fp16 = reshape(shape = var_186, x = roped_1_cast_fp16)[name = tensor<string, []>("q_7_cast_fp16")];
+            tensor<int32, [4]> var_191_begin_0 = const()[name = tensor<string, []>("op_191_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_191_end_0 = const()[name = tensor<string, []>("op_191_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_191_end_mask_0 = const()[name = tensor<string, []>("op_191_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_191_cast_fp16 = slice_by_index(begin = var_191_begin_0, end = var_191_end_0, end_mask = var_191_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_191_cast_fp16")];
+            tensor<int32, [4]> var_195_begin_0 = const()[name = tensor<string, []>("op_195_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_195_end_0 = const()[name = tensor<string, []>("op_195_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_195_end_mask_0 = const()[name = tensor<string, []>("op_195_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_195_cast_fp16 = slice_by_index(begin = var_195_begin_0, end = var_195_end_0, end_mask = var_195_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_195_cast_fp16")];
+            tensor<int32, [4]> var_199_begin_0 = const()[name = tensor<string, []>("op_199_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_199_end_0 = const()[name = tensor<string, []>("op_199_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_199_end_mask_0 = const()[name = tensor<string, []>("op_199_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_199_cast_fp16 = slice_by_index(begin = var_199_begin_0, end = var_199_end_0, end_mask = var_199_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_199_cast_fp16")];
+            tensor<int32, [4]> var_203_begin_0 = const()[name = tensor<string, []>("op_203_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_203_end_0 = const()[name = tensor<string, []>("op_203_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_203_end_mask_0 = const()[name = tensor<string, []>("op_203_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_203_cast_fp16 = slice_by_index(begin = var_203_begin_0, end = var_203_end_0, end_mask = var_203_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_203_cast_fp16")];
+            tensor<int32, [4]> var_207_begin_0 = const()[name = tensor<string, []>("op_207_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_207_end_0 = const()[name = tensor<string, []>("op_207_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_207_end_mask_0 = const()[name = tensor<string, []>("op_207_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_207_cast_fp16 = slice_by_index(begin = var_207_begin_0, end = var_207_end_0, end_mask = var_207_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_207_cast_fp16")];
+            tensor<int32, [4]> var_211_begin_0 = const()[name = tensor<string, []>("op_211_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_211_end_0 = const()[name = tensor<string, []>("op_211_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_211_end_mask_0 = const()[name = tensor<string, []>("op_211_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_211_cast_fp16 = slice_by_index(begin = var_211_begin_0, end = var_211_end_0, end_mask = var_211_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_211_cast_fp16")];
+            tensor<int32, [4]> var_215_begin_0 = const()[name = tensor<string, []>("op_215_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_215_end_0 = const()[name = tensor<string, []>("op_215_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_215_end_mask_0 = const()[name = tensor<string, []>("op_215_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_215_cast_fp16 = slice_by_index(begin = var_215_begin_0, end = var_215_end_0, end_mask = var_215_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_215_cast_fp16")];
+            tensor<int32, [4]> var_219_begin_0 = const()[name = tensor<string, []>("op_219_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_219_end_0 = const()[name = tensor<string, []>("op_219_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_219_end_mask_0 = const()[name = tensor<string, []>("op_219_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_219_cast_fp16 = slice_by_index(begin = var_219_begin_0, end = var_219_end_0, end_mask = var_219_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_219_cast_fp16")];
+            tensor<int32, [4]> var_223_begin_0 = const()[name = tensor<string, []>("op_223_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_223_end_0 = const()[name = tensor<string, []>("op_223_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_223_end_mask_0 = const()[name = tensor<string, []>("op_223_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_223_cast_fp16 = slice_by_index(begin = var_223_begin_0, end = var_223_end_0, end_mask = var_223_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_223_cast_fp16")];
+            tensor<int32, [4]> var_227_begin_0 = const()[name = tensor<string, []>("op_227_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_227_end_0 = const()[name = tensor<string, []>("op_227_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_227_end_mask_0 = const()[name = tensor<string, []>("op_227_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_227_cast_fp16 = slice_by_index(begin = var_227_begin_0, end = var_227_end_0, end_mask = var_227_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_227_cast_fp16")];
+            tensor<int32, [4]> var_231_begin_0 = const()[name = tensor<string, []>("op_231_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_231_end_0 = const()[name = tensor<string, []>("op_231_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_231_end_mask_0 = const()[name = tensor<string, []>("op_231_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_231_cast_fp16 = slice_by_index(begin = var_231_begin_0, end = var_231_end_0, end_mask = var_231_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_231_cast_fp16")];
+            tensor<int32, [4]> var_235_begin_0 = const()[name = tensor<string, []>("op_235_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_235_end_0 = const()[name = tensor<string, []>("op_235_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_235_end_mask_0 = const()[name = tensor<string, []>("op_235_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_235_cast_fp16 = slice_by_index(begin = var_235_begin_0, end = var_235_end_0, end_mask = var_235_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_235_cast_fp16")];
+            tensor<int32, [4]> var_239_begin_0 = const()[name = tensor<string, []>("op_239_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_239_end_0 = const()[name = tensor<string, []>("op_239_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_239_end_mask_0 = const()[name = tensor<string, []>("op_239_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_239_cast_fp16 = slice_by_index(begin = var_239_begin_0, end = var_239_end_0, end_mask = var_239_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_239_cast_fp16")];
+            tensor<int32, [4]> var_243_begin_0 = const()[name = tensor<string, []>("op_243_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_243_end_0 = const()[name = tensor<string, []>("op_243_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_243_end_mask_0 = const()[name = tensor<string, []>("op_243_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_243_cast_fp16 = slice_by_index(begin = var_243_begin_0, end = var_243_end_0, end_mask = var_243_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_243_cast_fp16")];
+            tensor<int32, [4]> var_247_begin_0 = const()[name = tensor<string, []>("op_247_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_247_end_0 = const()[name = tensor<string, []>("op_247_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_247_end_mask_0 = const()[name = tensor<string, []>("op_247_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_247_cast_fp16 = slice_by_index(begin = var_247_begin_0, end = var_247_end_0, end_mask = var_247_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_247_cast_fp16")];
+            tensor<int32, [4]> var_251_begin_0 = const()[name = tensor<string, []>("op_251_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_251_end_0 = const()[name = tensor<string, []>("op_251_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_251_end_mask_0 = const()[name = tensor<string, []>("op_251_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_251_cast_fp16 = slice_by_index(begin = var_251_begin_0, end = var_251_end_0, end_mask = var_251_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_251_cast_fp16")];
+            tensor<int32, [4]> var_255_begin_0 = const()[name = tensor<string, []>("op_255_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_255_end_0 = const()[name = tensor<string, []>("op_255_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_255_end_mask_0 = const()[name = tensor<string, []>("op_255_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_255_cast_fp16 = slice_by_index(begin = var_255_begin_0, end = var_255_end_0, end_mask = var_255_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_255_cast_fp16")];
+            tensor<int32, [4]> var_259_begin_0 = const()[name = tensor<string, []>("op_259_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_259_end_0 = const()[name = tensor<string, []>("op_259_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_259_end_mask_0 = const()[name = tensor<string, []>("op_259_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_259_cast_fp16 = slice_by_index(begin = var_259_begin_0, end = var_259_end_0, end_mask = var_259_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_259_cast_fp16")];
+            tensor<int32, [4]> var_263_begin_0 = const()[name = tensor<string, []>("op_263_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_263_end_0 = const()[name = tensor<string, []>("op_263_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_263_end_mask_0 = const()[name = tensor<string, []>("op_263_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_263_cast_fp16 = slice_by_index(begin = var_263_begin_0, end = var_263_end_0, end_mask = var_263_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_263_cast_fp16")];
+            tensor<int32, [4]> var_267_begin_0 = const()[name = tensor<string, []>("op_267_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_267_end_0 = const()[name = tensor<string, []>("op_267_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_267_end_mask_0 = const()[name = tensor<string, []>("op_267_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_267_cast_fp16 = slice_by_index(begin = var_267_begin_0, end = var_267_end_0, end_mask = var_267_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_267_cast_fp16")];
+            tensor<int32, [4]> var_271_begin_0 = const()[name = tensor<string, []>("op_271_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_271_end_0 = const()[name = tensor<string, []>("op_271_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_271_end_mask_0 = const()[name = tensor<string, []>("op_271_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_271_cast_fp16 = slice_by_index(begin = var_271_begin_0, end = var_271_end_0, end_mask = var_271_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_271_cast_fp16")];
+            tensor<int32, [4]> var_275_begin_0 = const()[name = tensor<string, []>("op_275_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_275_end_0 = const()[name = tensor<string, []>("op_275_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_275_end_mask_0 = const()[name = tensor<string, []>("op_275_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_275_cast_fp16 = slice_by_index(begin = var_275_begin_0, end = var_275_end_0, end_mask = var_275_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_275_cast_fp16")];
+            tensor<int32, [4]> var_279_begin_0 = const()[name = tensor<string, []>("op_279_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_279_end_0 = const()[name = tensor<string, []>("op_279_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_279_end_mask_0 = const()[name = tensor<string, []>("op_279_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_279_cast_fp16 = slice_by_index(begin = var_279_begin_0, end = var_279_end_0, end_mask = var_279_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_279_cast_fp16")];
+            tensor<int32, [4]> var_283_begin_0 = const()[name = tensor<string, []>("op_283_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_283_end_0 = const()[name = tensor<string, []>("op_283_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_283_end_mask_0 = const()[name = tensor<string, []>("op_283_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_283_cast_fp16 = slice_by_index(begin = var_283_begin_0, end = var_283_end_0, end_mask = var_283_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_283_cast_fp16")];
+            tensor<int32, [4]> var_289_begin_0 = const()[name = tensor<string, []>("op_289_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_289_end_0 = const()[name = tensor<string, []>("op_289_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_289_end_mask_0 = const()[name = tensor<string, []>("op_289_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_289_cast_fp16 = slice_by_index(begin = var_289_begin_0, end = var_289_end_0, end_mask = var_289_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_289_cast_fp16")];
+            tensor<int32, [4]> var_301_begin_0 = const()[name = tensor<string, []>("op_301_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_301_end_0 = const()[name = tensor<string, []>("op_301_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_301_end_mask_0 = const()[name = tensor<string, []>("op_301_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_301_cast_fp16 = slice_by_index(begin = var_301_begin_0, end = var_301_end_0, end_mask = var_301_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_301_cast_fp16")];
+            tensor<int32, [4]> var_313_begin_0 = const()[name = tensor<string, []>("op_313_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_313_end_0 = const()[name = tensor<string, []>("op_313_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_313_end_mask_0 = const()[name = tensor<string, []>("op_313_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_313_cast_fp16 = slice_by_index(begin = var_313_begin_0, end = var_313_end_0, end_mask = var_313_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_313_cast_fp16")];
+            tensor<int32, [4]> var_325_begin_0 = const()[name = tensor<string, []>("op_325_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_325_end_0 = const()[name = tensor<string, []>("op_325_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_325_end_mask_0 = const()[name = tensor<string, []>("op_325_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_325_cast_fp16 = slice_by_index(begin = var_325_begin_0, end = var_325_end_0, end_mask = var_325_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_325_cast_fp16")];
+            tensor<int32, [4]> var_337_begin_0 = const()[name = tensor<string, []>("op_337_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_337_end_0 = const()[name = tensor<string, []>("op_337_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_337_end_mask_0 = const()[name = tensor<string, []>("op_337_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_337_cast_fp16 = slice_by_index(begin = var_337_begin_0, end = var_337_end_0, end_mask = var_337_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_337_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = tensor<string, []>("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = tensor<string, []>("op_349_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = tensor<string, []>("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_349_cast_fp16")];
+            tensor<int32, [4]> var_361_begin_0 = const()[name = tensor<string, []>("op_361_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_361_end_0 = const()[name = tensor<string, []>("op_361_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_361_end_mask_0 = const()[name = tensor<string, []>("op_361_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_361_cast_fp16 = slice_by_index(begin = var_361_begin_0, end = var_361_end_0, end_mask = var_361_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_361_cast_fp16")];
+            tensor<int32, [4]> var_373_begin_0 = const()[name = tensor<string, []>("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_373_end_0 = const()[name = tensor<string, []>("op_373_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_373_end_mask_0 = const()[name = tensor<string, []>("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_373_cast_fp16")];
+            tensor<int32, [4]> var_383_begin_0 = const()[name = tensor<string, []>("op_383_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_383_end_0 = const()[name = tensor<string, []>("op_383_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_383_end_mask_0 = const()[name = tensor<string, []>("op_383_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_383_cast_fp16 = slice_by_index(begin = var_383_begin_0, end = var_383_end_0, end_mask = var_383_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_383_cast_fp16")];
+            tensor<int32, [4]> var_395_begin_0 = const()[name = tensor<string, []>("op_395_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_395_end_0 = const()[name = tensor<string, []>("op_395_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_395_end_mask_0 = const()[name = tensor<string, []>("op_395_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_395_cast_fp16 = slice_by_index(begin = var_395_begin_0, end = var_395_end_0, end_mask = var_395_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_395_cast_fp16")];
+            tensor<int32, [4]> var_407_begin_0 = const()[name = tensor<string, []>("op_407_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_407_end_0 = const()[name = tensor<string, []>("op_407_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_407_end_mask_0 = const()[name = tensor<string, []>("op_407_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_407_cast_fp16 = slice_by_index(begin = var_407_begin_0, end = var_407_end_0, end_mask = var_407_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_407_cast_fp16")];
+            tensor<int32, [4]> var_419_begin_0 = const()[name = tensor<string, []>("op_419_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_419_end_0 = const()[name = tensor<string, []>("op_419_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_419_end_mask_0 = const()[name = tensor<string, []>("op_419_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_419_cast_fp16 = slice_by_index(begin = var_419_begin_0, end = var_419_end_0, end_mask = var_419_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_419_cast_fp16")];
+            tensor<int32, [4]> var_431_begin_0 = const()[name = tensor<string, []>("op_431_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_431_end_0 = const()[name = tensor<string, []>("op_431_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_431_end_mask_0 = const()[name = tensor<string, []>("op_431_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_431_cast_fp16 = slice_by_index(begin = var_431_begin_0, end = var_431_end_0, end_mask = var_431_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_431_cast_fp16")];
+            tensor<int32, [4]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_443_cast_fp16 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_443_cast_fp16")];
+            tensor<int32, [4]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_455_cast_fp16 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_455_cast_fp16")];
+            tensor<int32, [4]> var_467_begin_0 = const()[name = tensor<string, []>("op_467_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_467_end_0 = const()[name = tensor<string, []>("op_467_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_467_end_mask_0 = const()[name = tensor<string, []>("op_467_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_467_cast_fp16 = slice_by_index(begin = var_467_begin_0, end = var_467_end_0, end_mask = var_467_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_467_cast_fp16")];
+            tensor<string, []> var_479_equation_0 = const()[name = tensor<string, []>("op_479_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_479_cast_fp16 = einsum(equation = var_479_equation_0, values = (var_289_cast_fp16, var_191_cast_fp16))[name = tensor<string, []>("op_479_cast_fp16")];
+            tensor<fp16, []> var_480_to_fp16 = const()[name = tensor<string, []>("op_480_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_481_cast_fp16 = mul(x = var_479_cast_fp16, y = var_480_to_fp16)[name = tensor<string, []>("op_481_cast_fp16")];
+            tensor<string, []> var_483_equation_0 = const()[name = tensor<string, []>("op_483_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_483_cast_fp16 = einsum(equation = var_483_equation_0, values = (var_289_cast_fp16, var_195_cast_fp16))[name = tensor<string, []>("op_483_cast_fp16")];
+            tensor<fp16, []> var_484_to_fp16 = const()[name = tensor<string, []>("op_484_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_485_cast_fp16 = mul(x = var_483_cast_fp16, y = var_484_to_fp16)[name = tensor<string, []>("op_485_cast_fp16")];
+            tensor<string, []> var_487_equation_0 = const()[name = tensor<string, []>("op_487_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_487_cast_fp16 = einsum(equation = var_487_equation_0, values = (var_289_cast_fp16, var_199_cast_fp16))[name = tensor<string, []>("op_487_cast_fp16")];
+            tensor<fp16, []> var_488_to_fp16 = const()[name = tensor<string, []>("op_488_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_489_cast_fp16 = mul(x = var_487_cast_fp16, y = var_488_to_fp16)[name = tensor<string, []>("op_489_cast_fp16")];
+            tensor<string, []> var_491_equation_0 = const()[name = tensor<string, []>("op_491_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_491_cast_fp16 = einsum(equation = var_491_equation_0, values = (var_301_cast_fp16, var_203_cast_fp16))[name = tensor<string, []>("op_491_cast_fp16")];
+            tensor<fp16, []> var_492_to_fp16 = const()[name = tensor<string, []>("op_492_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_493_cast_fp16 = mul(x = var_491_cast_fp16, y = var_492_to_fp16)[name = tensor<string, []>("op_493_cast_fp16")];
+            tensor<string, []> var_495_equation_0 = const()[name = tensor<string, []>("op_495_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_495_cast_fp16 = einsum(equation = var_495_equation_0, values = (var_301_cast_fp16, var_207_cast_fp16))[name = tensor<string, []>("op_495_cast_fp16")];
+            tensor<fp16, []> var_496_to_fp16 = const()[name = tensor<string, []>("op_496_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_497_cast_fp16 = mul(x = var_495_cast_fp16, y = var_496_to_fp16)[name = tensor<string, []>("op_497_cast_fp16")];
+            tensor<string, []> var_499_equation_0 = const()[name = tensor<string, []>("op_499_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_499_cast_fp16 = einsum(equation = var_499_equation_0, values = (var_301_cast_fp16, var_211_cast_fp16))[name = tensor<string, []>("op_499_cast_fp16")];
+            tensor<fp16, []> var_500_to_fp16 = const()[name = tensor<string, []>("op_500_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_501_cast_fp16 = mul(x = var_499_cast_fp16, y = var_500_to_fp16)[name = tensor<string, []>("op_501_cast_fp16")];
+            tensor<string, []> var_503_equation_0 = const()[name = tensor<string, []>("op_503_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_503_cast_fp16 = einsum(equation = var_503_equation_0, values = (var_313_cast_fp16, var_215_cast_fp16))[name = tensor<string, []>("op_503_cast_fp16")];
+            tensor<fp16, []> var_504_to_fp16 = const()[name = tensor<string, []>("op_504_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_505_cast_fp16 = mul(x = var_503_cast_fp16, y = var_504_to_fp16)[name = tensor<string, []>("op_505_cast_fp16")];
+            tensor<string, []> var_507_equation_0 = const()[name = tensor<string, []>("op_507_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_507_cast_fp16 = einsum(equation = var_507_equation_0, values = (var_313_cast_fp16, var_219_cast_fp16))[name = tensor<string, []>("op_507_cast_fp16")];
+            tensor<fp16, []> var_508_to_fp16 = const()[name = tensor<string, []>("op_508_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_509_cast_fp16 = mul(x = var_507_cast_fp16, y = var_508_to_fp16)[name = tensor<string, []>("op_509_cast_fp16")];
+            tensor<string, []> var_511_equation_0 = const()[name = tensor<string, []>("op_511_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_511_cast_fp16 = einsum(equation = var_511_equation_0, values = (var_313_cast_fp16, var_223_cast_fp16))[name = tensor<string, []>("op_511_cast_fp16")];
+            tensor<fp16, []> var_512_to_fp16 = const()[name = tensor<string, []>("op_512_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_513_cast_fp16 = mul(x = var_511_cast_fp16, y = var_512_to_fp16)[name = tensor<string, []>("op_513_cast_fp16")];
+            tensor<string, []> var_515_equation_0 = const()[name = tensor<string, []>("op_515_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_515_cast_fp16 = einsum(equation = var_515_equation_0, values = (var_325_cast_fp16, var_227_cast_fp16))[name = tensor<string, []>("op_515_cast_fp16")];
+            tensor<fp16, []> var_516_to_fp16 = const()[name = tensor<string, []>("op_516_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_517_cast_fp16 = mul(x = var_515_cast_fp16, y = var_516_to_fp16)[name = tensor<string, []>("op_517_cast_fp16")];
+            tensor<string, []> var_519_equation_0 = const()[name = tensor<string, []>("op_519_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_519_cast_fp16 = einsum(equation = var_519_equation_0, values = (var_325_cast_fp16, var_231_cast_fp16))[name = tensor<string, []>("op_519_cast_fp16")];
+            tensor<fp16, []> var_520_to_fp16 = const()[name = tensor<string, []>("op_520_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_521_cast_fp16 = mul(x = var_519_cast_fp16, y = var_520_to_fp16)[name = tensor<string, []>("op_521_cast_fp16")];
+            tensor<string, []> var_523_equation_0 = const()[name = tensor<string, []>("op_523_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_523_cast_fp16 = einsum(equation = var_523_equation_0, values = (var_325_cast_fp16, var_235_cast_fp16))[name = tensor<string, []>("op_523_cast_fp16")];
+            tensor<fp16, []> var_524_to_fp16 = const()[name = tensor<string, []>("op_524_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_525_cast_fp16 = mul(x = var_523_cast_fp16, y = var_524_to_fp16)[name = tensor<string, []>("op_525_cast_fp16")];
+            tensor<string, []> var_527_equation_0 = const()[name = tensor<string, []>("op_527_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_527_cast_fp16 = einsum(equation = var_527_equation_0, values = (var_337_cast_fp16, var_239_cast_fp16))[name = tensor<string, []>("op_527_cast_fp16")];
+            tensor<fp16, []> var_528_to_fp16 = const()[name = tensor<string, []>("op_528_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_529_cast_fp16 = mul(x = var_527_cast_fp16, y = var_528_to_fp16)[name = tensor<string, []>("op_529_cast_fp16")];
+            tensor<string, []> var_531_equation_0 = const()[name = tensor<string, []>("op_531_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_531_cast_fp16 = einsum(equation = var_531_equation_0, values = (var_337_cast_fp16, var_243_cast_fp16))[name = tensor<string, []>("op_531_cast_fp16")];
+            tensor<fp16, []> var_532_to_fp16 = const()[name = tensor<string, []>("op_532_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_533_cast_fp16 = mul(x = var_531_cast_fp16, y = var_532_to_fp16)[name = tensor<string, []>("op_533_cast_fp16")];
+            tensor<string, []> var_535_equation_0 = const()[name = tensor<string, []>("op_535_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_535_cast_fp16 = einsum(equation = var_535_equation_0, values = (var_337_cast_fp16, var_247_cast_fp16))[name = tensor<string, []>("op_535_cast_fp16")];
+            tensor<fp16, []> var_536_to_fp16 = const()[name = tensor<string, []>("op_536_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_537_cast_fp16 = mul(x = var_535_cast_fp16, y = var_536_to_fp16)[name = tensor<string, []>("op_537_cast_fp16")];
+            tensor<string, []> var_539_equation_0 = const()[name = tensor<string, []>("op_539_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_539_cast_fp16 = einsum(equation = var_539_equation_0, values = (var_349_cast_fp16, var_251_cast_fp16))[name = tensor<string, []>("op_539_cast_fp16")];
+            tensor<fp16, []> var_540_to_fp16 = const()[name = tensor<string, []>("op_540_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_541_cast_fp16 = mul(x = var_539_cast_fp16, y = var_540_to_fp16)[name = tensor<string, []>("op_541_cast_fp16")];
+            tensor<string, []> var_543_equation_0 = const()[name = tensor<string, []>("op_543_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_543_cast_fp16 = einsum(equation = var_543_equation_0, values = (var_349_cast_fp16, var_255_cast_fp16))[name = tensor<string, []>("op_543_cast_fp16")];
+            tensor<fp16, []> var_544_to_fp16 = const()[name = tensor<string, []>("op_544_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_545_cast_fp16 = mul(x = var_543_cast_fp16, y = var_544_to_fp16)[name = tensor<string, []>("op_545_cast_fp16")];
+            tensor<string, []> var_547_equation_0 = const()[name = tensor<string, []>("op_547_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_547_cast_fp16 = einsum(equation = var_547_equation_0, values = (var_349_cast_fp16, var_259_cast_fp16))[name = tensor<string, []>("op_547_cast_fp16")];
+            tensor<fp16, []> var_548_to_fp16 = const()[name = tensor<string, []>("op_548_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_549_cast_fp16 = mul(x = var_547_cast_fp16, y = var_548_to_fp16)[name = tensor<string, []>("op_549_cast_fp16")];
+            tensor<string, []> var_551_equation_0 = const()[name = tensor<string, []>("op_551_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_551_cast_fp16 = einsum(equation = var_551_equation_0, values = (var_361_cast_fp16, var_263_cast_fp16))[name = tensor<string, []>("op_551_cast_fp16")];
+            tensor<fp16, []> var_552_to_fp16 = const()[name = tensor<string, []>("op_552_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_553_cast_fp16 = mul(x = var_551_cast_fp16, y = var_552_to_fp16)[name = tensor<string, []>("op_553_cast_fp16")];
+            tensor<string, []> var_555_equation_0 = const()[name = tensor<string, []>("op_555_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_555_cast_fp16 = einsum(equation = var_555_equation_0, values = (var_361_cast_fp16, var_267_cast_fp16))[name = tensor<string, []>("op_555_cast_fp16")];
+            tensor<fp16, []> var_556_to_fp16 = const()[name = tensor<string, []>("op_556_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_557_cast_fp16 = mul(x = var_555_cast_fp16, y = var_556_to_fp16)[name = tensor<string, []>("op_557_cast_fp16")];
+            tensor<string, []> var_559_equation_0 = const()[name = tensor<string, []>("op_559_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_559_cast_fp16 = einsum(equation = var_559_equation_0, values = (var_361_cast_fp16, var_271_cast_fp16))[name = tensor<string, []>("op_559_cast_fp16")];
+            tensor<fp16, []> var_560_to_fp16 = const()[name = tensor<string, []>("op_560_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_561_cast_fp16 = mul(x = var_559_cast_fp16, y = var_560_to_fp16)[name = tensor<string, []>("op_561_cast_fp16")];
+            tensor<string, []> var_563_equation_0 = const()[name = tensor<string, []>("op_563_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_563_cast_fp16 = einsum(equation = var_563_equation_0, values = (var_373_cast_fp16, var_275_cast_fp16))[name = tensor<string, []>("op_563_cast_fp16")];
+            tensor<fp16, []> var_564_to_fp16 = const()[name = tensor<string, []>("op_564_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_565_cast_fp16 = mul(x = var_563_cast_fp16, y = var_564_to_fp16)[name = tensor<string, []>("op_565_cast_fp16")];
+            tensor<string, []> var_567_equation_0 = const()[name = tensor<string, []>("op_567_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_567_cast_fp16 = einsum(equation = var_567_equation_0, values = (var_373_cast_fp16, var_279_cast_fp16))[name = tensor<string, []>("op_567_cast_fp16")];
+            tensor<fp16, []> var_568_to_fp16 = const()[name = tensor<string, []>("op_568_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_569_cast_fp16 = mul(x = var_567_cast_fp16, y = var_568_to_fp16)[name = tensor<string, []>("op_569_cast_fp16")];
+            tensor<string, []> var_571_equation_0 = const()[name = tensor<string, []>("op_571_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_571_cast_fp16 = einsum(equation = var_571_equation_0, values = (var_373_cast_fp16, var_283_cast_fp16))[name = tensor<string, []>("op_571_cast_fp16")];
+            tensor<fp16, []> var_572_to_fp16 = const()[name = tensor<string, []>("op_572_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_573_cast_fp16 = mul(x = var_571_cast_fp16, y = var_572_to_fp16)[name = tensor<string, []>("op_573_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_1_cast_fp16 = add(x = var_481_cast_fp16, y = mask)[name = tensor<string, []>("aw_1_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_3_cast_fp16 = add(x = var_485_cast_fp16, y = mask)[name = tensor<string, []>("aw_3_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_5_cast_fp16 = add(x = var_489_cast_fp16, y = mask)[name = tensor<string, []>("aw_5_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_7_cast_fp16 = add(x = var_493_cast_fp16, y = mask)[name = tensor<string, []>("aw_7_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_9_cast_fp16 = add(x = var_497_cast_fp16, y = mask)[name = tensor<string, []>("aw_9_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_11_cast_fp16 = add(x = var_501_cast_fp16, y = mask)[name = tensor<string, []>("aw_11_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_13_cast_fp16 = add(x = var_505_cast_fp16, y = mask)[name = tensor<string, []>("aw_13_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_15_cast_fp16 = add(x = var_509_cast_fp16, y = mask)[name = tensor<string, []>("aw_15_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_17_cast_fp16 = add(x = var_513_cast_fp16, y = mask)[name = tensor<string, []>("aw_17_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_19_cast_fp16 = add(x = var_517_cast_fp16, y = mask)[name = tensor<string, []>("aw_19_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_21_cast_fp16 = add(x = var_521_cast_fp16, y = mask)[name = tensor<string, []>("aw_21_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_23_cast_fp16 = add(x = var_525_cast_fp16, y = mask)[name = tensor<string, []>("aw_23_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_25_cast_fp16 = add(x = var_529_cast_fp16, y = mask)[name = tensor<string, []>("aw_25_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_27_cast_fp16 = add(x = var_533_cast_fp16, y = mask)[name = tensor<string, []>("aw_27_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_29_cast_fp16 = add(x = var_537_cast_fp16, y = mask)[name = tensor<string, []>("aw_29_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_31_cast_fp16 = add(x = var_541_cast_fp16, y = mask)[name = tensor<string, []>("aw_31_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_33_cast_fp16 = add(x = var_545_cast_fp16, y = mask)[name = tensor<string, []>("aw_33_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_35_cast_fp16 = add(x = var_549_cast_fp16, y = mask)[name = tensor<string, []>("aw_35_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_37_cast_fp16 = add(x = var_553_cast_fp16, y = mask)[name = tensor<string, []>("aw_37_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_39_cast_fp16 = add(x = var_557_cast_fp16, y = mask)[name = tensor<string, []>("aw_39_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_41_cast_fp16 = add(x = var_561_cast_fp16, y = mask)[name = tensor<string, []>("aw_41_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_43_cast_fp16 = add(x = var_565_cast_fp16, y = mask)[name = tensor<string, []>("aw_43_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_45_cast_fp16 = add(x = var_569_cast_fp16, y = mask)[name = tensor<string, []>("aw_45_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_47_cast_fp16 = add(x = var_573_cast_fp16, y = mask)[name = tensor<string, []>("aw_47_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_598_cast_fp16 = softmax(axis = var_52, x = aw_1_cast_fp16)[name = tensor<string, []>("op_598_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_599_cast_fp16 = softmax(axis = var_52, x = aw_3_cast_fp16)[name = tensor<string, []>("op_599_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_600_cast_fp16 = softmax(axis = var_52, x = aw_5_cast_fp16)[name = tensor<string, []>("op_600_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_601_cast_fp16 = softmax(axis = var_52, x = aw_7_cast_fp16)[name = tensor<string, []>("op_601_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_602_cast_fp16 = softmax(axis = var_52, x = aw_9_cast_fp16)[name = tensor<string, []>("op_602_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_603_cast_fp16 = softmax(axis = var_52, x = aw_11_cast_fp16)[name = tensor<string, []>("op_603_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_604_cast_fp16 = softmax(axis = var_52, x = aw_13_cast_fp16)[name = tensor<string, []>("op_604_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_605_cast_fp16 = softmax(axis = var_52, x = aw_15_cast_fp16)[name = tensor<string, []>("op_605_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_606_cast_fp16 = softmax(axis = var_52, x = aw_17_cast_fp16)[name = tensor<string, []>("op_606_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_607_cast_fp16 = softmax(axis = var_52, x = aw_19_cast_fp16)[name = tensor<string, []>("op_607_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_608_cast_fp16 = softmax(axis = var_52, x = aw_21_cast_fp16)[name = tensor<string, []>("op_608_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_609_cast_fp16 = softmax(axis = var_52, x = aw_23_cast_fp16)[name = tensor<string, []>("op_609_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_610_cast_fp16 = softmax(axis = var_52, x = aw_25_cast_fp16)[name = tensor<string, []>("op_610_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_611_cast_fp16 = softmax(axis = var_52, x = aw_27_cast_fp16)[name = tensor<string, []>("op_611_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_612_cast_fp16 = softmax(axis = var_52, x = aw_29_cast_fp16)[name = tensor<string, []>("op_612_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_613_cast_fp16 = softmax(axis = var_52, x = aw_31_cast_fp16)[name = tensor<string, []>("op_613_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_614_cast_fp16 = softmax(axis = var_52, x = aw_33_cast_fp16)[name = tensor<string, []>("op_614_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_615_cast_fp16 = softmax(axis = var_52, x = aw_35_cast_fp16)[name = tensor<string, []>("op_615_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_616_cast_fp16 = softmax(axis = var_52, x = aw_37_cast_fp16)[name = tensor<string, []>("op_616_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_617_cast_fp16 = softmax(axis = var_52, x = aw_39_cast_fp16)[name = tensor<string, []>("op_617_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_618_cast_fp16 = softmax(axis = var_52, x = aw_41_cast_fp16)[name = tensor<string, []>("op_618_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_619_cast_fp16 = softmax(axis = var_52, x = aw_43_cast_fp16)[name = tensor<string, []>("op_619_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_620_cast_fp16 = softmax(axis = var_52, x = aw_45_cast_fp16)[name = tensor<string, []>("op_620_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_621_cast_fp16 = softmax(axis = var_52, x = aw_47_cast_fp16)[name = tensor<string, []>("op_621_cast_fp16")];
+            tensor<string, []> var_623_equation_0 = const()[name = tensor<string, []>("op_623_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_623_cast_fp16 = einsum(equation = var_623_equation_0, values = (var_383_cast_fp16, var_598_cast_fp16))[name = tensor<string, []>("op_623_cast_fp16")];
+            tensor<string, []> var_625_equation_0 = const()[name = tensor<string, []>("op_625_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_625_cast_fp16 = einsum(equation = var_625_equation_0, values = (var_383_cast_fp16, var_599_cast_fp16))[name = tensor<string, []>("op_625_cast_fp16")];
+            tensor<string, []> var_627_equation_0 = const()[name = tensor<string, []>("op_627_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_627_cast_fp16 = einsum(equation = var_627_equation_0, values = (var_383_cast_fp16, var_600_cast_fp16))[name = tensor<string, []>("op_627_cast_fp16")];
+            tensor<string, []> var_629_equation_0 = const()[name = tensor<string, []>("op_629_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_629_cast_fp16 = einsum(equation = var_629_equation_0, values = (var_395_cast_fp16, var_601_cast_fp16))[name = tensor<string, []>("op_629_cast_fp16")];
+            tensor<string, []> var_631_equation_0 = const()[name = tensor<string, []>("op_631_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_631_cast_fp16 = einsum(equation = var_631_equation_0, values = (var_395_cast_fp16, var_602_cast_fp16))[name = tensor<string, []>("op_631_cast_fp16")];
+            tensor<string, []> var_633_equation_0 = const()[name = tensor<string, []>("op_633_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_633_cast_fp16 = einsum(equation = var_633_equation_0, values = (var_395_cast_fp16, var_603_cast_fp16))[name = tensor<string, []>("op_633_cast_fp16")];
+            tensor<string, []> var_635_equation_0 = const()[name = tensor<string, []>("op_635_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_635_cast_fp16 = einsum(equation = var_635_equation_0, values = (var_407_cast_fp16, var_604_cast_fp16))[name = tensor<string, []>("op_635_cast_fp16")];
+            tensor<string, []> var_637_equation_0 = const()[name = tensor<string, []>("op_637_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_637_cast_fp16 = einsum(equation = var_637_equation_0, values = (var_407_cast_fp16, var_605_cast_fp16))[name = tensor<string, []>("op_637_cast_fp16")];
+            tensor<string, []> var_639_equation_0 = const()[name = tensor<string, []>("op_639_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_639_cast_fp16 = einsum(equation = var_639_equation_0, values = (var_407_cast_fp16, var_606_cast_fp16))[name = tensor<string, []>("op_639_cast_fp16")];
+            tensor<string, []> var_641_equation_0 = const()[name = tensor<string, []>("op_641_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_641_cast_fp16 = einsum(equation = var_641_equation_0, values = (var_419_cast_fp16, var_607_cast_fp16))[name = tensor<string, []>("op_641_cast_fp16")];
+            tensor<string, []> var_643_equation_0 = const()[name = tensor<string, []>("op_643_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_643_cast_fp16 = einsum(equation = var_643_equation_0, values = (var_419_cast_fp16, var_608_cast_fp16))[name = tensor<string, []>("op_643_cast_fp16")];
+            tensor<string, []> var_645_equation_0 = const()[name = tensor<string, []>("op_645_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_645_cast_fp16 = einsum(equation = var_645_equation_0, values = (var_419_cast_fp16, var_609_cast_fp16))[name = tensor<string, []>("op_645_cast_fp16")];
+            tensor<string, []> var_647_equation_0 = const()[name = tensor<string, []>("op_647_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_647_cast_fp16 = einsum(equation = var_647_equation_0, values = (var_431_cast_fp16, var_610_cast_fp16))[name = tensor<string, []>("op_647_cast_fp16")];
+            tensor<string, []> var_649_equation_0 = const()[name = tensor<string, []>("op_649_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_649_cast_fp16 = einsum(equation = var_649_equation_0, values = (var_431_cast_fp16, var_611_cast_fp16))[name = tensor<string, []>("op_649_cast_fp16")];
+            tensor<string, []> var_651_equation_0 = const()[name = tensor<string, []>("op_651_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_651_cast_fp16 = einsum(equation = var_651_equation_0, values = (var_431_cast_fp16, var_612_cast_fp16))[name = tensor<string, []>("op_651_cast_fp16")];
+            tensor<string, []> var_653_equation_0 = const()[name = tensor<string, []>("op_653_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_653_cast_fp16 = einsum(equation = var_653_equation_0, values = (var_443_cast_fp16, var_613_cast_fp16))[name = tensor<string, []>("op_653_cast_fp16")];
+            tensor<string, []> var_655_equation_0 = const()[name = tensor<string, []>("op_655_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_655_cast_fp16 = einsum(equation = var_655_equation_0, values = (var_443_cast_fp16, var_614_cast_fp16))[name = tensor<string, []>("op_655_cast_fp16")];
+            tensor<string, []> var_657_equation_0 = const()[name = tensor<string, []>("op_657_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_657_cast_fp16 = einsum(equation = var_657_equation_0, values = (var_443_cast_fp16, var_615_cast_fp16))[name = tensor<string, []>("op_657_cast_fp16")];
+            tensor<string, []> var_659_equation_0 = const()[name = tensor<string, []>("op_659_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_659_cast_fp16 = einsum(equation = var_659_equation_0, values = (var_455_cast_fp16, var_616_cast_fp16))[name = tensor<string, []>("op_659_cast_fp16")];
+            tensor<string, []> var_661_equation_0 = const()[name = tensor<string, []>("op_661_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_661_cast_fp16 = einsum(equation = var_661_equation_0, values = (var_455_cast_fp16, var_617_cast_fp16))[name = tensor<string, []>("op_661_cast_fp16")];
+            tensor<string, []> var_663_equation_0 = const()[name = tensor<string, []>("op_663_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_663_cast_fp16 = einsum(equation = var_663_equation_0, values = (var_455_cast_fp16, var_618_cast_fp16))[name = tensor<string, []>("op_663_cast_fp16")];
+            tensor<string, []> var_665_equation_0 = const()[name = tensor<string, []>("op_665_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_665_cast_fp16 = einsum(equation = var_665_equation_0, values = (var_467_cast_fp16, var_619_cast_fp16))[name = tensor<string, []>("op_665_cast_fp16")];
+            tensor<string, []> var_667_equation_0 = const()[name = tensor<string, []>("op_667_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_667_cast_fp16 = einsum(equation = var_667_equation_0, values = (var_467_cast_fp16, var_620_cast_fp16))[name = tensor<string, []>("op_667_cast_fp16")];
+            tensor<string, []> var_669_equation_0 = const()[name = tensor<string, []>("op_669_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_669_cast_fp16 = einsum(equation = var_669_equation_0, values = (var_467_cast_fp16, var_621_cast_fp16))[name = tensor<string, []>("op_669_cast_fp16")];
+            tensor<bool, []> x_11_interleave_0 = const()[name = tensor<string, []>("x_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_11_cast_fp16 = concat(axis = var_52, interleave = x_11_interleave_0, values = (var_623_cast_fp16, var_625_cast_fp16, var_627_cast_fp16, var_629_cast_fp16, var_631_cast_fp16, var_633_cast_fp16, var_635_cast_fp16, var_637_cast_fp16, var_639_cast_fp16, var_641_cast_fp16, var_643_cast_fp16, var_645_cast_fp16, var_647_cast_fp16, var_649_cast_fp16, var_651_cast_fp16, var_653_cast_fp16, var_655_cast_fp16, var_657_cast_fp16, var_659_cast_fp16, var_661_cast_fp16, var_663_cast_fp16, var_665_cast_fp16, var_667_cast_fp16, var_669_cast_fp16))[name = tensor<string, []>("x_11_cast_fp16")];
+            tensor<int32, [4]> var_674 = const()[name = tensor<string, []>("op_674"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_3_cast_fp16 = reshape(shape = var_674, x = x_11_cast_fp16)[name = tensor<string, []>("input_3_cast_fp16")];
+            tensor<int32, [2]> var_677 = const()[name = tensor<string, []>("op_677"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_679 = const()[name = tensor<string, []>("op_679"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_1_pad_type_0 = const()[name = tensor<string, []>("attention_output_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_1_pad_0 = const()[name = tensor<string, []>("attention_output_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31463936)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_1_cast_fp16 = conv(dilations = var_679, groups = var_52, pad = attention_output_1_pad_0, pad_type = attention_output_1_pad_type_0, strides = var_677, weight = blocks_0_attn_proj_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_13_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = tensor<string, []>("x_13_cast_fp16")];
+            tensor<bool, []> x_eps_3_interleave_0 = const()[name = tensor<string, []>("x_eps_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_3_to_fp16 = const()[name = tensor<string, []>("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338368)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_3_cast_fp16 = concat(axis = var_52, interleave = x_eps_3_interleave_0, values = (x_13_cast_fp16, eps_chan_3_to_fp16))[name = tensor<string, []>("x_eps_3_cast_fp16")];
+            tensor<int32, [1]> norm_x_3_axes_0 = const()[name = tensor<string, []>("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_55, x = x_eps_3_cast_fp16)[name = tensor<string, []>("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_7_cast_fp16 = real_div(x = x_13_cast_fp16, y = norm_x_3_cast_fp16)[name = tensor<string, []>("x_normed_7_cast_fp16")];
+            tensor<fp16, []> var_705_to_fp16 = const()[name = tensor<string, []>("op_705_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_705_to_fp16)[name = tensor<string, []>("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338560)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_5_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
+            tensor<int32, [2]> var_716 = const()[name = tensor<string, []>("op_716"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_718 = const()[name = tensor<string, []>("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_7_pad_type_0 = const()[name = tensor<string, []>("input_7_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_7_pad_0 = const()[name = tensor<string, []>("input_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50344768)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_7_cast_fp16 = conv(dilations = var_718, groups = var_52, pad = input_7_pad_0, pad_type = input_7_pad_type_0, strides = var_716, weight = blocks_0_mlp_fc_1_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
+            tensor<int32, [2]> var_722 = const()[name = tensor<string, []>("op_722"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_724 = const()[name = tensor<string, []>("op_724"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_1_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_1_pad_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(100676480)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_1_cast_fp16 = conv(dilations = var_724, groups = var_52, pad = x_fc_2_1_pad_0, pad_type = x_fc_2_1_pad_type_0, strides = var_722, weight = blocks_0_mlp_fc_2_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_727_cast_fp16 = silu(x = input_7_cast_fp16)[name = tensor<string, []>("op_727_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_9_cast_fp16 = mul(x = var_727_cast_fp16, y = x_fc_2_1_cast_fp16)[name = tensor<string, []>("input_9_cast_fp16")];
+            tensor<int32, [2]> var_730 = const()[name = tensor<string, []>("op_730"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_732 = const()[name = tensor<string, []>("op_732"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_734_pad_type_0 = const()[name = tensor<string, []>("op_734_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_734_pad_0 = const()[name = tensor<string, []>("op_734_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_0_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151008192)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_734_cast_fp16 = conv(dilations = var_732, groups = var_52, pad = var_734_pad_0, pad_type = var_734_pad_type_0, strides = var_730, weight = blocks_0_mlp_proj_weight_to_fp16, x = input_9_cast_fp16)[name = tensor<string, []>("op_734_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_17_cast_fp16 = add(x = var_734_cast_fp16, y = x_13_cast_fp16)[name = tensor<string, []>("x_17_cast_fp16")];
+            tensor<int32, []> var_740 = const()[name = tensor<string, []>("op_740"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_744 = const()[name = tensor<string, []>("op_744"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_746 = const()[name = tensor<string, []>("op_746"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_779 = const()[name = tensor<string, []>("op_779"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_782 = const()[name = tensor<string, []>("op_782"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_5_interleave_0 = const()[name = tensor<string, []>("x_eps_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_5_to_fp16 = const()[name = tensor<string, []>("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201339904)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_5_cast_fp16 = concat(axis = var_779, interleave = x_eps_5_interleave_0, values = (x_17_cast_fp16, eps_chan_5_to_fp16))[name = tensor<string, []>("x_eps_5_cast_fp16")];
+            tensor<int32, [1]> norm_x_5_axes_0 = const()[name = tensor<string, []>("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_782, x = x_eps_5_cast_fp16)[name = tensor<string, []>("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_13_cast_fp16 = real_div(x = x_17_cast_fp16, y = norm_x_5_cast_fp16)[name = tensor<string, []>("x_normed_13_cast_fp16")];
+            tensor<fp16, []> var_805_to_fp16 = const()[name = tensor<string, []>("op_805_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_805_to_fp16)[name = tensor<string, []>("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201340096)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_21_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = tensor<string, []>("x_21_cast_fp16")];
+            tensor<int32, [4]> var_829 = const()[name = tensor<string, []>("op_829"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_11_cast_fp16 = reshape(shape = var_829, x = x_21_cast_fp16)[name = tensor<string, []>("input_11_cast_fp16")];
+            tensor<int32, [2]> var_832 = const()[name = tensor<string, []>("op_832"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_834 = const()[name = tensor<string, []>("op_834"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_9_pad_type_0 = const()[name = tensor<string, []>("q_9_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_9_pad_0 = const()[name = tensor<string, []>("q_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201346304)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_9_cast_fp16 = conv(dilations = var_834, groups = var_779, pad = q_9_pad_0, pad_type = q_9_pad_type_0, strides = var_832, weight = blocks_1_attn_q_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("q_9_cast_fp16")];
+            tensor<int32, [2]> var_838 = const()[name = tensor<string, []>("op_838"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_840 = const()[name = tensor<string, []>("op_840"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_13_pad_type_0 = const()[name = tensor<string, []>("k_13_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_13_pad_0 = const()[name = tensor<string, []>("k_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(220220736)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_13_cast_fp16 = conv(dilations = var_840, groups = var_779, pad = k_13_pad_0, pad_type = k_13_pad_type_0, strides = var_838, weight = blocks_1_attn_k_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("k_13_cast_fp16")];
+            tensor<int32, [2]> var_844 = const()[name = tensor<string, []>("op_844"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_846 = const()[name = tensor<string, []>("op_846"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_11_pad_type_0 = const()[name = tensor<string, []>("v_11_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_11_pad_0 = const()[name = tensor<string, []>("v_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226512256)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_11_cast_fp16 = conv(dilations = var_846, groups = var_779, pad = v_11_pad_0, pad_type = v_11_pad_type_0, strides = var_844, weight = blocks_1_attn_v_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("v_11_cast_fp16")];
+            tensor<int32, [4]> var_849 = const()[name = tensor<string, []>("op_849"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_11_cast_fp16 = reshape(shape = var_849, x = q_9_cast_fp16)[name = tensor<string, []>("q_11_cast_fp16")];
+            tensor<int32, [4]> var_851 = const()[name = tensor<string, []>("op_851"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_15_cast_fp16 = reshape(shape = var_851, x = k_13_cast_fp16)[name = tensor<string, []>("k_15_cast_fp16")];
+            tensor<int32, [4]> var_865_begin_0 = const()[name = tensor<string, []>("op_865_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_865_end_0 = const()[name = tensor<string, []>("op_865_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_865_end_mask_0 = const()[name = tensor<string, []>("op_865_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_865_cast_fp16 = slice_by_index(begin = var_865_begin_0, end = var_865_end_0, end_mask = var_865_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_865_cast_fp16")];
+            tensor<int32, [4]> var_871_begin_0 = const()[name = tensor<string, []>("op_871_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_871_end_0 = const()[name = tensor<string, []>("op_871_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_871_end_mask_0 = const()[name = tensor<string, []>("op_871_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_871_cast_fp16 = slice_by_index(begin = var_871_begin_0, end = var_871_end_0, end_mask = var_871_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_871_cast_fp16")];
+            tensor<fp16, []> const_30_promoted_to_fp16 = const()[name = tensor<string, []>("const_30_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_873_cast_fp16 = mul(x = var_871_cast_fp16, y = const_30_promoted_to_fp16)[name = tensor<string, []>("op_873_cast_fp16")];
+            tensor<bool, []> rotated_5_interleave_0 = const()[name = tensor<string, []>("rotated_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_5_cast_fp16 = concat(axis = var_744, interleave = rotated_5_interleave_0, values = (var_873_cast_fp16, var_865_cast_fp16))[name = tensor<string, []>("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_876_cast_fp16 = mul(x = q_11_cast_fp16, y = cos)[name = tensor<string, []>("op_876_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_877_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = tensor<string, []>("op_877_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_5_cast_fp16 = add(x = var_876_cast_fp16, y = var_877_cast_fp16)[name = tensor<string, []>("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_890_begin_0 = const()[name = tensor<string, []>("op_890_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_890_end_0 = const()[name = tensor<string, []>("op_890_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_890_end_mask_0 = const()[name = tensor<string, []>("op_890_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_890_cast_fp16 = slice_by_index(begin = var_890_begin_0, end = var_890_end_0, end_mask = var_890_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_890_cast_fp16")];
+            tensor<int32, [4]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_896_cast_fp16 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_896_cast_fp16")];
+            tensor<fp16, []> const_32_promoted_to_fp16 = const()[name = tensor<string, []>("const_32_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_898_cast_fp16 = mul(x = var_896_cast_fp16, y = const_32_promoted_to_fp16)[name = tensor<string, []>("op_898_cast_fp16")];
+            tensor<bool, []> rotated_interleave_0 = const()[name = tensor<string, []>("rotated_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_cast_fp16 = concat(axis = var_744, interleave = rotated_interleave_0, values = (var_898_cast_fp16, var_890_cast_fp16))[name = tensor<string, []>("rotated_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_901_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = tensor<string, []>("op_901_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_902_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = tensor<string, []>("op_902_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_cast_fp16 = add(x = var_901_cast_fp16, y = var_902_cast_fp16)[name = tensor<string, []>("roped_cast_fp16")];
+            tensor<int32, [4]> var_905 = const()[name = tensor<string, []>("op_905"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_19_cast_fp16 = reshape(shape = var_905, x = roped_cast_fp16)[name = tensor<string, []>("k_19_cast_fp16")];
+            tensor<int32, [4]> var_907 = const()[name = tensor<string, []>("op_907"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_1 = reshape(shape = var_907, x = v_11_cast_fp16)[name = tensor<string, []>("new_v_cache_1_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_interleave_0 = const()[name = tensor<string, []>("k_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_1 = transpose(perm = k_21_perm_0, x = k_19_cast_fp16)[name = tensor<string, []>("transpose_0")];
+            tensor<fp16, [1, 512, 1, 1024]> k_cast_fp16 = concat(axis = var_746, interleave = k_interleave_0, values = (k_cache_1, new_k_cache_1))[name = tensor<string, []>("k_cast_fp16")];
+            tensor<bool, []> v_17_interleave_0 = const()[name = tensor<string, []>("v_17_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_17_cast_fp16 = concat(axis = var_740, interleave = v_17_interleave_0, values = (v_cache_1, new_v_cache_1))[name = tensor<string, []>("v_17_cast_fp16")];
+            tensor<int32, [4]> var_915 = const()[name = tensor<string, []>("op_915"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_cast_fp16 = reshape(shape = var_915, x = roped_5_cast_fp16)[name = tensor<string, []>("q_cast_fp16")];
+            tensor<int32, [4]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_920_cast_fp16 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_920_cast_fp16")];
+            tensor<int32, [4]> var_924_begin_0 = const()[name = tensor<string, []>("op_924_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_924_end_0 = const()[name = tensor<string, []>("op_924_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_924_end_mask_0 = const()[name = tensor<string, []>("op_924_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_924_cast_fp16 = slice_by_index(begin = var_924_begin_0, end = var_924_end_0, end_mask = var_924_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_924_cast_fp16")];
+            tensor<int32, [4]> var_928_begin_0 = const()[name = tensor<string, []>("op_928_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_928_end_0 = const()[name = tensor<string, []>("op_928_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_928_end_mask_0 = const()[name = tensor<string, []>("op_928_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_928_cast_fp16 = slice_by_index(begin = var_928_begin_0, end = var_928_end_0, end_mask = var_928_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_928_cast_fp16")];
+            tensor<int32, [4]> var_932_begin_0 = const()[name = tensor<string, []>("op_932_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_932_end_0 = const()[name = tensor<string, []>("op_932_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_932_end_mask_0 = const()[name = tensor<string, []>("op_932_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_932_cast_fp16 = slice_by_index(begin = var_932_begin_0, end = var_932_end_0, end_mask = var_932_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_932_cast_fp16")];
+            tensor<int32, [4]> var_936_begin_0 = const()[name = tensor<string, []>("op_936_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_936_end_0 = const()[name = tensor<string, []>("op_936_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_936_end_mask_0 = const()[name = tensor<string, []>("op_936_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_936_cast_fp16 = slice_by_index(begin = var_936_begin_0, end = var_936_end_0, end_mask = var_936_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_936_cast_fp16")];
+            tensor<int32, [4]> var_940_begin_0 = const()[name = tensor<string, []>("op_940_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_940_end_0 = const()[name = tensor<string, []>("op_940_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_940_end_mask_0 = const()[name = tensor<string, []>("op_940_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_940_cast_fp16 = slice_by_index(begin = var_940_begin_0, end = var_940_end_0, end_mask = var_940_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_940_cast_fp16")];
+            tensor<int32, [4]> var_944_begin_0 = const()[name = tensor<string, []>("op_944_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_944_end_0 = const()[name = tensor<string, []>("op_944_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_944_end_mask_0 = const()[name = tensor<string, []>("op_944_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_944_cast_fp16 = slice_by_index(begin = var_944_begin_0, end = var_944_end_0, end_mask = var_944_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_944_cast_fp16")];
+            tensor<int32, [4]> var_948_begin_0 = const()[name = tensor<string, []>("op_948_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_948_end_0 = const()[name = tensor<string, []>("op_948_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_948_end_mask_0 = const()[name = tensor<string, []>("op_948_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_948_cast_fp16 = slice_by_index(begin = var_948_begin_0, end = var_948_end_0, end_mask = var_948_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_948_cast_fp16")];
+            tensor<int32, [4]> var_952_begin_0 = const()[name = tensor<string, []>("op_952_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_952_end_0 = const()[name = tensor<string, []>("op_952_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_952_end_mask_0 = const()[name = tensor<string, []>("op_952_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_952_cast_fp16 = slice_by_index(begin = var_952_begin_0, end = var_952_end_0, end_mask = var_952_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_952_cast_fp16")];
+            tensor<int32, [4]> var_956_begin_0 = const()[name = tensor<string, []>("op_956_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_956_end_0 = const()[name = tensor<string, []>("op_956_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_956_end_mask_0 = const()[name = tensor<string, []>("op_956_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_956_cast_fp16 = slice_by_index(begin = var_956_begin_0, end = var_956_end_0, end_mask = var_956_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_956_cast_fp16")];
+            tensor<int32, [4]> var_960_begin_0 = const()[name = tensor<string, []>("op_960_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_960_end_0 = const()[name = tensor<string, []>("op_960_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_960_end_mask_0 = const()[name = tensor<string, []>("op_960_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_960_cast_fp16 = slice_by_index(begin = var_960_begin_0, end = var_960_end_0, end_mask = var_960_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_960_cast_fp16")];
+            tensor<int32, [4]> var_964_begin_0 = const()[name = tensor<string, []>("op_964_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_964_end_0 = const()[name = tensor<string, []>("op_964_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_964_end_mask_0 = const()[name = tensor<string, []>("op_964_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_964_cast_fp16 = slice_by_index(begin = var_964_begin_0, end = var_964_end_0, end_mask = var_964_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_964_cast_fp16")];
+            tensor<int32, [4]> var_968_begin_0 = const()[name = tensor<string, []>("op_968_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_968_end_0 = const()[name = tensor<string, []>("op_968_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_968_end_mask_0 = const()[name = tensor<string, []>("op_968_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_968_cast_fp16 = slice_by_index(begin = var_968_begin_0, end = var_968_end_0, end_mask = var_968_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_968_cast_fp16")];
+            tensor<int32, [4]> var_972_begin_0 = const()[name = tensor<string, []>("op_972_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_972_end_0 = const()[name = tensor<string, []>("op_972_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_972_end_mask_0 = const()[name = tensor<string, []>("op_972_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_972_cast_fp16 = slice_by_index(begin = var_972_begin_0, end = var_972_end_0, end_mask = var_972_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_972_cast_fp16")];
+            tensor<int32, [4]> var_976_begin_0 = const()[name = tensor<string, []>("op_976_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_976_end_0 = const()[name = tensor<string, []>("op_976_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_976_end_mask_0 = const()[name = tensor<string, []>("op_976_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_976_cast_fp16 = slice_by_index(begin = var_976_begin_0, end = var_976_end_0, end_mask = var_976_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_976_cast_fp16")];
+            tensor<int32, [4]> var_980_begin_0 = const()[name = tensor<string, []>("op_980_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_980_end_0 = const()[name = tensor<string, []>("op_980_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_980_end_mask_0 = const()[name = tensor<string, []>("op_980_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_980_cast_fp16 = slice_by_index(begin = var_980_begin_0, end = var_980_end_0, end_mask = var_980_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_980_cast_fp16")];
+            tensor<int32, [4]> var_984_begin_0 = const()[name = tensor<string, []>("op_984_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_984_end_0 = const()[name = tensor<string, []>("op_984_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_984_end_mask_0 = const()[name = tensor<string, []>("op_984_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_984_cast_fp16 = slice_by_index(begin = var_984_begin_0, end = var_984_end_0, end_mask = var_984_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_984_cast_fp16")];
+            tensor<int32, [4]> var_988_begin_0 = const()[name = tensor<string, []>("op_988_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_988_end_0 = const()[name = tensor<string, []>("op_988_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_988_end_mask_0 = const()[name = tensor<string, []>("op_988_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_988_cast_fp16 = slice_by_index(begin = var_988_begin_0, end = var_988_end_0, end_mask = var_988_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_988_cast_fp16")];
+            tensor<int32, [4]> var_992_begin_0 = const()[name = tensor<string, []>("op_992_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_992_end_0 = const()[name = tensor<string, []>("op_992_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_992_end_mask_0 = const()[name = tensor<string, []>("op_992_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_992_cast_fp16 = slice_by_index(begin = var_992_begin_0, end = var_992_end_0, end_mask = var_992_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_992_cast_fp16")];
+            tensor<int32, [4]> var_996_begin_0 = const()[name = tensor<string, []>("op_996_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_996_end_0 = const()[name = tensor<string, []>("op_996_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_996_end_mask_0 = const()[name = tensor<string, []>("op_996_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_996_cast_fp16 = slice_by_index(begin = var_996_begin_0, end = var_996_end_0, end_mask = var_996_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_996_cast_fp16")];
+            tensor<int32, [4]> var_1000_begin_0 = const()[name = tensor<string, []>("op_1000_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_1000_end_0 = const()[name = tensor<string, []>("op_1000_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_1000_end_mask_0 = const()[name = tensor<string, []>("op_1000_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1000_cast_fp16 = slice_by_index(begin = var_1000_begin_0, end = var_1000_end_0, end_mask = var_1000_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1000_cast_fp16")];
+            tensor<int32, [4]> var_1004_begin_0 = const()[name = tensor<string, []>("op_1004_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_1004_end_0 = const()[name = tensor<string, []>("op_1004_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_1004_end_mask_0 = const()[name = tensor<string, []>("op_1004_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1004_cast_fp16 = slice_by_index(begin = var_1004_begin_0, end = var_1004_end_0, end_mask = var_1004_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1004_cast_fp16")];
+            tensor<int32, [4]> var_1008_begin_0 = const()[name = tensor<string, []>("op_1008_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_1008_end_0 = const()[name = tensor<string, []>("op_1008_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_1008_end_mask_0 = const()[name = tensor<string, []>("op_1008_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1008_cast_fp16 = slice_by_index(begin = var_1008_begin_0, end = var_1008_end_0, end_mask = var_1008_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1008_cast_fp16")];
+            tensor<int32, [4]> var_1012_begin_0 = const()[name = tensor<string, []>("op_1012_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_1012_end_0 = const()[name = tensor<string, []>("op_1012_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_1012_end_mask_0 = const()[name = tensor<string, []>("op_1012_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1012_cast_fp16 = slice_by_index(begin = var_1012_begin_0, end = var_1012_end_0, end_mask = var_1012_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1012_cast_fp16")];
+            tensor<int32, [4]> var_1018_begin_0 = const()[name = tensor<string, []>("op_1018_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1018_end_0 = const()[name = tensor<string, []>("op_1018_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_1018_end_mask_0 = const()[name = tensor<string, []>("op_1018_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1018_cast_fp16 = slice_by_index(begin = var_1018_begin_0, end = var_1018_end_0, end_mask = var_1018_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1018_cast_fp16")];
+            tensor<int32, [4]> var_1030_begin_0 = const()[name = tensor<string, []>("op_1030_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_1030_end_0 = const()[name = tensor<string, []>("op_1030_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_1030_end_mask_0 = const()[name = tensor<string, []>("op_1030_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1030_cast_fp16 = slice_by_index(begin = var_1030_begin_0, end = var_1030_end_0, end_mask = var_1030_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1030_cast_fp16")];
+            tensor<int32, [4]> var_1042_begin_0 = const()[name = tensor<string, []>("op_1042_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_1042_end_0 = const()[name = tensor<string, []>("op_1042_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_1042_end_mask_0 = const()[name = tensor<string, []>("op_1042_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1042_cast_fp16 = slice_by_index(begin = var_1042_begin_0, end = var_1042_end_0, end_mask = var_1042_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1042_cast_fp16")];
+            tensor<int32, [4]> var_1054_begin_0 = const()[name = tensor<string, []>("op_1054_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_1054_end_0 = const()[name = tensor<string, []>("op_1054_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1054_end_mask_0 = const()[name = tensor<string, []>("op_1054_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1054_cast_fp16 = slice_by_index(begin = var_1054_begin_0, end = var_1054_end_0, end_mask = var_1054_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1054_cast_fp16")];
+            tensor<int32, [4]> var_1066_begin_0 = const()[name = tensor<string, []>("op_1066_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_1066_end_0 = const()[name = tensor<string, []>("op_1066_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_1066_end_mask_0 = const()[name = tensor<string, []>("op_1066_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1066_cast_fp16 = slice_by_index(begin = var_1066_begin_0, end = var_1066_end_0, end_mask = var_1066_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1066_cast_fp16")];
+            tensor<int32, [4]> var_1078_begin_0 = const()[name = tensor<string, []>("op_1078_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_1078_end_0 = const()[name = tensor<string, []>("op_1078_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_1078_end_mask_0 = const()[name = tensor<string, []>("op_1078_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1078_cast_fp16 = slice_by_index(begin = var_1078_begin_0, end = var_1078_end_0, end_mask = var_1078_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1078_cast_fp16")];
+            tensor<int32, [4]> var_1090_begin_0 = const()[name = tensor<string, []>("op_1090_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_1090_end_0 = const()[name = tensor<string, []>("op_1090_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_1090_end_mask_0 = const()[name = tensor<string, []>("op_1090_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1090_cast_fp16 = slice_by_index(begin = var_1090_begin_0, end = var_1090_end_0, end_mask = var_1090_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1090_cast_fp16")];
+            tensor<int32, [4]> var_1102_begin_0 = const()[name = tensor<string, []>("op_1102_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_1102_end_0 = const()[name = tensor<string, []>("op_1102_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_1102_end_mask_0 = const()[name = tensor<string, []>("op_1102_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1102_cast_fp16 = slice_by_index(begin = var_1102_begin_0, end = var_1102_end_0, end_mask = var_1102_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1102_cast_fp16")];
+            tensor<int32, [4]> var_1112_begin_0 = const()[name = tensor<string, []>("op_1112_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1112_end_0 = const()[name = tensor<string, []>("op_1112_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_1112_end_mask_0 = const()[name = tensor<string, []>("op_1112_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1112_cast_fp16 = slice_by_index(begin = var_1112_begin_0, end = var_1112_end_0, end_mask = var_1112_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1112_cast_fp16")];
+            tensor<int32, [4]> var_1124_begin_0 = const()[name = tensor<string, []>("op_1124_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_1124_end_0 = const()[name = tensor<string, []>("op_1124_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_1124_end_mask_0 = const()[name = tensor<string, []>("op_1124_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1124_cast_fp16 = slice_by_index(begin = var_1124_begin_0, end = var_1124_end_0, end_mask = var_1124_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1124_cast_fp16")];
+            tensor<int32, [4]> var_1136_begin_0 = const()[name = tensor<string, []>("op_1136_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_1136_end_0 = const()[name = tensor<string, []>("op_1136_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_1136_end_mask_0 = const()[name = tensor<string, []>("op_1136_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1136_cast_fp16 = slice_by_index(begin = var_1136_begin_0, end = var_1136_end_0, end_mask = var_1136_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1136_cast_fp16")];
+            tensor<int32, [4]> var_1148_begin_0 = const()[name = tensor<string, []>("op_1148_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_1148_end_0 = const()[name = tensor<string, []>("op_1148_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1148_end_mask_0 = const()[name = tensor<string, []>("op_1148_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1148_cast_fp16 = slice_by_index(begin = var_1148_begin_0, end = var_1148_end_0, end_mask = var_1148_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1148_cast_fp16")];
+            tensor<int32, [4]> var_1160_begin_0 = const()[name = tensor<string, []>("op_1160_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_1160_end_0 = const()[name = tensor<string, []>("op_1160_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_1160_end_mask_0 = const()[name = tensor<string, []>("op_1160_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1160_cast_fp16 = slice_by_index(begin = var_1160_begin_0, end = var_1160_end_0, end_mask = var_1160_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1160_cast_fp16")];
+            tensor<int32, [4]> var_1172_begin_0 = const()[name = tensor<string, []>("op_1172_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_1172_end_0 = const()[name = tensor<string, []>("op_1172_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_1172_end_mask_0 = const()[name = tensor<string, []>("op_1172_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1172_cast_fp16 = slice_by_index(begin = var_1172_begin_0, end = var_1172_end_0, end_mask = var_1172_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1172_cast_fp16")];
+            tensor<int32, [4]> var_1184_begin_0 = const()[name = tensor<string, []>("op_1184_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_1184_end_0 = const()[name = tensor<string, []>("op_1184_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_1184_end_mask_0 = const()[name = tensor<string, []>("op_1184_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1184_cast_fp16 = slice_by_index(begin = var_1184_begin_0, end = var_1184_end_0, end_mask = var_1184_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1184_cast_fp16")];
+            tensor<int32, [4]> var_1196_begin_0 = const()[name = tensor<string, []>("op_1196_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_1196_end_0 = const()[name = tensor<string, []>("op_1196_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_1196_end_mask_0 = const()[name = tensor<string, []>("op_1196_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1196_cast_fp16 = slice_by_index(begin = var_1196_begin_0, end = var_1196_end_0, end_mask = var_1196_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1196_cast_fp16")];
+            tensor<string, []> var_1208_equation_0 = const()[name = tensor<string, []>("op_1208_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1208_cast_fp16 = einsum(equation = var_1208_equation_0, values = (var_1018_cast_fp16, var_920_cast_fp16))[name = tensor<string, []>("op_1208_cast_fp16")];
+            tensor<fp16, []> var_1209_to_fp16 = const()[name = tensor<string, []>("op_1209_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1210_cast_fp16 = mul(x = var_1208_cast_fp16, y = var_1209_to_fp16)[name = tensor<string, []>("op_1210_cast_fp16")];
+            tensor<string, []> var_1212_equation_0 = const()[name = tensor<string, []>("op_1212_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1212_cast_fp16 = einsum(equation = var_1212_equation_0, values = (var_1018_cast_fp16, var_924_cast_fp16))[name = tensor<string, []>("op_1212_cast_fp16")];
+            tensor<fp16, []> var_1213_to_fp16 = const()[name = tensor<string, []>("op_1213_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1214_cast_fp16 = mul(x = var_1212_cast_fp16, y = var_1213_to_fp16)[name = tensor<string, []>("op_1214_cast_fp16")];
+            tensor<string, []> var_1216_equation_0 = const()[name = tensor<string, []>("op_1216_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1216_cast_fp16 = einsum(equation = var_1216_equation_0, values = (var_1018_cast_fp16, var_928_cast_fp16))[name = tensor<string, []>("op_1216_cast_fp16")];
+            tensor<fp16, []> var_1217_to_fp16 = const()[name = tensor<string, []>("op_1217_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1218_cast_fp16 = mul(x = var_1216_cast_fp16, y = var_1217_to_fp16)[name = tensor<string, []>("op_1218_cast_fp16")];
+            tensor<string, []> var_1220_equation_0 = const()[name = tensor<string, []>("op_1220_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1220_cast_fp16 = einsum(equation = var_1220_equation_0, values = (var_1030_cast_fp16, var_932_cast_fp16))[name = tensor<string, []>("op_1220_cast_fp16")];
+            tensor<fp16, []> var_1221_to_fp16 = const()[name = tensor<string, []>("op_1221_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1222_cast_fp16 = mul(x = var_1220_cast_fp16, y = var_1221_to_fp16)[name = tensor<string, []>("op_1222_cast_fp16")];
+            tensor<string, []> var_1224_equation_0 = const()[name = tensor<string, []>("op_1224_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1224_cast_fp16 = einsum(equation = var_1224_equation_0, values = (var_1030_cast_fp16, var_936_cast_fp16))[name = tensor<string, []>("op_1224_cast_fp16")];
+            tensor<fp16, []> var_1225_to_fp16 = const()[name = tensor<string, []>("op_1225_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1226_cast_fp16 = mul(x = var_1224_cast_fp16, y = var_1225_to_fp16)[name = tensor<string, []>("op_1226_cast_fp16")];
+            tensor<string, []> var_1228_equation_0 = const()[name = tensor<string, []>("op_1228_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1228_cast_fp16 = einsum(equation = var_1228_equation_0, values = (var_1030_cast_fp16, var_940_cast_fp16))[name = tensor<string, []>("op_1228_cast_fp16")];
+            tensor<fp16, []> var_1229_to_fp16 = const()[name = tensor<string, []>("op_1229_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1230_cast_fp16 = mul(x = var_1228_cast_fp16, y = var_1229_to_fp16)[name = tensor<string, []>("op_1230_cast_fp16")];
+            tensor<string, []> var_1232_equation_0 = const()[name = tensor<string, []>("op_1232_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1232_cast_fp16 = einsum(equation = var_1232_equation_0, values = (var_1042_cast_fp16, var_944_cast_fp16))[name = tensor<string, []>("op_1232_cast_fp16")];
+            tensor<fp16, []> var_1233_to_fp16 = const()[name = tensor<string, []>("op_1233_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1234_cast_fp16 = mul(x = var_1232_cast_fp16, y = var_1233_to_fp16)[name = tensor<string, []>("op_1234_cast_fp16")];
+            tensor<string, []> var_1236_equation_0 = const()[name = tensor<string, []>("op_1236_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1236_cast_fp16 = einsum(equation = var_1236_equation_0, values = (var_1042_cast_fp16, var_948_cast_fp16))[name = tensor<string, []>("op_1236_cast_fp16")];
+            tensor<fp16, []> var_1237_to_fp16 = const()[name = tensor<string, []>("op_1237_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1238_cast_fp16 = mul(x = var_1236_cast_fp16, y = var_1237_to_fp16)[name = tensor<string, []>("op_1238_cast_fp16")];
+            tensor<string, []> var_1240_equation_0 = const()[name = tensor<string, []>("op_1240_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1240_cast_fp16 = einsum(equation = var_1240_equation_0, values = (var_1042_cast_fp16, var_952_cast_fp16))[name = tensor<string, []>("op_1240_cast_fp16")];
+            tensor<fp16, []> var_1241_to_fp16 = const()[name = tensor<string, []>("op_1241_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1242_cast_fp16 = mul(x = var_1240_cast_fp16, y = var_1241_to_fp16)[name = tensor<string, []>("op_1242_cast_fp16")];
+            tensor<string, []> var_1244_equation_0 = const()[name = tensor<string, []>("op_1244_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1244_cast_fp16 = einsum(equation = var_1244_equation_0, values = (var_1054_cast_fp16, var_956_cast_fp16))[name = tensor<string, []>("op_1244_cast_fp16")];
+            tensor<fp16, []> var_1245_to_fp16 = const()[name = tensor<string, []>("op_1245_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1246_cast_fp16 = mul(x = var_1244_cast_fp16, y = var_1245_to_fp16)[name = tensor<string, []>("op_1246_cast_fp16")];
+            tensor<string, []> var_1248_equation_0 = const()[name = tensor<string, []>("op_1248_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1248_cast_fp16 = einsum(equation = var_1248_equation_0, values = (var_1054_cast_fp16, var_960_cast_fp16))[name = tensor<string, []>("op_1248_cast_fp16")];
+            tensor<fp16, []> var_1249_to_fp16 = const()[name = tensor<string, []>("op_1249_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1250_cast_fp16 = mul(x = var_1248_cast_fp16, y = var_1249_to_fp16)[name = tensor<string, []>("op_1250_cast_fp16")];
+            tensor<string, []> var_1252_equation_0 = const()[name = tensor<string, []>("op_1252_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1252_cast_fp16 = einsum(equation = var_1252_equation_0, values = (var_1054_cast_fp16, var_964_cast_fp16))[name = tensor<string, []>("op_1252_cast_fp16")];
+            tensor<fp16, []> var_1253_to_fp16 = const()[name = tensor<string, []>("op_1253_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1254_cast_fp16 = mul(x = var_1252_cast_fp16, y = var_1253_to_fp16)[name = tensor<string, []>("op_1254_cast_fp16")];
+            tensor<string, []> var_1256_equation_0 = const()[name = tensor<string, []>("op_1256_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1256_cast_fp16 = einsum(equation = var_1256_equation_0, values = (var_1066_cast_fp16, var_968_cast_fp16))[name = tensor<string, []>("op_1256_cast_fp16")];
+            tensor<fp16, []> var_1257_to_fp16 = const()[name = tensor<string, []>("op_1257_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1258_cast_fp16 = mul(x = var_1256_cast_fp16, y = var_1257_to_fp16)[name = tensor<string, []>("op_1258_cast_fp16")];
+            tensor<string, []> var_1260_equation_0 = const()[name = tensor<string, []>("op_1260_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1260_cast_fp16 = einsum(equation = var_1260_equation_0, values = (var_1066_cast_fp16, var_972_cast_fp16))[name = tensor<string, []>("op_1260_cast_fp16")];
+            tensor<fp16, []> var_1261_to_fp16 = const()[name = tensor<string, []>("op_1261_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1262_cast_fp16 = mul(x = var_1260_cast_fp16, y = var_1261_to_fp16)[name = tensor<string, []>("op_1262_cast_fp16")];
+            tensor<string, []> var_1264_equation_0 = const()[name = tensor<string, []>("op_1264_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1264_cast_fp16 = einsum(equation = var_1264_equation_0, values = (var_1066_cast_fp16, var_976_cast_fp16))[name = tensor<string, []>("op_1264_cast_fp16")];
+            tensor<fp16, []> var_1265_to_fp16 = const()[name = tensor<string, []>("op_1265_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1266_cast_fp16 = mul(x = var_1264_cast_fp16, y = var_1265_to_fp16)[name = tensor<string, []>("op_1266_cast_fp16")];
+            tensor<string, []> var_1268_equation_0 = const()[name = tensor<string, []>("op_1268_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1268_cast_fp16 = einsum(equation = var_1268_equation_0, values = (var_1078_cast_fp16, var_980_cast_fp16))[name = tensor<string, []>("op_1268_cast_fp16")];
+            tensor<fp16, []> var_1269_to_fp16 = const()[name = tensor<string, []>("op_1269_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1270_cast_fp16 = mul(x = var_1268_cast_fp16, y = var_1269_to_fp16)[name = tensor<string, []>("op_1270_cast_fp16")];
+            tensor<string, []> var_1272_equation_0 = const()[name = tensor<string, []>("op_1272_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1272_cast_fp16 = einsum(equation = var_1272_equation_0, values = (var_1078_cast_fp16, var_984_cast_fp16))[name = tensor<string, []>("op_1272_cast_fp16")];
+            tensor<fp16, []> var_1273_to_fp16 = const()[name = tensor<string, []>("op_1273_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1274_cast_fp16 = mul(x = var_1272_cast_fp16, y = var_1273_to_fp16)[name = tensor<string, []>("op_1274_cast_fp16")];
+            tensor<string, []> var_1276_equation_0 = const()[name = tensor<string, []>("op_1276_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1276_cast_fp16 = einsum(equation = var_1276_equation_0, values = (var_1078_cast_fp16, var_988_cast_fp16))[name = tensor<string, []>("op_1276_cast_fp16")];
+            tensor<fp16, []> var_1277_to_fp16 = const()[name = tensor<string, []>("op_1277_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1278_cast_fp16 = mul(x = var_1276_cast_fp16, y = var_1277_to_fp16)[name = tensor<string, []>("op_1278_cast_fp16")];
+            tensor<string, []> var_1280_equation_0 = const()[name = tensor<string, []>("op_1280_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1280_cast_fp16 = einsum(equation = var_1280_equation_0, values = (var_1090_cast_fp16, var_992_cast_fp16))[name = tensor<string, []>("op_1280_cast_fp16")];
+            tensor<fp16, []> var_1281_to_fp16 = const()[name = tensor<string, []>("op_1281_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1282_cast_fp16 = mul(x = var_1280_cast_fp16, y = var_1281_to_fp16)[name = tensor<string, []>("op_1282_cast_fp16")];
+            tensor<string, []> var_1284_equation_0 = const()[name = tensor<string, []>("op_1284_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1284_cast_fp16 = einsum(equation = var_1284_equation_0, values = (var_1090_cast_fp16, var_996_cast_fp16))[name = tensor<string, []>("op_1284_cast_fp16")];
+            tensor<fp16, []> var_1285_to_fp16 = const()[name = tensor<string, []>("op_1285_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1286_cast_fp16 = mul(x = var_1284_cast_fp16, y = var_1285_to_fp16)[name = tensor<string, []>("op_1286_cast_fp16")];
+            tensor<string, []> var_1288_equation_0 = const()[name = tensor<string, []>("op_1288_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1288_cast_fp16 = einsum(equation = var_1288_equation_0, values = (var_1090_cast_fp16, var_1000_cast_fp16))[name = tensor<string, []>("op_1288_cast_fp16")];
+            tensor<fp16, []> var_1289_to_fp16 = const()[name = tensor<string, []>("op_1289_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1290_cast_fp16 = mul(x = var_1288_cast_fp16, y = var_1289_to_fp16)[name = tensor<string, []>("op_1290_cast_fp16")];
+            tensor<string, []> var_1292_equation_0 = const()[name = tensor<string, []>("op_1292_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1292_cast_fp16 = einsum(equation = var_1292_equation_0, values = (var_1102_cast_fp16, var_1004_cast_fp16))[name = tensor<string, []>("op_1292_cast_fp16")];
+            tensor<fp16, []> var_1293_to_fp16 = const()[name = tensor<string, []>("op_1293_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1294_cast_fp16 = mul(x = var_1292_cast_fp16, y = var_1293_to_fp16)[name = tensor<string, []>("op_1294_cast_fp16")];
+            tensor<string, []> var_1296_equation_0 = const()[name = tensor<string, []>("op_1296_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1296_cast_fp16 = einsum(equation = var_1296_equation_0, values = (var_1102_cast_fp16, var_1008_cast_fp16))[name = tensor<string, []>("op_1296_cast_fp16")];
+            tensor<fp16, []> var_1297_to_fp16 = const()[name = tensor<string, []>("op_1297_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1298_cast_fp16 = mul(x = var_1296_cast_fp16, y = var_1297_to_fp16)[name = tensor<string, []>("op_1298_cast_fp16")];
+            tensor<string, []> var_1300_equation_0 = const()[name = tensor<string, []>("op_1300_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1300_cast_fp16 = einsum(equation = var_1300_equation_0, values = (var_1102_cast_fp16, var_1012_cast_fp16))[name = tensor<string, []>("op_1300_cast_fp16")];
+            tensor<fp16, []> var_1301_to_fp16 = const()[name = tensor<string, []>("op_1301_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1302_cast_fp16 = mul(x = var_1300_cast_fp16, y = var_1301_to_fp16)[name = tensor<string, []>("op_1302_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_49_cast_fp16 = add(x = var_1210_cast_fp16, y = mask)[name = tensor<string, []>("aw_49_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_51_cast_fp16 = add(x = var_1214_cast_fp16, y = mask)[name = tensor<string, []>("aw_51_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_53_cast_fp16 = add(x = var_1218_cast_fp16, y = mask)[name = tensor<string, []>("aw_53_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_55_cast_fp16 = add(x = var_1222_cast_fp16, y = mask)[name = tensor<string, []>("aw_55_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_57_cast_fp16 = add(x = var_1226_cast_fp16, y = mask)[name = tensor<string, []>("aw_57_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_59_cast_fp16 = add(x = var_1230_cast_fp16, y = mask)[name = tensor<string, []>("aw_59_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_61_cast_fp16 = add(x = var_1234_cast_fp16, y = mask)[name = tensor<string, []>("aw_61_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_63_cast_fp16 = add(x = var_1238_cast_fp16, y = mask)[name = tensor<string, []>("aw_63_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_65_cast_fp16 = add(x = var_1242_cast_fp16, y = mask)[name = tensor<string, []>("aw_65_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_67_cast_fp16 = add(x = var_1246_cast_fp16, y = mask)[name = tensor<string, []>("aw_67_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_69_cast_fp16 = add(x = var_1250_cast_fp16, y = mask)[name = tensor<string, []>("aw_69_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_71_cast_fp16 = add(x = var_1254_cast_fp16, y = mask)[name = tensor<string, []>("aw_71_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_73_cast_fp16 = add(x = var_1258_cast_fp16, y = mask)[name = tensor<string, []>("aw_73_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_75_cast_fp16 = add(x = var_1262_cast_fp16, y = mask)[name = tensor<string, []>("aw_75_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_77_cast_fp16 = add(x = var_1266_cast_fp16, y = mask)[name = tensor<string, []>("aw_77_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_79_cast_fp16 = add(x = var_1270_cast_fp16, y = mask)[name = tensor<string, []>("aw_79_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_81_cast_fp16 = add(x = var_1274_cast_fp16, y = mask)[name = tensor<string, []>("aw_81_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_83_cast_fp16 = add(x = var_1278_cast_fp16, y = mask)[name = tensor<string, []>("aw_83_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_85_cast_fp16 = add(x = var_1282_cast_fp16, y = mask)[name = tensor<string, []>("aw_85_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_87_cast_fp16 = add(x = var_1286_cast_fp16, y = mask)[name = tensor<string, []>("aw_87_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_89_cast_fp16 = add(x = var_1290_cast_fp16, y = mask)[name = tensor<string, []>("aw_89_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_91_cast_fp16 = add(x = var_1294_cast_fp16, y = mask)[name = tensor<string, []>("aw_91_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_93_cast_fp16 = add(x = var_1298_cast_fp16, y = mask)[name = tensor<string, []>("aw_93_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_cast_fp16 = add(x = var_1302_cast_fp16, y = mask)[name = tensor<string, []>("aw_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1327_cast_fp16 = softmax(axis = var_779, x = aw_49_cast_fp16)[name = tensor<string, []>("op_1327_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1328_cast_fp16 = softmax(axis = var_779, x = aw_51_cast_fp16)[name = tensor<string, []>("op_1328_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1329_cast_fp16 = softmax(axis = var_779, x = aw_53_cast_fp16)[name = tensor<string, []>("op_1329_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1330_cast_fp16 = softmax(axis = var_779, x = aw_55_cast_fp16)[name = tensor<string, []>("op_1330_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1331_cast_fp16 = softmax(axis = var_779, x = aw_57_cast_fp16)[name = tensor<string, []>("op_1331_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1332_cast_fp16 = softmax(axis = var_779, x = aw_59_cast_fp16)[name = tensor<string, []>("op_1332_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1333_cast_fp16 = softmax(axis = var_779, x = aw_61_cast_fp16)[name = tensor<string, []>("op_1333_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1334_cast_fp16 = softmax(axis = var_779, x = aw_63_cast_fp16)[name = tensor<string, []>("op_1334_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1335_cast_fp16 = softmax(axis = var_779, x = aw_65_cast_fp16)[name = tensor<string, []>("op_1335_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1336_cast_fp16 = softmax(axis = var_779, x = aw_67_cast_fp16)[name = tensor<string, []>("op_1336_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1337_cast_fp16 = softmax(axis = var_779, x = aw_69_cast_fp16)[name = tensor<string, []>("op_1337_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1338_cast_fp16 = softmax(axis = var_779, x = aw_71_cast_fp16)[name = tensor<string, []>("op_1338_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1339_cast_fp16 = softmax(axis = var_779, x = aw_73_cast_fp16)[name = tensor<string, []>("op_1339_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1340_cast_fp16 = softmax(axis = var_779, x = aw_75_cast_fp16)[name = tensor<string, []>("op_1340_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1341_cast_fp16 = softmax(axis = var_779, x = aw_77_cast_fp16)[name = tensor<string, []>("op_1341_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1342_cast_fp16 = softmax(axis = var_779, x = aw_79_cast_fp16)[name = tensor<string, []>("op_1342_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1343_cast_fp16 = softmax(axis = var_779, x = aw_81_cast_fp16)[name = tensor<string, []>("op_1343_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1344_cast_fp16 = softmax(axis = var_779, x = aw_83_cast_fp16)[name = tensor<string, []>("op_1344_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1345_cast_fp16 = softmax(axis = var_779, x = aw_85_cast_fp16)[name = tensor<string, []>("op_1345_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1346_cast_fp16 = softmax(axis = var_779, x = aw_87_cast_fp16)[name = tensor<string, []>("op_1346_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1347_cast_fp16 = softmax(axis = var_779, x = aw_89_cast_fp16)[name = tensor<string, []>("op_1347_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1348_cast_fp16 = softmax(axis = var_779, x = aw_91_cast_fp16)[name = tensor<string, []>("op_1348_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1349_cast_fp16 = softmax(axis = var_779, x = aw_93_cast_fp16)[name = tensor<string, []>("op_1349_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1350_cast_fp16 = softmax(axis = var_779, x = aw_cast_fp16)[name = tensor<string, []>("op_1350_cast_fp16")];
+            tensor<string, []> var_1352_equation_0 = const()[name = tensor<string, []>("op_1352_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1352_cast_fp16 = einsum(equation = var_1352_equation_0, values = (var_1112_cast_fp16, var_1327_cast_fp16))[name = tensor<string, []>("op_1352_cast_fp16")];
+            tensor<string, []> var_1354_equation_0 = const()[name = tensor<string, []>("op_1354_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1354_cast_fp16 = einsum(equation = var_1354_equation_0, values = (var_1112_cast_fp16, var_1328_cast_fp16))[name = tensor<string, []>("op_1354_cast_fp16")];
+            tensor<string, []> var_1356_equation_0 = const()[name = tensor<string, []>("op_1356_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1356_cast_fp16 = einsum(equation = var_1356_equation_0, values = (var_1112_cast_fp16, var_1329_cast_fp16))[name = tensor<string, []>("op_1356_cast_fp16")];
+            tensor<string, []> var_1358_equation_0 = const()[name = tensor<string, []>("op_1358_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1358_cast_fp16 = einsum(equation = var_1358_equation_0, values = (var_1124_cast_fp16, var_1330_cast_fp16))[name = tensor<string, []>("op_1358_cast_fp16")];
+            tensor<string, []> var_1360_equation_0 = const()[name = tensor<string, []>("op_1360_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1360_cast_fp16 = einsum(equation = var_1360_equation_0, values = (var_1124_cast_fp16, var_1331_cast_fp16))[name = tensor<string, []>("op_1360_cast_fp16")];
+            tensor<string, []> var_1362_equation_0 = const()[name = tensor<string, []>("op_1362_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1362_cast_fp16 = einsum(equation = var_1362_equation_0, values = (var_1124_cast_fp16, var_1332_cast_fp16))[name = tensor<string, []>("op_1362_cast_fp16")];
+            tensor<string, []> var_1364_equation_0 = const()[name = tensor<string, []>("op_1364_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1364_cast_fp16 = einsum(equation = var_1364_equation_0, values = (var_1136_cast_fp16, var_1333_cast_fp16))[name = tensor<string, []>("op_1364_cast_fp16")];
+            tensor<string, []> var_1366_equation_0 = const()[name = tensor<string, []>("op_1366_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1366_cast_fp16 = einsum(equation = var_1366_equation_0, values = (var_1136_cast_fp16, var_1334_cast_fp16))[name = tensor<string, []>("op_1366_cast_fp16")];
+            tensor<string, []> var_1368_equation_0 = const()[name = tensor<string, []>("op_1368_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1368_cast_fp16 = einsum(equation = var_1368_equation_0, values = (var_1136_cast_fp16, var_1335_cast_fp16))[name = tensor<string, []>("op_1368_cast_fp16")];
+            tensor<string, []> var_1370_equation_0 = const()[name = tensor<string, []>("op_1370_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1370_cast_fp16 = einsum(equation = var_1370_equation_0, values = (var_1148_cast_fp16, var_1336_cast_fp16))[name = tensor<string, []>("op_1370_cast_fp16")];
+            tensor<string, []> var_1372_equation_0 = const()[name = tensor<string, []>("op_1372_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1372_cast_fp16 = einsum(equation = var_1372_equation_0, values = (var_1148_cast_fp16, var_1337_cast_fp16))[name = tensor<string, []>("op_1372_cast_fp16")];
+            tensor<string, []> var_1374_equation_0 = const()[name = tensor<string, []>("op_1374_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1374_cast_fp16 = einsum(equation = var_1374_equation_0, values = (var_1148_cast_fp16, var_1338_cast_fp16))[name = tensor<string, []>("op_1374_cast_fp16")];
+            tensor<string, []> var_1376_equation_0 = const()[name = tensor<string, []>("op_1376_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1376_cast_fp16 = einsum(equation = var_1376_equation_0, values = (var_1160_cast_fp16, var_1339_cast_fp16))[name = tensor<string, []>("op_1376_cast_fp16")];
+            tensor<string, []> var_1378_equation_0 = const()[name = tensor<string, []>("op_1378_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1378_cast_fp16 = einsum(equation = var_1378_equation_0, values = (var_1160_cast_fp16, var_1340_cast_fp16))[name = tensor<string, []>("op_1378_cast_fp16")];
+            tensor<string, []> var_1380_equation_0 = const()[name = tensor<string, []>("op_1380_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1380_cast_fp16 = einsum(equation = var_1380_equation_0, values = (var_1160_cast_fp16, var_1341_cast_fp16))[name = tensor<string, []>("op_1380_cast_fp16")];
+            tensor<string, []> var_1382_equation_0 = const()[name = tensor<string, []>("op_1382_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1382_cast_fp16 = einsum(equation = var_1382_equation_0, values = (var_1172_cast_fp16, var_1342_cast_fp16))[name = tensor<string, []>("op_1382_cast_fp16")];
+            tensor<string, []> var_1384_equation_0 = const()[name = tensor<string, []>("op_1384_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1384_cast_fp16 = einsum(equation = var_1384_equation_0, values = (var_1172_cast_fp16, var_1343_cast_fp16))[name = tensor<string, []>("op_1384_cast_fp16")];
+            tensor<string, []> var_1386_equation_0 = const()[name = tensor<string, []>("op_1386_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1386_cast_fp16 = einsum(equation = var_1386_equation_0, values = (var_1172_cast_fp16, var_1344_cast_fp16))[name = tensor<string, []>("op_1386_cast_fp16")];
+            tensor<string, []> var_1388_equation_0 = const()[name = tensor<string, []>("op_1388_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1388_cast_fp16 = einsum(equation = var_1388_equation_0, values = (var_1184_cast_fp16, var_1345_cast_fp16))[name = tensor<string, []>("op_1388_cast_fp16")];
+            tensor<string, []> var_1390_equation_0 = const()[name = tensor<string, []>("op_1390_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1390_cast_fp16 = einsum(equation = var_1390_equation_0, values = (var_1184_cast_fp16, var_1346_cast_fp16))[name = tensor<string, []>("op_1390_cast_fp16")];
+            tensor<string, []> var_1392_equation_0 = const()[name = tensor<string, []>("op_1392_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1392_cast_fp16 = einsum(equation = var_1392_equation_0, values = (var_1184_cast_fp16, var_1347_cast_fp16))[name = tensor<string, []>("op_1392_cast_fp16")];
+            tensor<string, []> var_1394_equation_0 = const()[name = tensor<string, []>("op_1394_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1394_cast_fp16 = einsum(equation = var_1394_equation_0, values = (var_1196_cast_fp16, var_1348_cast_fp16))[name = tensor<string, []>("op_1394_cast_fp16")];
+            tensor<string, []> var_1396_equation_0 = const()[name = tensor<string, []>("op_1396_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1396_cast_fp16 = einsum(equation = var_1396_equation_0, values = (var_1196_cast_fp16, var_1349_cast_fp16))[name = tensor<string, []>("op_1396_cast_fp16")];
+            tensor<string, []> var_1398_equation_0 = const()[name = tensor<string, []>("op_1398_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1398_cast_fp16 = einsum(equation = var_1398_equation_0, values = (var_1196_cast_fp16, var_1350_cast_fp16))[name = tensor<string, []>("op_1398_cast_fp16")];
+            tensor<bool, []> x_27_interleave_0 = const()[name = tensor<string, []>("x_27_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_27_cast_fp16 = concat(axis = var_779, interleave = x_27_interleave_0, values = (var_1352_cast_fp16, var_1354_cast_fp16, var_1356_cast_fp16, var_1358_cast_fp16, var_1360_cast_fp16, var_1362_cast_fp16, var_1364_cast_fp16, var_1366_cast_fp16, var_1368_cast_fp16, var_1370_cast_fp16, var_1372_cast_fp16, var_1374_cast_fp16, var_1376_cast_fp16, var_1378_cast_fp16, var_1380_cast_fp16, var_1382_cast_fp16, var_1384_cast_fp16, var_1386_cast_fp16, var_1388_cast_fp16, var_1390_cast_fp16, var_1392_cast_fp16, var_1394_cast_fp16, var_1396_cast_fp16, var_1398_cast_fp16))[name = tensor<string, []>("x_27_cast_fp16")];
+            tensor<int32, [4]> var_1403 = const()[name = tensor<string, []>("op_1403"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_13_cast_fp16 = reshape(shape = var_1403, x = x_27_cast_fp16)[name = tensor<string, []>("input_13_cast_fp16")];
+            tensor<int32, [2]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1408 = const()[name = tensor<string, []>("op_1408"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_pad_type_0 = const()[name = tensor<string, []>("attention_output_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_pad_0 = const()[name = tensor<string, []>("attention_output_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(232803776)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_cast_fp16 = conv(dilations = var_1408, groups = var_779, pad = attention_output_pad_0, pad_type = attention_output_pad_type_0, strides = var_1406, weight = blocks_1_attn_proj_weight_to_fp16, x = input_13_cast_fp16)[name = tensor<string, []>("attention_output_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_29_cast_fp16 = add(x = attention_output_cast_fp16, y = x_17_cast_fp16)[name = tensor<string, []>("x_29_cast_fp16")];
+            tensor<bool, []> x_eps_interleave_0 = const()[name = tensor<string, []>("x_eps_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_to_fp16 = const()[name = tensor<string, []>("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678208)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_cast_fp16 = concat(axis = var_779, interleave = x_eps_interleave_0, values = (x_29_cast_fp16, eps_chan_to_fp16))[name = tensor<string, []>("x_eps_cast_fp16")];
+            tensor<int32, [1]> norm_x_axes_0 = const()[name = tensor<string, []>("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_782, x = x_eps_cast_fp16)[name = tensor<string, []>("norm_x_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_19_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_cast_fp16)[name = tensor<string, []>("x_normed_19_cast_fp16")];
+            tensor<fp16, []> var_1434_to_fp16 = const()[name = tensor<string, []>("op_1434_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_1434_to_fp16)[name = tensor<string, []>("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678400)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_15_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = tensor<string, []>("input_15_cast_fp16")];
+            tensor<int32, [2]> var_1445 = const()[name = tensor<string, []>("op_1445"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1447 = const()[name = tensor<string, []>("op_1447"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_17_pad_type_0 = const()[name = tensor<string, []>("input_17_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_17_pad_0 = const()[name = tensor<string, []>("input_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251684608)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_17_cast_fp16 = conv(dilations = var_1447, groups = var_779, pad = input_17_pad_0, pad_type = input_17_pad_type_0, strides = var_1445, weight = blocks_1_mlp_fc_1_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("input_17_cast_fp16")];
+            tensor<int32, [2]> var_1451 = const()[name = tensor<string, []>("op_1451"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1453 = const()[name = tensor<string, []>("op_1453"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_pad_0 = const()[name = tensor<string, []>("x_fc_2_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(302016320)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_cast_fp16 = conv(dilations = var_1453, groups = var_779, pad = x_fc_2_pad_0, pad_type = x_fc_2_pad_type_0, strides = var_1451, weight = blocks_1_mlp_fc_2_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_1456_cast_fp16 = silu(x = input_17_cast_fp16)[name = tensor<string, []>("op_1456_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_cast_fp16 = mul(x = var_1456_cast_fp16, y = x_fc_2_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
+            tensor<int32, [2]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1463_pad_type_0 = const()[name = tensor<string, []>("op_1463_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1463_pad_0 = const()[name = tensor<string, []>("op_1463_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_1_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(352348032)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_1463_cast_fp16 = conv(dilations = var_1461, groups = var_779, pad = var_1463_pad_0, pad_type = var_1463_pad_type_0, strides = var_1459, weight = blocks_1_mlp_proj_weight_to_fp16, x = input_cast_fp16)[name = tensor<string, []>("op_1463_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> new_x = add(x = var_1463_cast_fp16, y = x_29_cast_fp16)[name = tensor<string, []>("op_1464_cast_fp16")];
+        } -> (new_x, new_k_cache_0, new_v_cache_0, new_k_cache_1, new_v_cache_1);
+}
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk13.mlmodelc/weights/weight.bin b/Llama-3.2-3B-Instruct_chunk13.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0303ebcb6e35d02a6ae9c5e453d020529c288a93
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk13.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c010e4d1d6fba27b10c2fc842fe9244994286e621309812a88c461ddeb071342
+size 402679744
diff --git a/Llama-3.2-3B-Instruct_chunk14.mlmodelc/analytics/coremldata.bin b/Llama-3.2-3B-Instruct_chunk14.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a63af39cde8e590e41fffd270ab8aede737490d
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk14.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf21e446e7587de3fd840eae95f3e79729298df568725552f7ef5fd8f954e58c
+size 243
diff --git a/Llama-3.2-3B-Instruct_chunk14.mlmodelc/coremldata.bin b/Llama-3.2-3B-Instruct_chunk14.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ef844658693d8a7fc2951abf2761f8f5f9bc62c3
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk14.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8129d684aa1ea8b76708a186fe44f7ffc4aa08b4854907105fe41c0825e71875
+size 653
diff --git a/Llama-3.2-3B-Instruct_chunk14.mlmodelc/metadata.json b/Llama-3.2-3B-Instruct_chunk14.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..ae31a663181f3e6c6f748d526d461429f1134e53
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk14.mlmodelc/metadata.json
@@ -0,0 +1,178 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "new_x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 7,
+    "mlProgramOperationTypeHistogram" : {
+      "Concat" : 14,
+      "Ios16.mul" : 70,
+      "SliceByIndex" : 88,
+      "Transpose" : 2,
+      "Ios16.einsum" : 96,
+      "Ios16.conv" : 14,
+      "Ios16.add" : 56,
+      "Ios16.realDiv" : 4,
+      "Ios16.softmax" : 48,
+      "Ios16.reduceL2Norm" : 4,
+      "Ios16.reshape" : 14,
+      "Ios16.silu" : 2
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "13.0",
+      "tvOS" : "16.0",
+      "visionOS" : "1.0",
+      "watchOS" : "9.0",
+      "iOS" : "16.0",
+      "macCatalyst" : "16.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.source" : "torch==2.1.0",
+      "com.github.apple.coremltools.version" : "8.0b1"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "cos",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "sin",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 512 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 512, 1, 64]",
+        "name" : "mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "Llama_3_2_3B_Instruct_2024_11_09_16_14_37_chunk14",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk14.mlmodelc/model.mil b/Llama-3.2-3B-Instruct_chunk14.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..78594b4291dc45ae43652f9a31200581b19ad3c6
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk14.mlmodelc/model.mil
@@ -0,0 +1,956 @@
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3304.5.2"}, {"coremlc-version", "3304.6.2"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
+{
+    func main<ios16>(tensor<fp16, [128, 64]> cos, tensor<fp16, [1, 448, 1, 1024]> k_cache_0, tensor<fp16, [1, 448, 1, 1024]> k_cache_1, tensor<fp16, [1, 512, 1, 64]> mask, tensor<fp16, [128, 64]> sin, tensor<fp16, [1, 1024, 1, 448]> v_cache_0, tensor<fp16, [1, 1024, 1, 448]> v_cache_1, tensor<fp16, [1, 3072, 8, 8]> x) [CoreML_InputDefaultValues = dict<tensor<string, []>, tensor<fp32, []>>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}})] {
+            tensor<int32, []> var_13 = const()[name = tensor<string, []>("op_13"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_19 = const()[name = tensor<string, []>("op_19"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_55 = const()[name = tensor<string, []>("op_55"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_1_interleave_0 = const()[name = tensor<string, []>("x_eps_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_1_to_fp16 = const()[name = tensor<string, []>("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_1_cast_fp16 = concat(axis = var_52, interleave = x_eps_1_interleave_0, values = (x, eps_chan_1_to_fp16))[name = tensor<string, []>("x_eps_1_cast_fp16")];
+            tensor<int32, [1]> norm_x_1_axes_0 = const()[name = tensor<string, []>("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_55, x = x_eps_1_cast_fp16)[name = tensor<string, []>("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = tensor<string, []>("x_normed_1_cast_fp16")];
+            tensor<fp16, []> var_79_to_fp16 = const()[name = tensor<string, []>("op_79_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_79_to_fp16)[name = tensor<string, []>("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(256)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = tensor<string, []>("x_5_cast_fp16")];
+            tensor<int32, [4]> var_100 = const()[name = tensor<string, []>("op_100"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_1_cast_fp16 = reshape(shape = var_100, x = x_5_cast_fp16)[name = tensor<string, []>("input_1_cast_fp16")];
+            tensor<int32, [2]> var_103 = const()[name = tensor<string, []>("op_103"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_105 = const()[name = tensor<string, []>("op_105"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_1_pad_type_0 = const()[name = tensor<string, []>("q_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_1_pad_0 = const()[name = tensor<string, []>("q_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6464)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_1_cast_fp16 = conv(dilations = var_105, groups = var_52, pad = q_1_pad_0, pad_type = q_1_pad_type_0, strides = var_103, weight = blocks_0_attn_q_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("q_1_cast_fp16")];
+            tensor<int32, [2]> var_109 = const()[name = tensor<string, []>("op_109"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_111 = const()[name = tensor<string, []>("op_111"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_1_pad_type_0 = const()[name = tensor<string, []>("k_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_1_pad_0 = const()[name = tensor<string, []>("k_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18880896)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_1_cast_fp16 = conv(dilations = var_111, groups = var_52, pad = k_1_pad_0, pad_type = k_1_pad_type_0, strides = var_109, weight = blocks_0_attn_k_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("k_1_cast_fp16")];
+            tensor<int32, [2]> var_115 = const()[name = tensor<string, []>("op_115"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_117 = const()[name = tensor<string, []>("op_117"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_1_pad_type_0 = const()[name = tensor<string, []>("v_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_1_pad_0 = const()[name = tensor<string, []>("v_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25172416)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_1_cast_fp16 = conv(dilations = var_117, groups = var_52, pad = v_1_pad_0, pad_type = v_1_pad_type_0, strides = var_115, weight = blocks_0_attn_v_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("v_1_cast_fp16")];
+            tensor<int32, [4]> var_120 = const()[name = tensor<string, []>("op_120"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_3_cast_fp16 = reshape(shape = var_120, x = q_1_cast_fp16)[name = tensor<string, []>("q_3_cast_fp16")];
+            tensor<int32, [4]> var_122 = const()[name = tensor<string, []>("op_122"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_3_cast_fp16 = reshape(shape = var_122, x = k_1_cast_fp16)[name = tensor<string, []>("k_3_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = tensor<string, []>("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = tensor<string, []>("op_136_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = tensor<string, []>("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = tensor<string, []>("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = tensor<string, []>("op_142_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = tensor<string, []>("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_142_cast_fp16")];
+            tensor<fp16, []> const_10_promoted_to_fp16 = const()[name = tensor<string, []>("const_10_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_10_promoted_to_fp16)[name = tensor<string, []>("op_144_cast_fp16")];
+            tensor<bool, []> rotated_1_interleave_0 = const()[name = tensor<string, []>("rotated_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_1_cast_fp16 = concat(axis = var_17, interleave = rotated_1_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = tensor<string, []>("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_147_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = tensor<string, []>("op_147_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_148_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = tensor<string, []>("op_148_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_1_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = tensor<string, []>("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_161_begin_0 = const()[name = tensor<string, []>("op_161_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_161_end_0 = const()[name = tensor<string, []>("op_161_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_161_end_mask_0 = const()[name = tensor<string, []>("op_161_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_161_cast_fp16 = slice_by_index(begin = var_161_begin_0, end = var_161_end_0, end_mask = var_161_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_161_cast_fp16")];
+            tensor<int32, [4]> var_167_begin_0 = const()[name = tensor<string, []>("op_167_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_167_end_0 = const()[name = tensor<string, []>("op_167_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_167_end_mask_0 = const()[name = tensor<string, []>("op_167_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_167_cast_fp16 = slice_by_index(begin = var_167_begin_0, end = var_167_end_0, end_mask = var_167_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_167_cast_fp16")];
+            tensor<fp16, []> const_12_promoted_to_fp16 = const()[name = tensor<string, []>("const_12_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_169_cast_fp16 = mul(x = var_167_cast_fp16, y = const_12_promoted_to_fp16)[name = tensor<string, []>("op_169_cast_fp16")];
+            tensor<bool, []> rotated_3_interleave_0 = const()[name = tensor<string, []>("rotated_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_3_cast_fp16 = concat(axis = var_17, interleave = rotated_3_interleave_0, values = (var_169_cast_fp16, var_161_cast_fp16))[name = tensor<string, []>("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_172_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = tensor<string, []>("op_172_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_173_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = tensor<string, []>("op_173_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_3_cast_fp16 = add(x = var_172_cast_fp16, y = var_173_cast_fp16)[name = tensor<string, []>("roped_3_cast_fp16")];
+            tensor<int32, [4]> var_176 = const()[name = tensor<string, []>("op_176"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_7_cast_fp16 = reshape(shape = var_176, x = roped_3_cast_fp16)[name = tensor<string, []>("k_7_cast_fp16")];
+            tensor<int32, [4]> var_178 = const()[name = tensor<string, []>("op_178"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_0 = reshape(shape = var_178, x = v_1_cast_fp16)[name = tensor<string, []>("new_v_cache_0_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_11_interleave_0 = const()[name = tensor<string, []>("k_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_0 = transpose(perm = k_9_perm_0, x = k_7_cast_fp16)[name = tensor<string, []>("transpose_1")];
+            tensor<fp16, [1, 512, 1, 1024]> k_11_cast_fp16 = concat(axis = var_19, interleave = k_11_interleave_0, values = (k_cache_0, new_k_cache_0))[name = tensor<string, []>("k_11_cast_fp16")];
+            tensor<bool, []> v_7_interleave_0 = const()[name = tensor<string, []>("v_7_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_7_cast_fp16 = concat(axis = var_13, interleave = v_7_interleave_0, values = (v_cache_0, new_v_cache_0))[name = tensor<string, []>("v_7_cast_fp16")];
+            tensor<int32, [4]> var_186 = const()[name = tensor<string, []>("op_186"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_7_cast_fp16 = reshape(shape = var_186, x = roped_1_cast_fp16)[name = tensor<string, []>("q_7_cast_fp16")];
+            tensor<int32, [4]> var_191_begin_0 = const()[name = tensor<string, []>("op_191_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_191_end_0 = const()[name = tensor<string, []>("op_191_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_191_end_mask_0 = const()[name = tensor<string, []>("op_191_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_191_cast_fp16 = slice_by_index(begin = var_191_begin_0, end = var_191_end_0, end_mask = var_191_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_191_cast_fp16")];
+            tensor<int32, [4]> var_195_begin_0 = const()[name = tensor<string, []>("op_195_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_195_end_0 = const()[name = tensor<string, []>("op_195_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_195_end_mask_0 = const()[name = tensor<string, []>("op_195_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_195_cast_fp16 = slice_by_index(begin = var_195_begin_0, end = var_195_end_0, end_mask = var_195_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_195_cast_fp16")];
+            tensor<int32, [4]> var_199_begin_0 = const()[name = tensor<string, []>("op_199_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_199_end_0 = const()[name = tensor<string, []>("op_199_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_199_end_mask_0 = const()[name = tensor<string, []>("op_199_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_199_cast_fp16 = slice_by_index(begin = var_199_begin_0, end = var_199_end_0, end_mask = var_199_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_199_cast_fp16")];
+            tensor<int32, [4]> var_203_begin_0 = const()[name = tensor<string, []>("op_203_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_203_end_0 = const()[name = tensor<string, []>("op_203_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_203_end_mask_0 = const()[name = tensor<string, []>("op_203_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_203_cast_fp16 = slice_by_index(begin = var_203_begin_0, end = var_203_end_0, end_mask = var_203_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_203_cast_fp16")];
+            tensor<int32, [4]> var_207_begin_0 = const()[name = tensor<string, []>("op_207_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_207_end_0 = const()[name = tensor<string, []>("op_207_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_207_end_mask_0 = const()[name = tensor<string, []>("op_207_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_207_cast_fp16 = slice_by_index(begin = var_207_begin_0, end = var_207_end_0, end_mask = var_207_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_207_cast_fp16")];
+            tensor<int32, [4]> var_211_begin_0 = const()[name = tensor<string, []>("op_211_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_211_end_0 = const()[name = tensor<string, []>("op_211_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_211_end_mask_0 = const()[name = tensor<string, []>("op_211_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_211_cast_fp16 = slice_by_index(begin = var_211_begin_0, end = var_211_end_0, end_mask = var_211_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_211_cast_fp16")];
+            tensor<int32, [4]> var_215_begin_0 = const()[name = tensor<string, []>("op_215_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_215_end_0 = const()[name = tensor<string, []>("op_215_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_215_end_mask_0 = const()[name = tensor<string, []>("op_215_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_215_cast_fp16 = slice_by_index(begin = var_215_begin_0, end = var_215_end_0, end_mask = var_215_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_215_cast_fp16")];
+            tensor<int32, [4]> var_219_begin_0 = const()[name = tensor<string, []>("op_219_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_219_end_0 = const()[name = tensor<string, []>("op_219_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_219_end_mask_0 = const()[name = tensor<string, []>("op_219_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_219_cast_fp16 = slice_by_index(begin = var_219_begin_0, end = var_219_end_0, end_mask = var_219_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_219_cast_fp16")];
+            tensor<int32, [4]> var_223_begin_0 = const()[name = tensor<string, []>("op_223_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_223_end_0 = const()[name = tensor<string, []>("op_223_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_223_end_mask_0 = const()[name = tensor<string, []>("op_223_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_223_cast_fp16 = slice_by_index(begin = var_223_begin_0, end = var_223_end_0, end_mask = var_223_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_223_cast_fp16")];
+            tensor<int32, [4]> var_227_begin_0 = const()[name = tensor<string, []>("op_227_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_227_end_0 = const()[name = tensor<string, []>("op_227_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_227_end_mask_0 = const()[name = tensor<string, []>("op_227_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_227_cast_fp16 = slice_by_index(begin = var_227_begin_0, end = var_227_end_0, end_mask = var_227_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_227_cast_fp16")];
+            tensor<int32, [4]> var_231_begin_0 = const()[name = tensor<string, []>("op_231_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_231_end_0 = const()[name = tensor<string, []>("op_231_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_231_end_mask_0 = const()[name = tensor<string, []>("op_231_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_231_cast_fp16 = slice_by_index(begin = var_231_begin_0, end = var_231_end_0, end_mask = var_231_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_231_cast_fp16")];
+            tensor<int32, [4]> var_235_begin_0 = const()[name = tensor<string, []>("op_235_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_235_end_0 = const()[name = tensor<string, []>("op_235_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_235_end_mask_0 = const()[name = tensor<string, []>("op_235_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_235_cast_fp16 = slice_by_index(begin = var_235_begin_0, end = var_235_end_0, end_mask = var_235_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_235_cast_fp16")];
+            tensor<int32, [4]> var_239_begin_0 = const()[name = tensor<string, []>("op_239_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_239_end_0 = const()[name = tensor<string, []>("op_239_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_239_end_mask_0 = const()[name = tensor<string, []>("op_239_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_239_cast_fp16 = slice_by_index(begin = var_239_begin_0, end = var_239_end_0, end_mask = var_239_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_239_cast_fp16")];
+            tensor<int32, [4]> var_243_begin_0 = const()[name = tensor<string, []>("op_243_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_243_end_0 = const()[name = tensor<string, []>("op_243_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_243_end_mask_0 = const()[name = tensor<string, []>("op_243_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_243_cast_fp16 = slice_by_index(begin = var_243_begin_0, end = var_243_end_0, end_mask = var_243_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_243_cast_fp16")];
+            tensor<int32, [4]> var_247_begin_0 = const()[name = tensor<string, []>("op_247_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_247_end_0 = const()[name = tensor<string, []>("op_247_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_247_end_mask_0 = const()[name = tensor<string, []>("op_247_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_247_cast_fp16 = slice_by_index(begin = var_247_begin_0, end = var_247_end_0, end_mask = var_247_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_247_cast_fp16")];
+            tensor<int32, [4]> var_251_begin_0 = const()[name = tensor<string, []>("op_251_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_251_end_0 = const()[name = tensor<string, []>("op_251_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_251_end_mask_0 = const()[name = tensor<string, []>("op_251_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_251_cast_fp16 = slice_by_index(begin = var_251_begin_0, end = var_251_end_0, end_mask = var_251_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_251_cast_fp16")];
+            tensor<int32, [4]> var_255_begin_0 = const()[name = tensor<string, []>("op_255_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_255_end_0 = const()[name = tensor<string, []>("op_255_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_255_end_mask_0 = const()[name = tensor<string, []>("op_255_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_255_cast_fp16 = slice_by_index(begin = var_255_begin_0, end = var_255_end_0, end_mask = var_255_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_255_cast_fp16")];
+            tensor<int32, [4]> var_259_begin_0 = const()[name = tensor<string, []>("op_259_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_259_end_0 = const()[name = tensor<string, []>("op_259_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_259_end_mask_0 = const()[name = tensor<string, []>("op_259_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_259_cast_fp16 = slice_by_index(begin = var_259_begin_0, end = var_259_end_0, end_mask = var_259_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_259_cast_fp16")];
+            tensor<int32, [4]> var_263_begin_0 = const()[name = tensor<string, []>("op_263_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_263_end_0 = const()[name = tensor<string, []>("op_263_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_263_end_mask_0 = const()[name = tensor<string, []>("op_263_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_263_cast_fp16 = slice_by_index(begin = var_263_begin_0, end = var_263_end_0, end_mask = var_263_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_263_cast_fp16")];
+            tensor<int32, [4]> var_267_begin_0 = const()[name = tensor<string, []>("op_267_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_267_end_0 = const()[name = tensor<string, []>("op_267_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_267_end_mask_0 = const()[name = tensor<string, []>("op_267_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_267_cast_fp16 = slice_by_index(begin = var_267_begin_0, end = var_267_end_0, end_mask = var_267_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_267_cast_fp16")];
+            tensor<int32, [4]> var_271_begin_0 = const()[name = tensor<string, []>("op_271_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_271_end_0 = const()[name = tensor<string, []>("op_271_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_271_end_mask_0 = const()[name = tensor<string, []>("op_271_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_271_cast_fp16 = slice_by_index(begin = var_271_begin_0, end = var_271_end_0, end_mask = var_271_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_271_cast_fp16")];
+            tensor<int32, [4]> var_275_begin_0 = const()[name = tensor<string, []>("op_275_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_275_end_0 = const()[name = tensor<string, []>("op_275_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_275_end_mask_0 = const()[name = tensor<string, []>("op_275_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_275_cast_fp16 = slice_by_index(begin = var_275_begin_0, end = var_275_end_0, end_mask = var_275_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_275_cast_fp16")];
+            tensor<int32, [4]> var_279_begin_0 = const()[name = tensor<string, []>("op_279_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_279_end_0 = const()[name = tensor<string, []>("op_279_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_279_end_mask_0 = const()[name = tensor<string, []>("op_279_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_279_cast_fp16 = slice_by_index(begin = var_279_begin_0, end = var_279_end_0, end_mask = var_279_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_279_cast_fp16")];
+            tensor<int32, [4]> var_283_begin_0 = const()[name = tensor<string, []>("op_283_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_283_end_0 = const()[name = tensor<string, []>("op_283_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_283_end_mask_0 = const()[name = tensor<string, []>("op_283_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_283_cast_fp16 = slice_by_index(begin = var_283_begin_0, end = var_283_end_0, end_mask = var_283_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_283_cast_fp16")];
+            tensor<int32, [4]> var_289_begin_0 = const()[name = tensor<string, []>("op_289_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_289_end_0 = const()[name = tensor<string, []>("op_289_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_289_end_mask_0 = const()[name = tensor<string, []>("op_289_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_289_cast_fp16 = slice_by_index(begin = var_289_begin_0, end = var_289_end_0, end_mask = var_289_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_289_cast_fp16")];
+            tensor<int32, [4]> var_301_begin_0 = const()[name = tensor<string, []>("op_301_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_301_end_0 = const()[name = tensor<string, []>("op_301_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_301_end_mask_0 = const()[name = tensor<string, []>("op_301_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_301_cast_fp16 = slice_by_index(begin = var_301_begin_0, end = var_301_end_0, end_mask = var_301_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_301_cast_fp16")];
+            tensor<int32, [4]> var_313_begin_0 = const()[name = tensor<string, []>("op_313_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_313_end_0 = const()[name = tensor<string, []>("op_313_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_313_end_mask_0 = const()[name = tensor<string, []>("op_313_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_313_cast_fp16 = slice_by_index(begin = var_313_begin_0, end = var_313_end_0, end_mask = var_313_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_313_cast_fp16")];
+            tensor<int32, [4]> var_325_begin_0 = const()[name = tensor<string, []>("op_325_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_325_end_0 = const()[name = tensor<string, []>("op_325_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_325_end_mask_0 = const()[name = tensor<string, []>("op_325_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_325_cast_fp16 = slice_by_index(begin = var_325_begin_0, end = var_325_end_0, end_mask = var_325_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_325_cast_fp16")];
+            tensor<int32, [4]> var_337_begin_0 = const()[name = tensor<string, []>("op_337_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_337_end_0 = const()[name = tensor<string, []>("op_337_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_337_end_mask_0 = const()[name = tensor<string, []>("op_337_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_337_cast_fp16 = slice_by_index(begin = var_337_begin_0, end = var_337_end_0, end_mask = var_337_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_337_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = tensor<string, []>("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = tensor<string, []>("op_349_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = tensor<string, []>("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_349_cast_fp16")];
+            tensor<int32, [4]> var_361_begin_0 = const()[name = tensor<string, []>("op_361_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_361_end_0 = const()[name = tensor<string, []>("op_361_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_361_end_mask_0 = const()[name = tensor<string, []>("op_361_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_361_cast_fp16 = slice_by_index(begin = var_361_begin_0, end = var_361_end_0, end_mask = var_361_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_361_cast_fp16")];
+            tensor<int32, [4]> var_373_begin_0 = const()[name = tensor<string, []>("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_373_end_0 = const()[name = tensor<string, []>("op_373_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_373_end_mask_0 = const()[name = tensor<string, []>("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_373_cast_fp16")];
+            tensor<int32, [4]> var_383_begin_0 = const()[name = tensor<string, []>("op_383_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_383_end_0 = const()[name = tensor<string, []>("op_383_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_383_end_mask_0 = const()[name = tensor<string, []>("op_383_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_383_cast_fp16 = slice_by_index(begin = var_383_begin_0, end = var_383_end_0, end_mask = var_383_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_383_cast_fp16")];
+            tensor<int32, [4]> var_395_begin_0 = const()[name = tensor<string, []>("op_395_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_395_end_0 = const()[name = tensor<string, []>("op_395_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_395_end_mask_0 = const()[name = tensor<string, []>("op_395_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_395_cast_fp16 = slice_by_index(begin = var_395_begin_0, end = var_395_end_0, end_mask = var_395_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_395_cast_fp16")];
+            tensor<int32, [4]> var_407_begin_0 = const()[name = tensor<string, []>("op_407_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_407_end_0 = const()[name = tensor<string, []>("op_407_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_407_end_mask_0 = const()[name = tensor<string, []>("op_407_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_407_cast_fp16 = slice_by_index(begin = var_407_begin_0, end = var_407_end_0, end_mask = var_407_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_407_cast_fp16")];
+            tensor<int32, [4]> var_419_begin_0 = const()[name = tensor<string, []>("op_419_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_419_end_0 = const()[name = tensor<string, []>("op_419_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_419_end_mask_0 = const()[name = tensor<string, []>("op_419_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_419_cast_fp16 = slice_by_index(begin = var_419_begin_0, end = var_419_end_0, end_mask = var_419_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_419_cast_fp16")];
+            tensor<int32, [4]> var_431_begin_0 = const()[name = tensor<string, []>("op_431_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_431_end_0 = const()[name = tensor<string, []>("op_431_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_431_end_mask_0 = const()[name = tensor<string, []>("op_431_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_431_cast_fp16 = slice_by_index(begin = var_431_begin_0, end = var_431_end_0, end_mask = var_431_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_431_cast_fp16")];
+            tensor<int32, [4]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_443_cast_fp16 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_443_cast_fp16")];
+            tensor<int32, [4]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_455_cast_fp16 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_455_cast_fp16")];
+            tensor<int32, [4]> var_467_begin_0 = const()[name = tensor<string, []>("op_467_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_467_end_0 = const()[name = tensor<string, []>("op_467_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_467_end_mask_0 = const()[name = tensor<string, []>("op_467_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_467_cast_fp16 = slice_by_index(begin = var_467_begin_0, end = var_467_end_0, end_mask = var_467_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_467_cast_fp16")];
+            tensor<string, []> var_479_equation_0 = const()[name = tensor<string, []>("op_479_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_479_cast_fp16 = einsum(equation = var_479_equation_0, values = (var_289_cast_fp16, var_191_cast_fp16))[name = tensor<string, []>("op_479_cast_fp16")];
+            tensor<fp16, []> var_480_to_fp16 = const()[name = tensor<string, []>("op_480_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_481_cast_fp16 = mul(x = var_479_cast_fp16, y = var_480_to_fp16)[name = tensor<string, []>("op_481_cast_fp16")];
+            tensor<string, []> var_483_equation_0 = const()[name = tensor<string, []>("op_483_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_483_cast_fp16 = einsum(equation = var_483_equation_0, values = (var_289_cast_fp16, var_195_cast_fp16))[name = tensor<string, []>("op_483_cast_fp16")];
+            tensor<fp16, []> var_484_to_fp16 = const()[name = tensor<string, []>("op_484_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_485_cast_fp16 = mul(x = var_483_cast_fp16, y = var_484_to_fp16)[name = tensor<string, []>("op_485_cast_fp16")];
+            tensor<string, []> var_487_equation_0 = const()[name = tensor<string, []>("op_487_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_487_cast_fp16 = einsum(equation = var_487_equation_0, values = (var_289_cast_fp16, var_199_cast_fp16))[name = tensor<string, []>("op_487_cast_fp16")];
+            tensor<fp16, []> var_488_to_fp16 = const()[name = tensor<string, []>("op_488_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_489_cast_fp16 = mul(x = var_487_cast_fp16, y = var_488_to_fp16)[name = tensor<string, []>("op_489_cast_fp16")];
+            tensor<string, []> var_491_equation_0 = const()[name = tensor<string, []>("op_491_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_491_cast_fp16 = einsum(equation = var_491_equation_0, values = (var_301_cast_fp16, var_203_cast_fp16))[name = tensor<string, []>("op_491_cast_fp16")];
+            tensor<fp16, []> var_492_to_fp16 = const()[name = tensor<string, []>("op_492_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_493_cast_fp16 = mul(x = var_491_cast_fp16, y = var_492_to_fp16)[name = tensor<string, []>("op_493_cast_fp16")];
+            tensor<string, []> var_495_equation_0 = const()[name = tensor<string, []>("op_495_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_495_cast_fp16 = einsum(equation = var_495_equation_0, values = (var_301_cast_fp16, var_207_cast_fp16))[name = tensor<string, []>("op_495_cast_fp16")];
+            tensor<fp16, []> var_496_to_fp16 = const()[name = tensor<string, []>("op_496_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_497_cast_fp16 = mul(x = var_495_cast_fp16, y = var_496_to_fp16)[name = tensor<string, []>("op_497_cast_fp16")];
+            tensor<string, []> var_499_equation_0 = const()[name = tensor<string, []>("op_499_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_499_cast_fp16 = einsum(equation = var_499_equation_0, values = (var_301_cast_fp16, var_211_cast_fp16))[name = tensor<string, []>("op_499_cast_fp16")];
+            tensor<fp16, []> var_500_to_fp16 = const()[name = tensor<string, []>("op_500_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_501_cast_fp16 = mul(x = var_499_cast_fp16, y = var_500_to_fp16)[name = tensor<string, []>("op_501_cast_fp16")];
+            tensor<string, []> var_503_equation_0 = const()[name = tensor<string, []>("op_503_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_503_cast_fp16 = einsum(equation = var_503_equation_0, values = (var_313_cast_fp16, var_215_cast_fp16))[name = tensor<string, []>("op_503_cast_fp16")];
+            tensor<fp16, []> var_504_to_fp16 = const()[name = tensor<string, []>("op_504_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_505_cast_fp16 = mul(x = var_503_cast_fp16, y = var_504_to_fp16)[name = tensor<string, []>("op_505_cast_fp16")];
+            tensor<string, []> var_507_equation_0 = const()[name = tensor<string, []>("op_507_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_507_cast_fp16 = einsum(equation = var_507_equation_0, values = (var_313_cast_fp16, var_219_cast_fp16))[name = tensor<string, []>("op_507_cast_fp16")];
+            tensor<fp16, []> var_508_to_fp16 = const()[name = tensor<string, []>("op_508_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_509_cast_fp16 = mul(x = var_507_cast_fp16, y = var_508_to_fp16)[name = tensor<string, []>("op_509_cast_fp16")];
+            tensor<string, []> var_511_equation_0 = const()[name = tensor<string, []>("op_511_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_511_cast_fp16 = einsum(equation = var_511_equation_0, values = (var_313_cast_fp16, var_223_cast_fp16))[name = tensor<string, []>("op_511_cast_fp16")];
+            tensor<fp16, []> var_512_to_fp16 = const()[name = tensor<string, []>("op_512_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_513_cast_fp16 = mul(x = var_511_cast_fp16, y = var_512_to_fp16)[name = tensor<string, []>("op_513_cast_fp16")];
+            tensor<string, []> var_515_equation_0 = const()[name = tensor<string, []>("op_515_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_515_cast_fp16 = einsum(equation = var_515_equation_0, values = (var_325_cast_fp16, var_227_cast_fp16))[name = tensor<string, []>("op_515_cast_fp16")];
+            tensor<fp16, []> var_516_to_fp16 = const()[name = tensor<string, []>("op_516_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_517_cast_fp16 = mul(x = var_515_cast_fp16, y = var_516_to_fp16)[name = tensor<string, []>("op_517_cast_fp16")];
+            tensor<string, []> var_519_equation_0 = const()[name = tensor<string, []>("op_519_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_519_cast_fp16 = einsum(equation = var_519_equation_0, values = (var_325_cast_fp16, var_231_cast_fp16))[name = tensor<string, []>("op_519_cast_fp16")];
+            tensor<fp16, []> var_520_to_fp16 = const()[name = tensor<string, []>("op_520_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_521_cast_fp16 = mul(x = var_519_cast_fp16, y = var_520_to_fp16)[name = tensor<string, []>("op_521_cast_fp16")];
+            tensor<string, []> var_523_equation_0 = const()[name = tensor<string, []>("op_523_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_523_cast_fp16 = einsum(equation = var_523_equation_0, values = (var_325_cast_fp16, var_235_cast_fp16))[name = tensor<string, []>("op_523_cast_fp16")];
+            tensor<fp16, []> var_524_to_fp16 = const()[name = tensor<string, []>("op_524_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_525_cast_fp16 = mul(x = var_523_cast_fp16, y = var_524_to_fp16)[name = tensor<string, []>("op_525_cast_fp16")];
+            tensor<string, []> var_527_equation_0 = const()[name = tensor<string, []>("op_527_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_527_cast_fp16 = einsum(equation = var_527_equation_0, values = (var_337_cast_fp16, var_239_cast_fp16))[name = tensor<string, []>("op_527_cast_fp16")];
+            tensor<fp16, []> var_528_to_fp16 = const()[name = tensor<string, []>("op_528_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_529_cast_fp16 = mul(x = var_527_cast_fp16, y = var_528_to_fp16)[name = tensor<string, []>("op_529_cast_fp16")];
+            tensor<string, []> var_531_equation_0 = const()[name = tensor<string, []>("op_531_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_531_cast_fp16 = einsum(equation = var_531_equation_0, values = (var_337_cast_fp16, var_243_cast_fp16))[name = tensor<string, []>("op_531_cast_fp16")];
+            tensor<fp16, []> var_532_to_fp16 = const()[name = tensor<string, []>("op_532_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_533_cast_fp16 = mul(x = var_531_cast_fp16, y = var_532_to_fp16)[name = tensor<string, []>("op_533_cast_fp16")];
+            tensor<string, []> var_535_equation_0 = const()[name = tensor<string, []>("op_535_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_535_cast_fp16 = einsum(equation = var_535_equation_0, values = (var_337_cast_fp16, var_247_cast_fp16))[name = tensor<string, []>("op_535_cast_fp16")];
+            tensor<fp16, []> var_536_to_fp16 = const()[name = tensor<string, []>("op_536_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_537_cast_fp16 = mul(x = var_535_cast_fp16, y = var_536_to_fp16)[name = tensor<string, []>("op_537_cast_fp16")];
+            tensor<string, []> var_539_equation_0 = const()[name = tensor<string, []>("op_539_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_539_cast_fp16 = einsum(equation = var_539_equation_0, values = (var_349_cast_fp16, var_251_cast_fp16))[name = tensor<string, []>("op_539_cast_fp16")];
+            tensor<fp16, []> var_540_to_fp16 = const()[name = tensor<string, []>("op_540_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_541_cast_fp16 = mul(x = var_539_cast_fp16, y = var_540_to_fp16)[name = tensor<string, []>("op_541_cast_fp16")];
+            tensor<string, []> var_543_equation_0 = const()[name = tensor<string, []>("op_543_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_543_cast_fp16 = einsum(equation = var_543_equation_0, values = (var_349_cast_fp16, var_255_cast_fp16))[name = tensor<string, []>("op_543_cast_fp16")];
+            tensor<fp16, []> var_544_to_fp16 = const()[name = tensor<string, []>("op_544_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_545_cast_fp16 = mul(x = var_543_cast_fp16, y = var_544_to_fp16)[name = tensor<string, []>("op_545_cast_fp16")];
+            tensor<string, []> var_547_equation_0 = const()[name = tensor<string, []>("op_547_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_547_cast_fp16 = einsum(equation = var_547_equation_0, values = (var_349_cast_fp16, var_259_cast_fp16))[name = tensor<string, []>("op_547_cast_fp16")];
+            tensor<fp16, []> var_548_to_fp16 = const()[name = tensor<string, []>("op_548_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_549_cast_fp16 = mul(x = var_547_cast_fp16, y = var_548_to_fp16)[name = tensor<string, []>("op_549_cast_fp16")];
+            tensor<string, []> var_551_equation_0 = const()[name = tensor<string, []>("op_551_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_551_cast_fp16 = einsum(equation = var_551_equation_0, values = (var_361_cast_fp16, var_263_cast_fp16))[name = tensor<string, []>("op_551_cast_fp16")];
+            tensor<fp16, []> var_552_to_fp16 = const()[name = tensor<string, []>("op_552_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_553_cast_fp16 = mul(x = var_551_cast_fp16, y = var_552_to_fp16)[name = tensor<string, []>("op_553_cast_fp16")];
+            tensor<string, []> var_555_equation_0 = const()[name = tensor<string, []>("op_555_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_555_cast_fp16 = einsum(equation = var_555_equation_0, values = (var_361_cast_fp16, var_267_cast_fp16))[name = tensor<string, []>("op_555_cast_fp16")];
+            tensor<fp16, []> var_556_to_fp16 = const()[name = tensor<string, []>("op_556_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_557_cast_fp16 = mul(x = var_555_cast_fp16, y = var_556_to_fp16)[name = tensor<string, []>("op_557_cast_fp16")];
+            tensor<string, []> var_559_equation_0 = const()[name = tensor<string, []>("op_559_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_559_cast_fp16 = einsum(equation = var_559_equation_0, values = (var_361_cast_fp16, var_271_cast_fp16))[name = tensor<string, []>("op_559_cast_fp16")];
+            tensor<fp16, []> var_560_to_fp16 = const()[name = tensor<string, []>("op_560_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_561_cast_fp16 = mul(x = var_559_cast_fp16, y = var_560_to_fp16)[name = tensor<string, []>("op_561_cast_fp16")];
+            tensor<string, []> var_563_equation_0 = const()[name = tensor<string, []>("op_563_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_563_cast_fp16 = einsum(equation = var_563_equation_0, values = (var_373_cast_fp16, var_275_cast_fp16))[name = tensor<string, []>("op_563_cast_fp16")];
+            tensor<fp16, []> var_564_to_fp16 = const()[name = tensor<string, []>("op_564_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_565_cast_fp16 = mul(x = var_563_cast_fp16, y = var_564_to_fp16)[name = tensor<string, []>("op_565_cast_fp16")];
+            tensor<string, []> var_567_equation_0 = const()[name = tensor<string, []>("op_567_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_567_cast_fp16 = einsum(equation = var_567_equation_0, values = (var_373_cast_fp16, var_279_cast_fp16))[name = tensor<string, []>("op_567_cast_fp16")];
+            tensor<fp16, []> var_568_to_fp16 = const()[name = tensor<string, []>("op_568_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_569_cast_fp16 = mul(x = var_567_cast_fp16, y = var_568_to_fp16)[name = tensor<string, []>("op_569_cast_fp16")];
+            tensor<string, []> var_571_equation_0 = const()[name = tensor<string, []>("op_571_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_571_cast_fp16 = einsum(equation = var_571_equation_0, values = (var_373_cast_fp16, var_283_cast_fp16))[name = tensor<string, []>("op_571_cast_fp16")];
+            tensor<fp16, []> var_572_to_fp16 = const()[name = tensor<string, []>("op_572_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_573_cast_fp16 = mul(x = var_571_cast_fp16, y = var_572_to_fp16)[name = tensor<string, []>("op_573_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_1_cast_fp16 = add(x = var_481_cast_fp16, y = mask)[name = tensor<string, []>("aw_1_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_3_cast_fp16 = add(x = var_485_cast_fp16, y = mask)[name = tensor<string, []>("aw_3_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_5_cast_fp16 = add(x = var_489_cast_fp16, y = mask)[name = tensor<string, []>("aw_5_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_7_cast_fp16 = add(x = var_493_cast_fp16, y = mask)[name = tensor<string, []>("aw_7_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_9_cast_fp16 = add(x = var_497_cast_fp16, y = mask)[name = tensor<string, []>("aw_9_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_11_cast_fp16 = add(x = var_501_cast_fp16, y = mask)[name = tensor<string, []>("aw_11_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_13_cast_fp16 = add(x = var_505_cast_fp16, y = mask)[name = tensor<string, []>("aw_13_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_15_cast_fp16 = add(x = var_509_cast_fp16, y = mask)[name = tensor<string, []>("aw_15_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_17_cast_fp16 = add(x = var_513_cast_fp16, y = mask)[name = tensor<string, []>("aw_17_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_19_cast_fp16 = add(x = var_517_cast_fp16, y = mask)[name = tensor<string, []>("aw_19_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_21_cast_fp16 = add(x = var_521_cast_fp16, y = mask)[name = tensor<string, []>("aw_21_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_23_cast_fp16 = add(x = var_525_cast_fp16, y = mask)[name = tensor<string, []>("aw_23_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_25_cast_fp16 = add(x = var_529_cast_fp16, y = mask)[name = tensor<string, []>("aw_25_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_27_cast_fp16 = add(x = var_533_cast_fp16, y = mask)[name = tensor<string, []>("aw_27_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_29_cast_fp16 = add(x = var_537_cast_fp16, y = mask)[name = tensor<string, []>("aw_29_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_31_cast_fp16 = add(x = var_541_cast_fp16, y = mask)[name = tensor<string, []>("aw_31_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_33_cast_fp16 = add(x = var_545_cast_fp16, y = mask)[name = tensor<string, []>("aw_33_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_35_cast_fp16 = add(x = var_549_cast_fp16, y = mask)[name = tensor<string, []>("aw_35_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_37_cast_fp16 = add(x = var_553_cast_fp16, y = mask)[name = tensor<string, []>("aw_37_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_39_cast_fp16 = add(x = var_557_cast_fp16, y = mask)[name = tensor<string, []>("aw_39_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_41_cast_fp16 = add(x = var_561_cast_fp16, y = mask)[name = tensor<string, []>("aw_41_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_43_cast_fp16 = add(x = var_565_cast_fp16, y = mask)[name = tensor<string, []>("aw_43_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_45_cast_fp16 = add(x = var_569_cast_fp16, y = mask)[name = tensor<string, []>("aw_45_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_47_cast_fp16 = add(x = var_573_cast_fp16, y = mask)[name = tensor<string, []>("aw_47_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_598_cast_fp16 = softmax(axis = var_52, x = aw_1_cast_fp16)[name = tensor<string, []>("op_598_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_599_cast_fp16 = softmax(axis = var_52, x = aw_3_cast_fp16)[name = tensor<string, []>("op_599_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_600_cast_fp16 = softmax(axis = var_52, x = aw_5_cast_fp16)[name = tensor<string, []>("op_600_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_601_cast_fp16 = softmax(axis = var_52, x = aw_7_cast_fp16)[name = tensor<string, []>("op_601_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_602_cast_fp16 = softmax(axis = var_52, x = aw_9_cast_fp16)[name = tensor<string, []>("op_602_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_603_cast_fp16 = softmax(axis = var_52, x = aw_11_cast_fp16)[name = tensor<string, []>("op_603_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_604_cast_fp16 = softmax(axis = var_52, x = aw_13_cast_fp16)[name = tensor<string, []>("op_604_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_605_cast_fp16 = softmax(axis = var_52, x = aw_15_cast_fp16)[name = tensor<string, []>("op_605_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_606_cast_fp16 = softmax(axis = var_52, x = aw_17_cast_fp16)[name = tensor<string, []>("op_606_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_607_cast_fp16 = softmax(axis = var_52, x = aw_19_cast_fp16)[name = tensor<string, []>("op_607_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_608_cast_fp16 = softmax(axis = var_52, x = aw_21_cast_fp16)[name = tensor<string, []>("op_608_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_609_cast_fp16 = softmax(axis = var_52, x = aw_23_cast_fp16)[name = tensor<string, []>("op_609_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_610_cast_fp16 = softmax(axis = var_52, x = aw_25_cast_fp16)[name = tensor<string, []>("op_610_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_611_cast_fp16 = softmax(axis = var_52, x = aw_27_cast_fp16)[name = tensor<string, []>("op_611_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_612_cast_fp16 = softmax(axis = var_52, x = aw_29_cast_fp16)[name = tensor<string, []>("op_612_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_613_cast_fp16 = softmax(axis = var_52, x = aw_31_cast_fp16)[name = tensor<string, []>("op_613_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_614_cast_fp16 = softmax(axis = var_52, x = aw_33_cast_fp16)[name = tensor<string, []>("op_614_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_615_cast_fp16 = softmax(axis = var_52, x = aw_35_cast_fp16)[name = tensor<string, []>("op_615_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_616_cast_fp16 = softmax(axis = var_52, x = aw_37_cast_fp16)[name = tensor<string, []>("op_616_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_617_cast_fp16 = softmax(axis = var_52, x = aw_39_cast_fp16)[name = tensor<string, []>("op_617_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_618_cast_fp16 = softmax(axis = var_52, x = aw_41_cast_fp16)[name = tensor<string, []>("op_618_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_619_cast_fp16 = softmax(axis = var_52, x = aw_43_cast_fp16)[name = tensor<string, []>("op_619_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_620_cast_fp16 = softmax(axis = var_52, x = aw_45_cast_fp16)[name = tensor<string, []>("op_620_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_621_cast_fp16 = softmax(axis = var_52, x = aw_47_cast_fp16)[name = tensor<string, []>("op_621_cast_fp16")];
+            tensor<string, []> var_623_equation_0 = const()[name = tensor<string, []>("op_623_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_623_cast_fp16 = einsum(equation = var_623_equation_0, values = (var_383_cast_fp16, var_598_cast_fp16))[name = tensor<string, []>("op_623_cast_fp16")];
+            tensor<string, []> var_625_equation_0 = const()[name = tensor<string, []>("op_625_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_625_cast_fp16 = einsum(equation = var_625_equation_0, values = (var_383_cast_fp16, var_599_cast_fp16))[name = tensor<string, []>("op_625_cast_fp16")];
+            tensor<string, []> var_627_equation_0 = const()[name = tensor<string, []>("op_627_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_627_cast_fp16 = einsum(equation = var_627_equation_0, values = (var_383_cast_fp16, var_600_cast_fp16))[name = tensor<string, []>("op_627_cast_fp16")];
+            tensor<string, []> var_629_equation_0 = const()[name = tensor<string, []>("op_629_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_629_cast_fp16 = einsum(equation = var_629_equation_0, values = (var_395_cast_fp16, var_601_cast_fp16))[name = tensor<string, []>("op_629_cast_fp16")];
+            tensor<string, []> var_631_equation_0 = const()[name = tensor<string, []>("op_631_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_631_cast_fp16 = einsum(equation = var_631_equation_0, values = (var_395_cast_fp16, var_602_cast_fp16))[name = tensor<string, []>("op_631_cast_fp16")];
+            tensor<string, []> var_633_equation_0 = const()[name = tensor<string, []>("op_633_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_633_cast_fp16 = einsum(equation = var_633_equation_0, values = (var_395_cast_fp16, var_603_cast_fp16))[name = tensor<string, []>("op_633_cast_fp16")];
+            tensor<string, []> var_635_equation_0 = const()[name = tensor<string, []>("op_635_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_635_cast_fp16 = einsum(equation = var_635_equation_0, values = (var_407_cast_fp16, var_604_cast_fp16))[name = tensor<string, []>("op_635_cast_fp16")];
+            tensor<string, []> var_637_equation_0 = const()[name = tensor<string, []>("op_637_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_637_cast_fp16 = einsum(equation = var_637_equation_0, values = (var_407_cast_fp16, var_605_cast_fp16))[name = tensor<string, []>("op_637_cast_fp16")];
+            tensor<string, []> var_639_equation_0 = const()[name = tensor<string, []>("op_639_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_639_cast_fp16 = einsum(equation = var_639_equation_0, values = (var_407_cast_fp16, var_606_cast_fp16))[name = tensor<string, []>("op_639_cast_fp16")];
+            tensor<string, []> var_641_equation_0 = const()[name = tensor<string, []>("op_641_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_641_cast_fp16 = einsum(equation = var_641_equation_0, values = (var_419_cast_fp16, var_607_cast_fp16))[name = tensor<string, []>("op_641_cast_fp16")];
+            tensor<string, []> var_643_equation_0 = const()[name = tensor<string, []>("op_643_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_643_cast_fp16 = einsum(equation = var_643_equation_0, values = (var_419_cast_fp16, var_608_cast_fp16))[name = tensor<string, []>("op_643_cast_fp16")];
+            tensor<string, []> var_645_equation_0 = const()[name = tensor<string, []>("op_645_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_645_cast_fp16 = einsum(equation = var_645_equation_0, values = (var_419_cast_fp16, var_609_cast_fp16))[name = tensor<string, []>("op_645_cast_fp16")];
+            tensor<string, []> var_647_equation_0 = const()[name = tensor<string, []>("op_647_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_647_cast_fp16 = einsum(equation = var_647_equation_0, values = (var_431_cast_fp16, var_610_cast_fp16))[name = tensor<string, []>("op_647_cast_fp16")];
+            tensor<string, []> var_649_equation_0 = const()[name = tensor<string, []>("op_649_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_649_cast_fp16 = einsum(equation = var_649_equation_0, values = (var_431_cast_fp16, var_611_cast_fp16))[name = tensor<string, []>("op_649_cast_fp16")];
+            tensor<string, []> var_651_equation_0 = const()[name = tensor<string, []>("op_651_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_651_cast_fp16 = einsum(equation = var_651_equation_0, values = (var_431_cast_fp16, var_612_cast_fp16))[name = tensor<string, []>("op_651_cast_fp16")];
+            tensor<string, []> var_653_equation_0 = const()[name = tensor<string, []>("op_653_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_653_cast_fp16 = einsum(equation = var_653_equation_0, values = (var_443_cast_fp16, var_613_cast_fp16))[name = tensor<string, []>("op_653_cast_fp16")];
+            tensor<string, []> var_655_equation_0 = const()[name = tensor<string, []>("op_655_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_655_cast_fp16 = einsum(equation = var_655_equation_0, values = (var_443_cast_fp16, var_614_cast_fp16))[name = tensor<string, []>("op_655_cast_fp16")];
+            tensor<string, []> var_657_equation_0 = const()[name = tensor<string, []>("op_657_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_657_cast_fp16 = einsum(equation = var_657_equation_0, values = (var_443_cast_fp16, var_615_cast_fp16))[name = tensor<string, []>("op_657_cast_fp16")];
+            tensor<string, []> var_659_equation_0 = const()[name = tensor<string, []>("op_659_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_659_cast_fp16 = einsum(equation = var_659_equation_0, values = (var_455_cast_fp16, var_616_cast_fp16))[name = tensor<string, []>("op_659_cast_fp16")];
+            tensor<string, []> var_661_equation_0 = const()[name = tensor<string, []>("op_661_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_661_cast_fp16 = einsum(equation = var_661_equation_0, values = (var_455_cast_fp16, var_617_cast_fp16))[name = tensor<string, []>("op_661_cast_fp16")];
+            tensor<string, []> var_663_equation_0 = const()[name = tensor<string, []>("op_663_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_663_cast_fp16 = einsum(equation = var_663_equation_0, values = (var_455_cast_fp16, var_618_cast_fp16))[name = tensor<string, []>("op_663_cast_fp16")];
+            tensor<string, []> var_665_equation_0 = const()[name = tensor<string, []>("op_665_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_665_cast_fp16 = einsum(equation = var_665_equation_0, values = (var_467_cast_fp16, var_619_cast_fp16))[name = tensor<string, []>("op_665_cast_fp16")];
+            tensor<string, []> var_667_equation_0 = const()[name = tensor<string, []>("op_667_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_667_cast_fp16 = einsum(equation = var_667_equation_0, values = (var_467_cast_fp16, var_620_cast_fp16))[name = tensor<string, []>("op_667_cast_fp16")];
+            tensor<string, []> var_669_equation_0 = const()[name = tensor<string, []>("op_669_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_669_cast_fp16 = einsum(equation = var_669_equation_0, values = (var_467_cast_fp16, var_621_cast_fp16))[name = tensor<string, []>("op_669_cast_fp16")];
+            tensor<bool, []> x_11_interleave_0 = const()[name = tensor<string, []>("x_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_11_cast_fp16 = concat(axis = var_52, interleave = x_11_interleave_0, values = (var_623_cast_fp16, var_625_cast_fp16, var_627_cast_fp16, var_629_cast_fp16, var_631_cast_fp16, var_633_cast_fp16, var_635_cast_fp16, var_637_cast_fp16, var_639_cast_fp16, var_641_cast_fp16, var_643_cast_fp16, var_645_cast_fp16, var_647_cast_fp16, var_649_cast_fp16, var_651_cast_fp16, var_653_cast_fp16, var_655_cast_fp16, var_657_cast_fp16, var_659_cast_fp16, var_661_cast_fp16, var_663_cast_fp16, var_665_cast_fp16, var_667_cast_fp16, var_669_cast_fp16))[name = tensor<string, []>("x_11_cast_fp16")];
+            tensor<int32, [4]> var_674 = const()[name = tensor<string, []>("op_674"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_3_cast_fp16 = reshape(shape = var_674, x = x_11_cast_fp16)[name = tensor<string, []>("input_3_cast_fp16")];
+            tensor<int32, [2]> var_677 = const()[name = tensor<string, []>("op_677"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_679 = const()[name = tensor<string, []>("op_679"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_1_pad_type_0 = const()[name = tensor<string, []>("attention_output_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_1_pad_0 = const()[name = tensor<string, []>("attention_output_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31463936)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_1_cast_fp16 = conv(dilations = var_679, groups = var_52, pad = attention_output_1_pad_0, pad_type = attention_output_1_pad_type_0, strides = var_677, weight = blocks_0_attn_proj_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_13_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = tensor<string, []>("x_13_cast_fp16")];
+            tensor<bool, []> x_eps_3_interleave_0 = const()[name = tensor<string, []>("x_eps_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_3_to_fp16 = const()[name = tensor<string, []>("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338368)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_3_cast_fp16 = concat(axis = var_52, interleave = x_eps_3_interleave_0, values = (x_13_cast_fp16, eps_chan_3_to_fp16))[name = tensor<string, []>("x_eps_3_cast_fp16")];
+            tensor<int32, [1]> norm_x_3_axes_0 = const()[name = tensor<string, []>("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_55, x = x_eps_3_cast_fp16)[name = tensor<string, []>("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_7_cast_fp16 = real_div(x = x_13_cast_fp16, y = norm_x_3_cast_fp16)[name = tensor<string, []>("x_normed_7_cast_fp16")];
+            tensor<fp16, []> var_705_to_fp16 = const()[name = tensor<string, []>("op_705_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_705_to_fp16)[name = tensor<string, []>("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338560)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_5_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
+            tensor<int32, [2]> var_716 = const()[name = tensor<string, []>("op_716"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_718 = const()[name = tensor<string, []>("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_7_pad_type_0 = const()[name = tensor<string, []>("input_7_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_7_pad_0 = const()[name = tensor<string, []>("input_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50344768)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_7_cast_fp16 = conv(dilations = var_718, groups = var_52, pad = input_7_pad_0, pad_type = input_7_pad_type_0, strides = var_716, weight = blocks_0_mlp_fc_1_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
+            tensor<int32, [2]> var_722 = const()[name = tensor<string, []>("op_722"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_724 = const()[name = tensor<string, []>("op_724"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_1_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_1_pad_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(100676480)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_1_cast_fp16 = conv(dilations = var_724, groups = var_52, pad = x_fc_2_1_pad_0, pad_type = x_fc_2_1_pad_type_0, strides = var_722, weight = blocks_0_mlp_fc_2_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_727_cast_fp16 = silu(x = input_7_cast_fp16)[name = tensor<string, []>("op_727_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_9_cast_fp16 = mul(x = var_727_cast_fp16, y = x_fc_2_1_cast_fp16)[name = tensor<string, []>("input_9_cast_fp16")];
+            tensor<int32, [2]> var_730 = const()[name = tensor<string, []>("op_730"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_732 = const()[name = tensor<string, []>("op_732"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_734_pad_type_0 = const()[name = tensor<string, []>("op_734_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_734_pad_0 = const()[name = tensor<string, []>("op_734_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_0_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151008192)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_734_cast_fp16 = conv(dilations = var_732, groups = var_52, pad = var_734_pad_0, pad_type = var_734_pad_type_0, strides = var_730, weight = blocks_0_mlp_proj_weight_to_fp16, x = input_9_cast_fp16)[name = tensor<string, []>("op_734_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_17_cast_fp16 = add(x = var_734_cast_fp16, y = x_13_cast_fp16)[name = tensor<string, []>("x_17_cast_fp16")];
+            tensor<int32, []> var_740 = const()[name = tensor<string, []>("op_740"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_744 = const()[name = tensor<string, []>("op_744"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_746 = const()[name = tensor<string, []>("op_746"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_779 = const()[name = tensor<string, []>("op_779"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_782 = const()[name = tensor<string, []>("op_782"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_5_interleave_0 = const()[name = tensor<string, []>("x_eps_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_5_to_fp16 = const()[name = tensor<string, []>("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201339904)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_5_cast_fp16 = concat(axis = var_779, interleave = x_eps_5_interleave_0, values = (x_17_cast_fp16, eps_chan_5_to_fp16))[name = tensor<string, []>("x_eps_5_cast_fp16")];
+            tensor<int32, [1]> norm_x_5_axes_0 = const()[name = tensor<string, []>("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_782, x = x_eps_5_cast_fp16)[name = tensor<string, []>("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_13_cast_fp16 = real_div(x = x_17_cast_fp16, y = norm_x_5_cast_fp16)[name = tensor<string, []>("x_normed_13_cast_fp16")];
+            tensor<fp16, []> var_805_to_fp16 = const()[name = tensor<string, []>("op_805_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_805_to_fp16)[name = tensor<string, []>("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201340096)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_21_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = tensor<string, []>("x_21_cast_fp16")];
+            tensor<int32, [4]> var_829 = const()[name = tensor<string, []>("op_829"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_11_cast_fp16 = reshape(shape = var_829, x = x_21_cast_fp16)[name = tensor<string, []>("input_11_cast_fp16")];
+            tensor<int32, [2]> var_832 = const()[name = tensor<string, []>("op_832"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_834 = const()[name = tensor<string, []>("op_834"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_9_pad_type_0 = const()[name = tensor<string, []>("q_9_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_9_pad_0 = const()[name = tensor<string, []>("q_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201346304)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_9_cast_fp16 = conv(dilations = var_834, groups = var_779, pad = q_9_pad_0, pad_type = q_9_pad_type_0, strides = var_832, weight = blocks_1_attn_q_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("q_9_cast_fp16")];
+            tensor<int32, [2]> var_838 = const()[name = tensor<string, []>("op_838"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_840 = const()[name = tensor<string, []>("op_840"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_13_pad_type_0 = const()[name = tensor<string, []>("k_13_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_13_pad_0 = const()[name = tensor<string, []>("k_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(220220736)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_13_cast_fp16 = conv(dilations = var_840, groups = var_779, pad = k_13_pad_0, pad_type = k_13_pad_type_0, strides = var_838, weight = blocks_1_attn_k_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("k_13_cast_fp16")];
+            tensor<int32, [2]> var_844 = const()[name = tensor<string, []>("op_844"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_846 = const()[name = tensor<string, []>("op_846"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_11_pad_type_0 = const()[name = tensor<string, []>("v_11_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_11_pad_0 = const()[name = tensor<string, []>("v_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226512256)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_11_cast_fp16 = conv(dilations = var_846, groups = var_779, pad = v_11_pad_0, pad_type = v_11_pad_type_0, strides = var_844, weight = blocks_1_attn_v_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("v_11_cast_fp16")];
+            tensor<int32, [4]> var_849 = const()[name = tensor<string, []>("op_849"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_11_cast_fp16 = reshape(shape = var_849, x = q_9_cast_fp16)[name = tensor<string, []>("q_11_cast_fp16")];
+            tensor<int32, [4]> var_851 = const()[name = tensor<string, []>("op_851"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_15_cast_fp16 = reshape(shape = var_851, x = k_13_cast_fp16)[name = tensor<string, []>("k_15_cast_fp16")];
+            tensor<int32, [4]> var_865_begin_0 = const()[name = tensor<string, []>("op_865_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_865_end_0 = const()[name = tensor<string, []>("op_865_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_865_end_mask_0 = const()[name = tensor<string, []>("op_865_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_865_cast_fp16 = slice_by_index(begin = var_865_begin_0, end = var_865_end_0, end_mask = var_865_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_865_cast_fp16")];
+            tensor<int32, [4]> var_871_begin_0 = const()[name = tensor<string, []>("op_871_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_871_end_0 = const()[name = tensor<string, []>("op_871_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_871_end_mask_0 = const()[name = tensor<string, []>("op_871_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_871_cast_fp16 = slice_by_index(begin = var_871_begin_0, end = var_871_end_0, end_mask = var_871_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_871_cast_fp16")];
+            tensor<fp16, []> const_30_promoted_to_fp16 = const()[name = tensor<string, []>("const_30_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_873_cast_fp16 = mul(x = var_871_cast_fp16, y = const_30_promoted_to_fp16)[name = tensor<string, []>("op_873_cast_fp16")];
+            tensor<bool, []> rotated_5_interleave_0 = const()[name = tensor<string, []>("rotated_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_5_cast_fp16 = concat(axis = var_744, interleave = rotated_5_interleave_0, values = (var_873_cast_fp16, var_865_cast_fp16))[name = tensor<string, []>("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_876_cast_fp16 = mul(x = q_11_cast_fp16, y = cos)[name = tensor<string, []>("op_876_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_877_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = tensor<string, []>("op_877_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_5_cast_fp16 = add(x = var_876_cast_fp16, y = var_877_cast_fp16)[name = tensor<string, []>("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_890_begin_0 = const()[name = tensor<string, []>("op_890_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_890_end_0 = const()[name = tensor<string, []>("op_890_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_890_end_mask_0 = const()[name = tensor<string, []>("op_890_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_890_cast_fp16 = slice_by_index(begin = var_890_begin_0, end = var_890_end_0, end_mask = var_890_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_890_cast_fp16")];
+            tensor<int32, [4]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_896_cast_fp16 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_896_cast_fp16")];
+            tensor<fp16, []> const_32_promoted_to_fp16 = const()[name = tensor<string, []>("const_32_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_898_cast_fp16 = mul(x = var_896_cast_fp16, y = const_32_promoted_to_fp16)[name = tensor<string, []>("op_898_cast_fp16")];
+            tensor<bool, []> rotated_interleave_0 = const()[name = tensor<string, []>("rotated_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_cast_fp16 = concat(axis = var_744, interleave = rotated_interleave_0, values = (var_898_cast_fp16, var_890_cast_fp16))[name = tensor<string, []>("rotated_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_901_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = tensor<string, []>("op_901_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_902_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = tensor<string, []>("op_902_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_cast_fp16 = add(x = var_901_cast_fp16, y = var_902_cast_fp16)[name = tensor<string, []>("roped_cast_fp16")];
+            tensor<int32, [4]> var_905 = const()[name = tensor<string, []>("op_905"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_19_cast_fp16 = reshape(shape = var_905, x = roped_cast_fp16)[name = tensor<string, []>("k_19_cast_fp16")];
+            tensor<int32, [4]> var_907 = const()[name = tensor<string, []>("op_907"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_1 = reshape(shape = var_907, x = v_11_cast_fp16)[name = tensor<string, []>("new_v_cache_1_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_interleave_0 = const()[name = tensor<string, []>("k_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_1 = transpose(perm = k_21_perm_0, x = k_19_cast_fp16)[name = tensor<string, []>("transpose_0")];
+            tensor<fp16, [1, 512, 1, 1024]> k_cast_fp16 = concat(axis = var_746, interleave = k_interleave_0, values = (k_cache_1, new_k_cache_1))[name = tensor<string, []>("k_cast_fp16")];
+            tensor<bool, []> v_17_interleave_0 = const()[name = tensor<string, []>("v_17_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_17_cast_fp16 = concat(axis = var_740, interleave = v_17_interleave_0, values = (v_cache_1, new_v_cache_1))[name = tensor<string, []>("v_17_cast_fp16")];
+            tensor<int32, [4]> var_915 = const()[name = tensor<string, []>("op_915"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_cast_fp16 = reshape(shape = var_915, x = roped_5_cast_fp16)[name = tensor<string, []>("q_cast_fp16")];
+            tensor<int32, [4]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_920_cast_fp16 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_920_cast_fp16")];
+            tensor<int32, [4]> var_924_begin_0 = const()[name = tensor<string, []>("op_924_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_924_end_0 = const()[name = tensor<string, []>("op_924_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_924_end_mask_0 = const()[name = tensor<string, []>("op_924_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_924_cast_fp16 = slice_by_index(begin = var_924_begin_0, end = var_924_end_0, end_mask = var_924_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_924_cast_fp16")];
+            tensor<int32, [4]> var_928_begin_0 = const()[name = tensor<string, []>("op_928_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_928_end_0 = const()[name = tensor<string, []>("op_928_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_928_end_mask_0 = const()[name = tensor<string, []>("op_928_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_928_cast_fp16 = slice_by_index(begin = var_928_begin_0, end = var_928_end_0, end_mask = var_928_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_928_cast_fp16")];
+            tensor<int32, [4]> var_932_begin_0 = const()[name = tensor<string, []>("op_932_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_932_end_0 = const()[name = tensor<string, []>("op_932_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_932_end_mask_0 = const()[name = tensor<string, []>("op_932_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_932_cast_fp16 = slice_by_index(begin = var_932_begin_0, end = var_932_end_0, end_mask = var_932_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_932_cast_fp16")];
+            tensor<int32, [4]> var_936_begin_0 = const()[name = tensor<string, []>("op_936_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_936_end_0 = const()[name = tensor<string, []>("op_936_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_936_end_mask_0 = const()[name = tensor<string, []>("op_936_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_936_cast_fp16 = slice_by_index(begin = var_936_begin_0, end = var_936_end_0, end_mask = var_936_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_936_cast_fp16")];
+            tensor<int32, [4]> var_940_begin_0 = const()[name = tensor<string, []>("op_940_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_940_end_0 = const()[name = tensor<string, []>("op_940_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_940_end_mask_0 = const()[name = tensor<string, []>("op_940_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_940_cast_fp16 = slice_by_index(begin = var_940_begin_0, end = var_940_end_0, end_mask = var_940_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_940_cast_fp16")];
+            tensor<int32, [4]> var_944_begin_0 = const()[name = tensor<string, []>("op_944_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_944_end_0 = const()[name = tensor<string, []>("op_944_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_944_end_mask_0 = const()[name = tensor<string, []>("op_944_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_944_cast_fp16 = slice_by_index(begin = var_944_begin_0, end = var_944_end_0, end_mask = var_944_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_944_cast_fp16")];
+            tensor<int32, [4]> var_948_begin_0 = const()[name = tensor<string, []>("op_948_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_948_end_0 = const()[name = tensor<string, []>("op_948_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_948_end_mask_0 = const()[name = tensor<string, []>("op_948_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_948_cast_fp16 = slice_by_index(begin = var_948_begin_0, end = var_948_end_0, end_mask = var_948_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_948_cast_fp16")];
+            tensor<int32, [4]> var_952_begin_0 = const()[name = tensor<string, []>("op_952_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_952_end_0 = const()[name = tensor<string, []>("op_952_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_952_end_mask_0 = const()[name = tensor<string, []>("op_952_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_952_cast_fp16 = slice_by_index(begin = var_952_begin_0, end = var_952_end_0, end_mask = var_952_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_952_cast_fp16")];
+            tensor<int32, [4]> var_956_begin_0 = const()[name = tensor<string, []>("op_956_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_956_end_0 = const()[name = tensor<string, []>("op_956_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_956_end_mask_0 = const()[name = tensor<string, []>("op_956_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_956_cast_fp16 = slice_by_index(begin = var_956_begin_0, end = var_956_end_0, end_mask = var_956_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_956_cast_fp16")];
+            tensor<int32, [4]> var_960_begin_0 = const()[name = tensor<string, []>("op_960_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_960_end_0 = const()[name = tensor<string, []>("op_960_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_960_end_mask_0 = const()[name = tensor<string, []>("op_960_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_960_cast_fp16 = slice_by_index(begin = var_960_begin_0, end = var_960_end_0, end_mask = var_960_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_960_cast_fp16")];
+            tensor<int32, [4]> var_964_begin_0 = const()[name = tensor<string, []>("op_964_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_964_end_0 = const()[name = tensor<string, []>("op_964_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_964_end_mask_0 = const()[name = tensor<string, []>("op_964_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_964_cast_fp16 = slice_by_index(begin = var_964_begin_0, end = var_964_end_0, end_mask = var_964_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_964_cast_fp16")];
+            tensor<int32, [4]> var_968_begin_0 = const()[name = tensor<string, []>("op_968_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_968_end_0 = const()[name = tensor<string, []>("op_968_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_968_end_mask_0 = const()[name = tensor<string, []>("op_968_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_968_cast_fp16 = slice_by_index(begin = var_968_begin_0, end = var_968_end_0, end_mask = var_968_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_968_cast_fp16")];
+            tensor<int32, [4]> var_972_begin_0 = const()[name = tensor<string, []>("op_972_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_972_end_0 = const()[name = tensor<string, []>("op_972_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_972_end_mask_0 = const()[name = tensor<string, []>("op_972_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_972_cast_fp16 = slice_by_index(begin = var_972_begin_0, end = var_972_end_0, end_mask = var_972_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_972_cast_fp16")];
+            tensor<int32, [4]> var_976_begin_0 = const()[name = tensor<string, []>("op_976_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_976_end_0 = const()[name = tensor<string, []>("op_976_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_976_end_mask_0 = const()[name = tensor<string, []>("op_976_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_976_cast_fp16 = slice_by_index(begin = var_976_begin_0, end = var_976_end_0, end_mask = var_976_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_976_cast_fp16")];
+            tensor<int32, [4]> var_980_begin_0 = const()[name = tensor<string, []>("op_980_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_980_end_0 = const()[name = tensor<string, []>("op_980_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_980_end_mask_0 = const()[name = tensor<string, []>("op_980_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_980_cast_fp16 = slice_by_index(begin = var_980_begin_0, end = var_980_end_0, end_mask = var_980_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_980_cast_fp16")];
+            tensor<int32, [4]> var_984_begin_0 = const()[name = tensor<string, []>("op_984_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_984_end_0 = const()[name = tensor<string, []>("op_984_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_984_end_mask_0 = const()[name = tensor<string, []>("op_984_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_984_cast_fp16 = slice_by_index(begin = var_984_begin_0, end = var_984_end_0, end_mask = var_984_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_984_cast_fp16")];
+            tensor<int32, [4]> var_988_begin_0 = const()[name = tensor<string, []>("op_988_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_988_end_0 = const()[name = tensor<string, []>("op_988_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_988_end_mask_0 = const()[name = tensor<string, []>("op_988_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_988_cast_fp16 = slice_by_index(begin = var_988_begin_0, end = var_988_end_0, end_mask = var_988_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_988_cast_fp16")];
+            tensor<int32, [4]> var_992_begin_0 = const()[name = tensor<string, []>("op_992_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_992_end_0 = const()[name = tensor<string, []>("op_992_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_992_end_mask_0 = const()[name = tensor<string, []>("op_992_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_992_cast_fp16 = slice_by_index(begin = var_992_begin_0, end = var_992_end_0, end_mask = var_992_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_992_cast_fp16")];
+            tensor<int32, [4]> var_996_begin_0 = const()[name = tensor<string, []>("op_996_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_996_end_0 = const()[name = tensor<string, []>("op_996_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_996_end_mask_0 = const()[name = tensor<string, []>("op_996_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_996_cast_fp16 = slice_by_index(begin = var_996_begin_0, end = var_996_end_0, end_mask = var_996_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_996_cast_fp16")];
+            tensor<int32, [4]> var_1000_begin_0 = const()[name = tensor<string, []>("op_1000_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_1000_end_0 = const()[name = tensor<string, []>("op_1000_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_1000_end_mask_0 = const()[name = tensor<string, []>("op_1000_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1000_cast_fp16 = slice_by_index(begin = var_1000_begin_0, end = var_1000_end_0, end_mask = var_1000_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1000_cast_fp16")];
+            tensor<int32, [4]> var_1004_begin_0 = const()[name = tensor<string, []>("op_1004_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_1004_end_0 = const()[name = tensor<string, []>("op_1004_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_1004_end_mask_0 = const()[name = tensor<string, []>("op_1004_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1004_cast_fp16 = slice_by_index(begin = var_1004_begin_0, end = var_1004_end_0, end_mask = var_1004_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1004_cast_fp16")];
+            tensor<int32, [4]> var_1008_begin_0 = const()[name = tensor<string, []>("op_1008_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_1008_end_0 = const()[name = tensor<string, []>("op_1008_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_1008_end_mask_0 = const()[name = tensor<string, []>("op_1008_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1008_cast_fp16 = slice_by_index(begin = var_1008_begin_0, end = var_1008_end_0, end_mask = var_1008_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1008_cast_fp16")];
+            tensor<int32, [4]> var_1012_begin_0 = const()[name = tensor<string, []>("op_1012_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_1012_end_0 = const()[name = tensor<string, []>("op_1012_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_1012_end_mask_0 = const()[name = tensor<string, []>("op_1012_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1012_cast_fp16 = slice_by_index(begin = var_1012_begin_0, end = var_1012_end_0, end_mask = var_1012_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1012_cast_fp16")];
+            tensor<int32, [4]> var_1018_begin_0 = const()[name = tensor<string, []>("op_1018_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1018_end_0 = const()[name = tensor<string, []>("op_1018_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_1018_end_mask_0 = const()[name = tensor<string, []>("op_1018_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1018_cast_fp16 = slice_by_index(begin = var_1018_begin_0, end = var_1018_end_0, end_mask = var_1018_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1018_cast_fp16")];
+            tensor<int32, [4]> var_1030_begin_0 = const()[name = tensor<string, []>("op_1030_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_1030_end_0 = const()[name = tensor<string, []>("op_1030_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_1030_end_mask_0 = const()[name = tensor<string, []>("op_1030_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1030_cast_fp16 = slice_by_index(begin = var_1030_begin_0, end = var_1030_end_0, end_mask = var_1030_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1030_cast_fp16")];
+            tensor<int32, [4]> var_1042_begin_0 = const()[name = tensor<string, []>("op_1042_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_1042_end_0 = const()[name = tensor<string, []>("op_1042_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_1042_end_mask_0 = const()[name = tensor<string, []>("op_1042_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1042_cast_fp16 = slice_by_index(begin = var_1042_begin_0, end = var_1042_end_0, end_mask = var_1042_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1042_cast_fp16")];
+            tensor<int32, [4]> var_1054_begin_0 = const()[name = tensor<string, []>("op_1054_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_1054_end_0 = const()[name = tensor<string, []>("op_1054_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1054_end_mask_0 = const()[name = tensor<string, []>("op_1054_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1054_cast_fp16 = slice_by_index(begin = var_1054_begin_0, end = var_1054_end_0, end_mask = var_1054_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1054_cast_fp16")];
+            tensor<int32, [4]> var_1066_begin_0 = const()[name = tensor<string, []>("op_1066_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_1066_end_0 = const()[name = tensor<string, []>("op_1066_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_1066_end_mask_0 = const()[name = tensor<string, []>("op_1066_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1066_cast_fp16 = slice_by_index(begin = var_1066_begin_0, end = var_1066_end_0, end_mask = var_1066_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1066_cast_fp16")];
+            tensor<int32, [4]> var_1078_begin_0 = const()[name = tensor<string, []>("op_1078_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_1078_end_0 = const()[name = tensor<string, []>("op_1078_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_1078_end_mask_0 = const()[name = tensor<string, []>("op_1078_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1078_cast_fp16 = slice_by_index(begin = var_1078_begin_0, end = var_1078_end_0, end_mask = var_1078_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1078_cast_fp16")];
+            tensor<int32, [4]> var_1090_begin_0 = const()[name = tensor<string, []>("op_1090_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_1090_end_0 = const()[name = tensor<string, []>("op_1090_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_1090_end_mask_0 = const()[name = tensor<string, []>("op_1090_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1090_cast_fp16 = slice_by_index(begin = var_1090_begin_0, end = var_1090_end_0, end_mask = var_1090_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1090_cast_fp16")];
+            tensor<int32, [4]> var_1102_begin_0 = const()[name = tensor<string, []>("op_1102_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_1102_end_0 = const()[name = tensor<string, []>("op_1102_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_1102_end_mask_0 = const()[name = tensor<string, []>("op_1102_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1102_cast_fp16 = slice_by_index(begin = var_1102_begin_0, end = var_1102_end_0, end_mask = var_1102_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1102_cast_fp16")];
+            tensor<int32, [4]> var_1112_begin_0 = const()[name = tensor<string, []>("op_1112_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1112_end_0 = const()[name = tensor<string, []>("op_1112_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_1112_end_mask_0 = const()[name = tensor<string, []>("op_1112_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1112_cast_fp16 = slice_by_index(begin = var_1112_begin_0, end = var_1112_end_0, end_mask = var_1112_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1112_cast_fp16")];
+            tensor<int32, [4]> var_1124_begin_0 = const()[name = tensor<string, []>("op_1124_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_1124_end_0 = const()[name = tensor<string, []>("op_1124_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_1124_end_mask_0 = const()[name = tensor<string, []>("op_1124_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1124_cast_fp16 = slice_by_index(begin = var_1124_begin_0, end = var_1124_end_0, end_mask = var_1124_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1124_cast_fp16")];
+            tensor<int32, [4]> var_1136_begin_0 = const()[name = tensor<string, []>("op_1136_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_1136_end_0 = const()[name = tensor<string, []>("op_1136_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_1136_end_mask_0 = const()[name = tensor<string, []>("op_1136_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1136_cast_fp16 = slice_by_index(begin = var_1136_begin_0, end = var_1136_end_0, end_mask = var_1136_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1136_cast_fp16")];
+            tensor<int32, [4]> var_1148_begin_0 = const()[name = tensor<string, []>("op_1148_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_1148_end_0 = const()[name = tensor<string, []>("op_1148_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1148_end_mask_0 = const()[name = tensor<string, []>("op_1148_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1148_cast_fp16 = slice_by_index(begin = var_1148_begin_0, end = var_1148_end_0, end_mask = var_1148_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1148_cast_fp16")];
+            tensor<int32, [4]> var_1160_begin_0 = const()[name = tensor<string, []>("op_1160_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_1160_end_0 = const()[name = tensor<string, []>("op_1160_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_1160_end_mask_0 = const()[name = tensor<string, []>("op_1160_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1160_cast_fp16 = slice_by_index(begin = var_1160_begin_0, end = var_1160_end_0, end_mask = var_1160_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1160_cast_fp16")];
+            tensor<int32, [4]> var_1172_begin_0 = const()[name = tensor<string, []>("op_1172_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_1172_end_0 = const()[name = tensor<string, []>("op_1172_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_1172_end_mask_0 = const()[name = tensor<string, []>("op_1172_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1172_cast_fp16 = slice_by_index(begin = var_1172_begin_0, end = var_1172_end_0, end_mask = var_1172_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1172_cast_fp16")];
+            tensor<int32, [4]> var_1184_begin_0 = const()[name = tensor<string, []>("op_1184_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_1184_end_0 = const()[name = tensor<string, []>("op_1184_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_1184_end_mask_0 = const()[name = tensor<string, []>("op_1184_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1184_cast_fp16 = slice_by_index(begin = var_1184_begin_0, end = var_1184_end_0, end_mask = var_1184_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1184_cast_fp16")];
+            tensor<int32, [4]> var_1196_begin_0 = const()[name = tensor<string, []>("op_1196_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_1196_end_0 = const()[name = tensor<string, []>("op_1196_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_1196_end_mask_0 = const()[name = tensor<string, []>("op_1196_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1196_cast_fp16 = slice_by_index(begin = var_1196_begin_0, end = var_1196_end_0, end_mask = var_1196_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1196_cast_fp16")];
+            tensor<string, []> var_1208_equation_0 = const()[name = tensor<string, []>("op_1208_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1208_cast_fp16 = einsum(equation = var_1208_equation_0, values = (var_1018_cast_fp16, var_920_cast_fp16))[name = tensor<string, []>("op_1208_cast_fp16")];
+            tensor<fp16, []> var_1209_to_fp16 = const()[name = tensor<string, []>("op_1209_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1210_cast_fp16 = mul(x = var_1208_cast_fp16, y = var_1209_to_fp16)[name = tensor<string, []>("op_1210_cast_fp16")];
+            tensor<string, []> var_1212_equation_0 = const()[name = tensor<string, []>("op_1212_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1212_cast_fp16 = einsum(equation = var_1212_equation_0, values = (var_1018_cast_fp16, var_924_cast_fp16))[name = tensor<string, []>("op_1212_cast_fp16")];
+            tensor<fp16, []> var_1213_to_fp16 = const()[name = tensor<string, []>("op_1213_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1214_cast_fp16 = mul(x = var_1212_cast_fp16, y = var_1213_to_fp16)[name = tensor<string, []>("op_1214_cast_fp16")];
+            tensor<string, []> var_1216_equation_0 = const()[name = tensor<string, []>("op_1216_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1216_cast_fp16 = einsum(equation = var_1216_equation_0, values = (var_1018_cast_fp16, var_928_cast_fp16))[name = tensor<string, []>("op_1216_cast_fp16")];
+            tensor<fp16, []> var_1217_to_fp16 = const()[name = tensor<string, []>("op_1217_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1218_cast_fp16 = mul(x = var_1216_cast_fp16, y = var_1217_to_fp16)[name = tensor<string, []>("op_1218_cast_fp16")];
+            tensor<string, []> var_1220_equation_0 = const()[name = tensor<string, []>("op_1220_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1220_cast_fp16 = einsum(equation = var_1220_equation_0, values = (var_1030_cast_fp16, var_932_cast_fp16))[name = tensor<string, []>("op_1220_cast_fp16")];
+            tensor<fp16, []> var_1221_to_fp16 = const()[name = tensor<string, []>("op_1221_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1222_cast_fp16 = mul(x = var_1220_cast_fp16, y = var_1221_to_fp16)[name = tensor<string, []>("op_1222_cast_fp16")];
+            tensor<string, []> var_1224_equation_0 = const()[name = tensor<string, []>("op_1224_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1224_cast_fp16 = einsum(equation = var_1224_equation_0, values = (var_1030_cast_fp16, var_936_cast_fp16))[name = tensor<string, []>("op_1224_cast_fp16")];
+            tensor<fp16, []> var_1225_to_fp16 = const()[name = tensor<string, []>("op_1225_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1226_cast_fp16 = mul(x = var_1224_cast_fp16, y = var_1225_to_fp16)[name = tensor<string, []>("op_1226_cast_fp16")];
+            tensor<string, []> var_1228_equation_0 = const()[name = tensor<string, []>("op_1228_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1228_cast_fp16 = einsum(equation = var_1228_equation_0, values = (var_1030_cast_fp16, var_940_cast_fp16))[name = tensor<string, []>("op_1228_cast_fp16")];
+            tensor<fp16, []> var_1229_to_fp16 = const()[name = tensor<string, []>("op_1229_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1230_cast_fp16 = mul(x = var_1228_cast_fp16, y = var_1229_to_fp16)[name = tensor<string, []>("op_1230_cast_fp16")];
+            tensor<string, []> var_1232_equation_0 = const()[name = tensor<string, []>("op_1232_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1232_cast_fp16 = einsum(equation = var_1232_equation_0, values = (var_1042_cast_fp16, var_944_cast_fp16))[name = tensor<string, []>("op_1232_cast_fp16")];
+            tensor<fp16, []> var_1233_to_fp16 = const()[name = tensor<string, []>("op_1233_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1234_cast_fp16 = mul(x = var_1232_cast_fp16, y = var_1233_to_fp16)[name = tensor<string, []>("op_1234_cast_fp16")];
+            tensor<string, []> var_1236_equation_0 = const()[name = tensor<string, []>("op_1236_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1236_cast_fp16 = einsum(equation = var_1236_equation_0, values = (var_1042_cast_fp16, var_948_cast_fp16))[name = tensor<string, []>("op_1236_cast_fp16")];
+            tensor<fp16, []> var_1237_to_fp16 = const()[name = tensor<string, []>("op_1237_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1238_cast_fp16 = mul(x = var_1236_cast_fp16, y = var_1237_to_fp16)[name = tensor<string, []>("op_1238_cast_fp16")];
+            tensor<string, []> var_1240_equation_0 = const()[name = tensor<string, []>("op_1240_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1240_cast_fp16 = einsum(equation = var_1240_equation_0, values = (var_1042_cast_fp16, var_952_cast_fp16))[name = tensor<string, []>("op_1240_cast_fp16")];
+            tensor<fp16, []> var_1241_to_fp16 = const()[name = tensor<string, []>("op_1241_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1242_cast_fp16 = mul(x = var_1240_cast_fp16, y = var_1241_to_fp16)[name = tensor<string, []>("op_1242_cast_fp16")];
+            tensor<string, []> var_1244_equation_0 = const()[name = tensor<string, []>("op_1244_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1244_cast_fp16 = einsum(equation = var_1244_equation_0, values = (var_1054_cast_fp16, var_956_cast_fp16))[name = tensor<string, []>("op_1244_cast_fp16")];
+            tensor<fp16, []> var_1245_to_fp16 = const()[name = tensor<string, []>("op_1245_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1246_cast_fp16 = mul(x = var_1244_cast_fp16, y = var_1245_to_fp16)[name = tensor<string, []>("op_1246_cast_fp16")];
+            tensor<string, []> var_1248_equation_0 = const()[name = tensor<string, []>("op_1248_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1248_cast_fp16 = einsum(equation = var_1248_equation_0, values = (var_1054_cast_fp16, var_960_cast_fp16))[name = tensor<string, []>("op_1248_cast_fp16")];
+            tensor<fp16, []> var_1249_to_fp16 = const()[name = tensor<string, []>("op_1249_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1250_cast_fp16 = mul(x = var_1248_cast_fp16, y = var_1249_to_fp16)[name = tensor<string, []>("op_1250_cast_fp16")];
+            tensor<string, []> var_1252_equation_0 = const()[name = tensor<string, []>("op_1252_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1252_cast_fp16 = einsum(equation = var_1252_equation_0, values = (var_1054_cast_fp16, var_964_cast_fp16))[name = tensor<string, []>("op_1252_cast_fp16")];
+            tensor<fp16, []> var_1253_to_fp16 = const()[name = tensor<string, []>("op_1253_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1254_cast_fp16 = mul(x = var_1252_cast_fp16, y = var_1253_to_fp16)[name = tensor<string, []>("op_1254_cast_fp16")];
+            tensor<string, []> var_1256_equation_0 = const()[name = tensor<string, []>("op_1256_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1256_cast_fp16 = einsum(equation = var_1256_equation_0, values = (var_1066_cast_fp16, var_968_cast_fp16))[name = tensor<string, []>("op_1256_cast_fp16")];
+            tensor<fp16, []> var_1257_to_fp16 = const()[name = tensor<string, []>("op_1257_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1258_cast_fp16 = mul(x = var_1256_cast_fp16, y = var_1257_to_fp16)[name = tensor<string, []>("op_1258_cast_fp16")];
+            tensor<string, []> var_1260_equation_0 = const()[name = tensor<string, []>("op_1260_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1260_cast_fp16 = einsum(equation = var_1260_equation_0, values = (var_1066_cast_fp16, var_972_cast_fp16))[name = tensor<string, []>("op_1260_cast_fp16")];
+            tensor<fp16, []> var_1261_to_fp16 = const()[name = tensor<string, []>("op_1261_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1262_cast_fp16 = mul(x = var_1260_cast_fp16, y = var_1261_to_fp16)[name = tensor<string, []>("op_1262_cast_fp16")];
+            tensor<string, []> var_1264_equation_0 = const()[name = tensor<string, []>("op_1264_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1264_cast_fp16 = einsum(equation = var_1264_equation_0, values = (var_1066_cast_fp16, var_976_cast_fp16))[name = tensor<string, []>("op_1264_cast_fp16")];
+            tensor<fp16, []> var_1265_to_fp16 = const()[name = tensor<string, []>("op_1265_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1266_cast_fp16 = mul(x = var_1264_cast_fp16, y = var_1265_to_fp16)[name = tensor<string, []>("op_1266_cast_fp16")];
+            tensor<string, []> var_1268_equation_0 = const()[name = tensor<string, []>("op_1268_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1268_cast_fp16 = einsum(equation = var_1268_equation_0, values = (var_1078_cast_fp16, var_980_cast_fp16))[name = tensor<string, []>("op_1268_cast_fp16")];
+            tensor<fp16, []> var_1269_to_fp16 = const()[name = tensor<string, []>("op_1269_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1270_cast_fp16 = mul(x = var_1268_cast_fp16, y = var_1269_to_fp16)[name = tensor<string, []>("op_1270_cast_fp16")];
+            tensor<string, []> var_1272_equation_0 = const()[name = tensor<string, []>("op_1272_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1272_cast_fp16 = einsum(equation = var_1272_equation_0, values = (var_1078_cast_fp16, var_984_cast_fp16))[name = tensor<string, []>("op_1272_cast_fp16")];
+            tensor<fp16, []> var_1273_to_fp16 = const()[name = tensor<string, []>("op_1273_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1274_cast_fp16 = mul(x = var_1272_cast_fp16, y = var_1273_to_fp16)[name = tensor<string, []>("op_1274_cast_fp16")];
+            tensor<string, []> var_1276_equation_0 = const()[name = tensor<string, []>("op_1276_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1276_cast_fp16 = einsum(equation = var_1276_equation_0, values = (var_1078_cast_fp16, var_988_cast_fp16))[name = tensor<string, []>("op_1276_cast_fp16")];
+            tensor<fp16, []> var_1277_to_fp16 = const()[name = tensor<string, []>("op_1277_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1278_cast_fp16 = mul(x = var_1276_cast_fp16, y = var_1277_to_fp16)[name = tensor<string, []>("op_1278_cast_fp16")];
+            tensor<string, []> var_1280_equation_0 = const()[name = tensor<string, []>("op_1280_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1280_cast_fp16 = einsum(equation = var_1280_equation_0, values = (var_1090_cast_fp16, var_992_cast_fp16))[name = tensor<string, []>("op_1280_cast_fp16")];
+            tensor<fp16, []> var_1281_to_fp16 = const()[name = tensor<string, []>("op_1281_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1282_cast_fp16 = mul(x = var_1280_cast_fp16, y = var_1281_to_fp16)[name = tensor<string, []>("op_1282_cast_fp16")];
+            tensor<string, []> var_1284_equation_0 = const()[name = tensor<string, []>("op_1284_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1284_cast_fp16 = einsum(equation = var_1284_equation_0, values = (var_1090_cast_fp16, var_996_cast_fp16))[name = tensor<string, []>("op_1284_cast_fp16")];
+            tensor<fp16, []> var_1285_to_fp16 = const()[name = tensor<string, []>("op_1285_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1286_cast_fp16 = mul(x = var_1284_cast_fp16, y = var_1285_to_fp16)[name = tensor<string, []>("op_1286_cast_fp16")];
+            tensor<string, []> var_1288_equation_0 = const()[name = tensor<string, []>("op_1288_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1288_cast_fp16 = einsum(equation = var_1288_equation_0, values = (var_1090_cast_fp16, var_1000_cast_fp16))[name = tensor<string, []>("op_1288_cast_fp16")];
+            tensor<fp16, []> var_1289_to_fp16 = const()[name = tensor<string, []>("op_1289_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1290_cast_fp16 = mul(x = var_1288_cast_fp16, y = var_1289_to_fp16)[name = tensor<string, []>("op_1290_cast_fp16")];
+            tensor<string, []> var_1292_equation_0 = const()[name = tensor<string, []>("op_1292_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1292_cast_fp16 = einsum(equation = var_1292_equation_0, values = (var_1102_cast_fp16, var_1004_cast_fp16))[name = tensor<string, []>("op_1292_cast_fp16")];
+            tensor<fp16, []> var_1293_to_fp16 = const()[name = tensor<string, []>("op_1293_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1294_cast_fp16 = mul(x = var_1292_cast_fp16, y = var_1293_to_fp16)[name = tensor<string, []>("op_1294_cast_fp16")];
+            tensor<string, []> var_1296_equation_0 = const()[name = tensor<string, []>("op_1296_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1296_cast_fp16 = einsum(equation = var_1296_equation_0, values = (var_1102_cast_fp16, var_1008_cast_fp16))[name = tensor<string, []>("op_1296_cast_fp16")];
+            tensor<fp16, []> var_1297_to_fp16 = const()[name = tensor<string, []>("op_1297_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1298_cast_fp16 = mul(x = var_1296_cast_fp16, y = var_1297_to_fp16)[name = tensor<string, []>("op_1298_cast_fp16")];
+            tensor<string, []> var_1300_equation_0 = const()[name = tensor<string, []>("op_1300_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1300_cast_fp16 = einsum(equation = var_1300_equation_0, values = (var_1102_cast_fp16, var_1012_cast_fp16))[name = tensor<string, []>("op_1300_cast_fp16")];
+            tensor<fp16, []> var_1301_to_fp16 = const()[name = tensor<string, []>("op_1301_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1302_cast_fp16 = mul(x = var_1300_cast_fp16, y = var_1301_to_fp16)[name = tensor<string, []>("op_1302_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_49_cast_fp16 = add(x = var_1210_cast_fp16, y = mask)[name = tensor<string, []>("aw_49_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_51_cast_fp16 = add(x = var_1214_cast_fp16, y = mask)[name = tensor<string, []>("aw_51_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_53_cast_fp16 = add(x = var_1218_cast_fp16, y = mask)[name = tensor<string, []>("aw_53_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_55_cast_fp16 = add(x = var_1222_cast_fp16, y = mask)[name = tensor<string, []>("aw_55_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_57_cast_fp16 = add(x = var_1226_cast_fp16, y = mask)[name = tensor<string, []>("aw_57_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_59_cast_fp16 = add(x = var_1230_cast_fp16, y = mask)[name = tensor<string, []>("aw_59_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_61_cast_fp16 = add(x = var_1234_cast_fp16, y = mask)[name = tensor<string, []>("aw_61_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_63_cast_fp16 = add(x = var_1238_cast_fp16, y = mask)[name = tensor<string, []>("aw_63_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_65_cast_fp16 = add(x = var_1242_cast_fp16, y = mask)[name = tensor<string, []>("aw_65_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_67_cast_fp16 = add(x = var_1246_cast_fp16, y = mask)[name = tensor<string, []>("aw_67_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_69_cast_fp16 = add(x = var_1250_cast_fp16, y = mask)[name = tensor<string, []>("aw_69_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_71_cast_fp16 = add(x = var_1254_cast_fp16, y = mask)[name = tensor<string, []>("aw_71_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_73_cast_fp16 = add(x = var_1258_cast_fp16, y = mask)[name = tensor<string, []>("aw_73_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_75_cast_fp16 = add(x = var_1262_cast_fp16, y = mask)[name = tensor<string, []>("aw_75_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_77_cast_fp16 = add(x = var_1266_cast_fp16, y = mask)[name = tensor<string, []>("aw_77_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_79_cast_fp16 = add(x = var_1270_cast_fp16, y = mask)[name = tensor<string, []>("aw_79_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_81_cast_fp16 = add(x = var_1274_cast_fp16, y = mask)[name = tensor<string, []>("aw_81_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_83_cast_fp16 = add(x = var_1278_cast_fp16, y = mask)[name = tensor<string, []>("aw_83_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_85_cast_fp16 = add(x = var_1282_cast_fp16, y = mask)[name = tensor<string, []>("aw_85_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_87_cast_fp16 = add(x = var_1286_cast_fp16, y = mask)[name = tensor<string, []>("aw_87_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_89_cast_fp16 = add(x = var_1290_cast_fp16, y = mask)[name = tensor<string, []>("aw_89_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_91_cast_fp16 = add(x = var_1294_cast_fp16, y = mask)[name = tensor<string, []>("aw_91_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_93_cast_fp16 = add(x = var_1298_cast_fp16, y = mask)[name = tensor<string, []>("aw_93_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_cast_fp16 = add(x = var_1302_cast_fp16, y = mask)[name = tensor<string, []>("aw_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1327_cast_fp16 = softmax(axis = var_779, x = aw_49_cast_fp16)[name = tensor<string, []>("op_1327_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1328_cast_fp16 = softmax(axis = var_779, x = aw_51_cast_fp16)[name = tensor<string, []>("op_1328_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1329_cast_fp16 = softmax(axis = var_779, x = aw_53_cast_fp16)[name = tensor<string, []>("op_1329_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1330_cast_fp16 = softmax(axis = var_779, x = aw_55_cast_fp16)[name = tensor<string, []>("op_1330_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1331_cast_fp16 = softmax(axis = var_779, x = aw_57_cast_fp16)[name = tensor<string, []>("op_1331_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1332_cast_fp16 = softmax(axis = var_779, x = aw_59_cast_fp16)[name = tensor<string, []>("op_1332_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1333_cast_fp16 = softmax(axis = var_779, x = aw_61_cast_fp16)[name = tensor<string, []>("op_1333_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1334_cast_fp16 = softmax(axis = var_779, x = aw_63_cast_fp16)[name = tensor<string, []>("op_1334_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1335_cast_fp16 = softmax(axis = var_779, x = aw_65_cast_fp16)[name = tensor<string, []>("op_1335_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1336_cast_fp16 = softmax(axis = var_779, x = aw_67_cast_fp16)[name = tensor<string, []>("op_1336_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1337_cast_fp16 = softmax(axis = var_779, x = aw_69_cast_fp16)[name = tensor<string, []>("op_1337_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1338_cast_fp16 = softmax(axis = var_779, x = aw_71_cast_fp16)[name = tensor<string, []>("op_1338_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1339_cast_fp16 = softmax(axis = var_779, x = aw_73_cast_fp16)[name = tensor<string, []>("op_1339_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1340_cast_fp16 = softmax(axis = var_779, x = aw_75_cast_fp16)[name = tensor<string, []>("op_1340_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1341_cast_fp16 = softmax(axis = var_779, x = aw_77_cast_fp16)[name = tensor<string, []>("op_1341_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1342_cast_fp16 = softmax(axis = var_779, x = aw_79_cast_fp16)[name = tensor<string, []>("op_1342_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1343_cast_fp16 = softmax(axis = var_779, x = aw_81_cast_fp16)[name = tensor<string, []>("op_1343_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1344_cast_fp16 = softmax(axis = var_779, x = aw_83_cast_fp16)[name = tensor<string, []>("op_1344_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1345_cast_fp16 = softmax(axis = var_779, x = aw_85_cast_fp16)[name = tensor<string, []>("op_1345_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1346_cast_fp16 = softmax(axis = var_779, x = aw_87_cast_fp16)[name = tensor<string, []>("op_1346_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1347_cast_fp16 = softmax(axis = var_779, x = aw_89_cast_fp16)[name = tensor<string, []>("op_1347_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1348_cast_fp16 = softmax(axis = var_779, x = aw_91_cast_fp16)[name = tensor<string, []>("op_1348_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1349_cast_fp16 = softmax(axis = var_779, x = aw_93_cast_fp16)[name = tensor<string, []>("op_1349_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1350_cast_fp16 = softmax(axis = var_779, x = aw_cast_fp16)[name = tensor<string, []>("op_1350_cast_fp16")];
+            tensor<string, []> var_1352_equation_0 = const()[name = tensor<string, []>("op_1352_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1352_cast_fp16 = einsum(equation = var_1352_equation_0, values = (var_1112_cast_fp16, var_1327_cast_fp16))[name = tensor<string, []>("op_1352_cast_fp16")];
+            tensor<string, []> var_1354_equation_0 = const()[name = tensor<string, []>("op_1354_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1354_cast_fp16 = einsum(equation = var_1354_equation_0, values = (var_1112_cast_fp16, var_1328_cast_fp16))[name = tensor<string, []>("op_1354_cast_fp16")];
+            tensor<string, []> var_1356_equation_0 = const()[name = tensor<string, []>("op_1356_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1356_cast_fp16 = einsum(equation = var_1356_equation_0, values = (var_1112_cast_fp16, var_1329_cast_fp16))[name = tensor<string, []>("op_1356_cast_fp16")];
+            tensor<string, []> var_1358_equation_0 = const()[name = tensor<string, []>("op_1358_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1358_cast_fp16 = einsum(equation = var_1358_equation_0, values = (var_1124_cast_fp16, var_1330_cast_fp16))[name = tensor<string, []>("op_1358_cast_fp16")];
+            tensor<string, []> var_1360_equation_0 = const()[name = tensor<string, []>("op_1360_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1360_cast_fp16 = einsum(equation = var_1360_equation_0, values = (var_1124_cast_fp16, var_1331_cast_fp16))[name = tensor<string, []>("op_1360_cast_fp16")];
+            tensor<string, []> var_1362_equation_0 = const()[name = tensor<string, []>("op_1362_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1362_cast_fp16 = einsum(equation = var_1362_equation_0, values = (var_1124_cast_fp16, var_1332_cast_fp16))[name = tensor<string, []>("op_1362_cast_fp16")];
+            tensor<string, []> var_1364_equation_0 = const()[name = tensor<string, []>("op_1364_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1364_cast_fp16 = einsum(equation = var_1364_equation_0, values = (var_1136_cast_fp16, var_1333_cast_fp16))[name = tensor<string, []>("op_1364_cast_fp16")];
+            tensor<string, []> var_1366_equation_0 = const()[name = tensor<string, []>("op_1366_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1366_cast_fp16 = einsum(equation = var_1366_equation_0, values = (var_1136_cast_fp16, var_1334_cast_fp16))[name = tensor<string, []>("op_1366_cast_fp16")];
+            tensor<string, []> var_1368_equation_0 = const()[name = tensor<string, []>("op_1368_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1368_cast_fp16 = einsum(equation = var_1368_equation_0, values = (var_1136_cast_fp16, var_1335_cast_fp16))[name = tensor<string, []>("op_1368_cast_fp16")];
+            tensor<string, []> var_1370_equation_0 = const()[name = tensor<string, []>("op_1370_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1370_cast_fp16 = einsum(equation = var_1370_equation_0, values = (var_1148_cast_fp16, var_1336_cast_fp16))[name = tensor<string, []>("op_1370_cast_fp16")];
+            tensor<string, []> var_1372_equation_0 = const()[name = tensor<string, []>("op_1372_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1372_cast_fp16 = einsum(equation = var_1372_equation_0, values = (var_1148_cast_fp16, var_1337_cast_fp16))[name = tensor<string, []>("op_1372_cast_fp16")];
+            tensor<string, []> var_1374_equation_0 = const()[name = tensor<string, []>("op_1374_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1374_cast_fp16 = einsum(equation = var_1374_equation_0, values = (var_1148_cast_fp16, var_1338_cast_fp16))[name = tensor<string, []>("op_1374_cast_fp16")];
+            tensor<string, []> var_1376_equation_0 = const()[name = tensor<string, []>("op_1376_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1376_cast_fp16 = einsum(equation = var_1376_equation_0, values = (var_1160_cast_fp16, var_1339_cast_fp16))[name = tensor<string, []>("op_1376_cast_fp16")];
+            tensor<string, []> var_1378_equation_0 = const()[name = tensor<string, []>("op_1378_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1378_cast_fp16 = einsum(equation = var_1378_equation_0, values = (var_1160_cast_fp16, var_1340_cast_fp16))[name = tensor<string, []>("op_1378_cast_fp16")];
+            tensor<string, []> var_1380_equation_0 = const()[name = tensor<string, []>("op_1380_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1380_cast_fp16 = einsum(equation = var_1380_equation_0, values = (var_1160_cast_fp16, var_1341_cast_fp16))[name = tensor<string, []>("op_1380_cast_fp16")];
+            tensor<string, []> var_1382_equation_0 = const()[name = tensor<string, []>("op_1382_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1382_cast_fp16 = einsum(equation = var_1382_equation_0, values = (var_1172_cast_fp16, var_1342_cast_fp16))[name = tensor<string, []>("op_1382_cast_fp16")];
+            tensor<string, []> var_1384_equation_0 = const()[name = tensor<string, []>("op_1384_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1384_cast_fp16 = einsum(equation = var_1384_equation_0, values = (var_1172_cast_fp16, var_1343_cast_fp16))[name = tensor<string, []>("op_1384_cast_fp16")];
+            tensor<string, []> var_1386_equation_0 = const()[name = tensor<string, []>("op_1386_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1386_cast_fp16 = einsum(equation = var_1386_equation_0, values = (var_1172_cast_fp16, var_1344_cast_fp16))[name = tensor<string, []>("op_1386_cast_fp16")];
+            tensor<string, []> var_1388_equation_0 = const()[name = tensor<string, []>("op_1388_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1388_cast_fp16 = einsum(equation = var_1388_equation_0, values = (var_1184_cast_fp16, var_1345_cast_fp16))[name = tensor<string, []>("op_1388_cast_fp16")];
+            tensor<string, []> var_1390_equation_0 = const()[name = tensor<string, []>("op_1390_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1390_cast_fp16 = einsum(equation = var_1390_equation_0, values = (var_1184_cast_fp16, var_1346_cast_fp16))[name = tensor<string, []>("op_1390_cast_fp16")];
+            tensor<string, []> var_1392_equation_0 = const()[name = tensor<string, []>("op_1392_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1392_cast_fp16 = einsum(equation = var_1392_equation_0, values = (var_1184_cast_fp16, var_1347_cast_fp16))[name = tensor<string, []>("op_1392_cast_fp16")];
+            tensor<string, []> var_1394_equation_0 = const()[name = tensor<string, []>("op_1394_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1394_cast_fp16 = einsum(equation = var_1394_equation_0, values = (var_1196_cast_fp16, var_1348_cast_fp16))[name = tensor<string, []>("op_1394_cast_fp16")];
+            tensor<string, []> var_1396_equation_0 = const()[name = tensor<string, []>("op_1396_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1396_cast_fp16 = einsum(equation = var_1396_equation_0, values = (var_1196_cast_fp16, var_1349_cast_fp16))[name = tensor<string, []>("op_1396_cast_fp16")];
+            tensor<string, []> var_1398_equation_0 = const()[name = tensor<string, []>("op_1398_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1398_cast_fp16 = einsum(equation = var_1398_equation_0, values = (var_1196_cast_fp16, var_1350_cast_fp16))[name = tensor<string, []>("op_1398_cast_fp16")];
+            tensor<bool, []> x_27_interleave_0 = const()[name = tensor<string, []>("x_27_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_27_cast_fp16 = concat(axis = var_779, interleave = x_27_interleave_0, values = (var_1352_cast_fp16, var_1354_cast_fp16, var_1356_cast_fp16, var_1358_cast_fp16, var_1360_cast_fp16, var_1362_cast_fp16, var_1364_cast_fp16, var_1366_cast_fp16, var_1368_cast_fp16, var_1370_cast_fp16, var_1372_cast_fp16, var_1374_cast_fp16, var_1376_cast_fp16, var_1378_cast_fp16, var_1380_cast_fp16, var_1382_cast_fp16, var_1384_cast_fp16, var_1386_cast_fp16, var_1388_cast_fp16, var_1390_cast_fp16, var_1392_cast_fp16, var_1394_cast_fp16, var_1396_cast_fp16, var_1398_cast_fp16))[name = tensor<string, []>("x_27_cast_fp16")];
+            tensor<int32, [4]> var_1403 = const()[name = tensor<string, []>("op_1403"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_13_cast_fp16 = reshape(shape = var_1403, x = x_27_cast_fp16)[name = tensor<string, []>("input_13_cast_fp16")];
+            tensor<int32, [2]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1408 = const()[name = tensor<string, []>("op_1408"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_pad_type_0 = const()[name = tensor<string, []>("attention_output_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_pad_0 = const()[name = tensor<string, []>("attention_output_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(232803776)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_cast_fp16 = conv(dilations = var_1408, groups = var_779, pad = attention_output_pad_0, pad_type = attention_output_pad_type_0, strides = var_1406, weight = blocks_1_attn_proj_weight_to_fp16, x = input_13_cast_fp16)[name = tensor<string, []>("attention_output_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_29_cast_fp16 = add(x = attention_output_cast_fp16, y = x_17_cast_fp16)[name = tensor<string, []>("x_29_cast_fp16")];
+            tensor<bool, []> x_eps_interleave_0 = const()[name = tensor<string, []>("x_eps_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_to_fp16 = const()[name = tensor<string, []>("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678208)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_cast_fp16 = concat(axis = var_779, interleave = x_eps_interleave_0, values = (x_29_cast_fp16, eps_chan_to_fp16))[name = tensor<string, []>("x_eps_cast_fp16")];
+            tensor<int32, [1]> norm_x_axes_0 = const()[name = tensor<string, []>("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_782, x = x_eps_cast_fp16)[name = tensor<string, []>("norm_x_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_19_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_cast_fp16)[name = tensor<string, []>("x_normed_19_cast_fp16")];
+            tensor<fp16, []> var_1434_to_fp16 = const()[name = tensor<string, []>("op_1434_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_1434_to_fp16)[name = tensor<string, []>("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678400)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_15_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = tensor<string, []>("input_15_cast_fp16")];
+            tensor<int32, [2]> var_1445 = const()[name = tensor<string, []>("op_1445"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1447 = const()[name = tensor<string, []>("op_1447"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_17_pad_type_0 = const()[name = tensor<string, []>("input_17_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_17_pad_0 = const()[name = tensor<string, []>("input_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251684608)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_17_cast_fp16 = conv(dilations = var_1447, groups = var_779, pad = input_17_pad_0, pad_type = input_17_pad_type_0, strides = var_1445, weight = blocks_1_mlp_fc_1_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("input_17_cast_fp16")];
+            tensor<int32, [2]> var_1451 = const()[name = tensor<string, []>("op_1451"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1453 = const()[name = tensor<string, []>("op_1453"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_pad_0 = const()[name = tensor<string, []>("x_fc_2_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(302016320)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_cast_fp16 = conv(dilations = var_1453, groups = var_779, pad = x_fc_2_pad_0, pad_type = x_fc_2_pad_type_0, strides = var_1451, weight = blocks_1_mlp_fc_2_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_1456_cast_fp16 = silu(x = input_17_cast_fp16)[name = tensor<string, []>("op_1456_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_cast_fp16 = mul(x = var_1456_cast_fp16, y = x_fc_2_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
+            tensor<int32, [2]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1463_pad_type_0 = const()[name = tensor<string, []>("op_1463_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1463_pad_0 = const()[name = tensor<string, []>("op_1463_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_1_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(352348032)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_1463_cast_fp16 = conv(dilations = var_1461, groups = var_779, pad = var_1463_pad_0, pad_type = var_1463_pad_type_0, strides = var_1459, weight = blocks_1_mlp_proj_weight_to_fp16, x = input_cast_fp16)[name = tensor<string, []>("op_1463_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> new_x = add(x = var_1463_cast_fp16, y = x_29_cast_fp16)[name = tensor<string, []>("op_1464_cast_fp16")];
+        } -> (new_x, new_k_cache_0, new_v_cache_0, new_k_cache_1, new_v_cache_1);
+}
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk14.mlmodelc/weights/weight.bin b/Llama-3.2-3B-Instruct_chunk14.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5f7055194b4048a816ee80c4e4a8e28c1386f4ea
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk14.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f154c6a8553b1baf60e07b7f98801702aa94b24c825db6651c5dea4d28f0c0b6
+size 402679744
diff --git a/Llama-3.2-3B-Instruct_chunk15.mlmodelc/analytics/coremldata.bin b/Llama-3.2-3B-Instruct_chunk15.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a63af39cde8e590e41fffd270ab8aede737490d
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk15.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf21e446e7587de3fd840eae95f3e79729298df568725552f7ef5fd8f954e58c
+size 243
diff --git a/Llama-3.2-3B-Instruct_chunk15.mlmodelc/coremldata.bin b/Llama-3.2-3B-Instruct_chunk15.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ef844658693d8a7fc2951abf2761f8f5f9bc62c3
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk15.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8129d684aa1ea8b76708a186fe44f7ffc4aa08b4854907105fe41c0825e71875
+size 653
diff --git a/Llama-3.2-3B-Instruct_chunk15.mlmodelc/metadata.json b/Llama-3.2-3B-Instruct_chunk15.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..c90ee9167173ecd1abc42ad897a8e971d3f9c882
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk15.mlmodelc/metadata.json
@@ -0,0 +1,178 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "new_x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 7,
+    "mlProgramOperationTypeHistogram" : {
+      "Concat" : 14,
+      "Ios16.mul" : 70,
+      "SliceByIndex" : 88,
+      "Transpose" : 2,
+      "Ios16.einsum" : 96,
+      "Ios16.conv" : 14,
+      "Ios16.add" : 56,
+      "Ios16.realDiv" : 4,
+      "Ios16.softmax" : 48,
+      "Ios16.reduceL2Norm" : 4,
+      "Ios16.reshape" : 14,
+      "Ios16.silu" : 2
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "13.0",
+      "tvOS" : "16.0",
+      "visionOS" : "1.0",
+      "watchOS" : "9.0",
+      "iOS" : "16.0",
+      "macCatalyst" : "16.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.source" : "torch==2.1.0",
+      "com.github.apple.coremltools.version" : "8.0b1"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "cos",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "sin",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 512 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 512, 1, 64]",
+        "name" : "mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "Llama_3_2_3B_Instruct_2024_11_09_16_14_37_chunk15",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk15.mlmodelc/model.mil b/Llama-3.2-3B-Instruct_chunk15.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..78594b4291dc45ae43652f9a31200581b19ad3c6
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk15.mlmodelc/model.mil
@@ -0,0 +1,956 @@
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3304.5.2"}, {"coremlc-version", "3304.6.2"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
+{
+    func main<ios16>(tensor<fp16, [128, 64]> cos, tensor<fp16, [1, 448, 1, 1024]> k_cache_0, tensor<fp16, [1, 448, 1, 1024]> k_cache_1, tensor<fp16, [1, 512, 1, 64]> mask, tensor<fp16, [128, 64]> sin, tensor<fp16, [1, 1024, 1, 448]> v_cache_0, tensor<fp16, [1, 1024, 1, 448]> v_cache_1, tensor<fp16, [1, 3072, 8, 8]> x) [CoreML_InputDefaultValues = dict<tensor<string, []>, tensor<fp32, []>>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}})] {
+            tensor<int32, []> var_13 = const()[name = tensor<string, []>("op_13"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_19 = const()[name = tensor<string, []>("op_19"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_55 = const()[name = tensor<string, []>("op_55"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_1_interleave_0 = const()[name = tensor<string, []>("x_eps_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_1_to_fp16 = const()[name = tensor<string, []>("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_1_cast_fp16 = concat(axis = var_52, interleave = x_eps_1_interleave_0, values = (x, eps_chan_1_to_fp16))[name = tensor<string, []>("x_eps_1_cast_fp16")];
+            tensor<int32, [1]> norm_x_1_axes_0 = const()[name = tensor<string, []>("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_55, x = x_eps_1_cast_fp16)[name = tensor<string, []>("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = tensor<string, []>("x_normed_1_cast_fp16")];
+            tensor<fp16, []> var_79_to_fp16 = const()[name = tensor<string, []>("op_79_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_79_to_fp16)[name = tensor<string, []>("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(256)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = tensor<string, []>("x_5_cast_fp16")];
+            tensor<int32, [4]> var_100 = const()[name = tensor<string, []>("op_100"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_1_cast_fp16 = reshape(shape = var_100, x = x_5_cast_fp16)[name = tensor<string, []>("input_1_cast_fp16")];
+            tensor<int32, [2]> var_103 = const()[name = tensor<string, []>("op_103"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_105 = const()[name = tensor<string, []>("op_105"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_1_pad_type_0 = const()[name = tensor<string, []>("q_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_1_pad_0 = const()[name = tensor<string, []>("q_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6464)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_1_cast_fp16 = conv(dilations = var_105, groups = var_52, pad = q_1_pad_0, pad_type = q_1_pad_type_0, strides = var_103, weight = blocks_0_attn_q_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("q_1_cast_fp16")];
+            tensor<int32, [2]> var_109 = const()[name = tensor<string, []>("op_109"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_111 = const()[name = tensor<string, []>("op_111"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_1_pad_type_0 = const()[name = tensor<string, []>("k_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_1_pad_0 = const()[name = tensor<string, []>("k_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18880896)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_1_cast_fp16 = conv(dilations = var_111, groups = var_52, pad = k_1_pad_0, pad_type = k_1_pad_type_0, strides = var_109, weight = blocks_0_attn_k_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("k_1_cast_fp16")];
+            tensor<int32, [2]> var_115 = const()[name = tensor<string, []>("op_115"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_117 = const()[name = tensor<string, []>("op_117"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_1_pad_type_0 = const()[name = tensor<string, []>("v_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_1_pad_0 = const()[name = tensor<string, []>("v_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25172416)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_1_cast_fp16 = conv(dilations = var_117, groups = var_52, pad = v_1_pad_0, pad_type = v_1_pad_type_0, strides = var_115, weight = blocks_0_attn_v_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("v_1_cast_fp16")];
+            tensor<int32, [4]> var_120 = const()[name = tensor<string, []>("op_120"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_3_cast_fp16 = reshape(shape = var_120, x = q_1_cast_fp16)[name = tensor<string, []>("q_3_cast_fp16")];
+            tensor<int32, [4]> var_122 = const()[name = tensor<string, []>("op_122"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_3_cast_fp16 = reshape(shape = var_122, x = k_1_cast_fp16)[name = tensor<string, []>("k_3_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = tensor<string, []>("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = tensor<string, []>("op_136_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = tensor<string, []>("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = tensor<string, []>("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = tensor<string, []>("op_142_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = tensor<string, []>("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_142_cast_fp16")];
+            tensor<fp16, []> const_10_promoted_to_fp16 = const()[name = tensor<string, []>("const_10_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_10_promoted_to_fp16)[name = tensor<string, []>("op_144_cast_fp16")];
+            tensor<bool, []> rotated_1_interleave_0 = const()[name = tensor<string, []>("rotated_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_1_cast_fp16 = concat(axis = var_17, interleave = rotated_1_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = tensor<string, []>("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_147_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = tensor<string, []>("op_147_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_148_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = tensor<string, []>("op_148_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_1_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = tensor<string, []>("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_161_begin_0 = const()[name = tensor<string, []>("op_161_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_161_end_0 = const()[name = tensor<string, []>("op_161_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_161_end_mask_0 = const()[name = tensor<string, []>("op_161_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_161_cast_fp16 = slice_by_index(begin = var_161_begin_0, end = var_161_end_0, end_mask = var_161_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_161_cast_fp16")];
+            tensor<int32, [4]> var_167_begin_0 = const()[name = tensor<string, []>("op_167_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_167_end_0 = const()[name = tensor<string, []>("op_167_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_167_end_mask_0 = const()[name = tensor<string, []>("op_167_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_167_cast_fp16 = slice_by_index(begin = var_167_begin_0, end = var_167_end_0, end_mask = var_167_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_167_cast_fp16")];
+            tensor<fp16, []> const_12_promoted_to_fp16 = const()[name = tensor<string, []>("const_12_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_169_cast_fp16 = mul(x = var_167_cast_fp16, y = const_12_promoted_to_fp16)[name = tensor<string, []>("op_169_cast_fp16")];
+            tensor<bool, []> rotated_3_interleave_0 = const()[name = tensor<string, []>("rotated_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_3_cast_fp16 = concat(axis = var_17, interleave = rotated_3_interleave_0, values = (var_169_cast_fp16, var_161_cast_fp16))[name = tensor<string, []>("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_172_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = tensor<string, []>("op_172_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_173_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = tensor<string, []>("op_173_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_3_cast_fp16 = add(x = var_172_cast_fp16, y = var_173_cast_fp16)[name = tensor<string, []>("roped_3_cast_fp16")];
+            tensor<int32, [4]> var_176 = const()[name = tensor<string, []>("op_176"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_7_cast_fp16 = reshape(shape = var_176, x = roped_3_cast_fp16)[name = tensor<string, []>("k_7_cast_fp16")];
+            tensor<int32, [4]> var_178 = const()[name = tensor<string, []>("op_178"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_0 = reshape(shape = var_178, x = v_1_cast_fp16)[name = tensor<string, []>("new_v_cache_0_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_11_interleave_0 = const()[name = tensor<string, []>("k_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_0 = transpose(perm = k_9_perm_0, x = k_7_cast_fp16)[name = tensor<string, []>("transpose_1")];
+            tensor<fp16, [1, 512, 1, 1024]> k_11_cast_fp16 = concat(axis = var_19, interleave = k_11_interleave_0, values = (k_cache_0, new_k_cache_0))[name = tensor<string, []>("k_11_cast_fp16")];
+            tensor<bool, []> v_7_interleave_0 = const()[name = tensor<string, []>("v_7_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_7_cast_fp16 = concat(axis = var_13, interleave = v_7_interleave_0, values = (v_cache_0, new_v_cache_0))[name = tensor<string, []>("v_7_cast_fp16")];
+            tensor<int32, [4]> var_186 = const()[name = tensor<string, []>("op_186"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_7_cast_fp16 = reshape(shape = var_186, x = roped_1_cast_fp16)[name = tensor<string, []>("q_7_cast_fp16")];
+            tensor<int32, [4]> var_191_begin_0 = const()[name = tensor<string, []>("op_191_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_191_end_0 = const()[name = tensor<string, []>("op_191_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_191_end_mask_0 = const()[name = tensor<string, []>("op_191_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_191_cast_fp16 = slice_by_index(begin = var_191_begin_0, end = var_191_end_0, end_mask = var_191_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_191_cast_fp16")];
+            tensor<int32, [4]> var_195_begin_0 = const()[name = tensor<string, []>("op_195_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_195_end_0 = const()[name = tensor<string, []>("op_195_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_195_end_mask_0 = const()[name = tensor<string, []>("op_195_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_195_cast_fp16 = slice_by_index(begin = var_195_begin_0, end = var_195_end_0, end_mask = var_195_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_195_cast_fp16")];
+            tensor<int32, [4]> var_199_begin_0 = const()[name = tensor<string, []>("op_199_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_199_end_0 = const()[name = tensor<string, []>("op_199_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_199_end_mask_0 = const()[name = tensor<string, []>("op_199_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_199_cast_fp16 = slice_by_index(begin = var_199_begin_0, end = var_199_end_0, end_mask = var_199_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_199_cast_fp16")];
+            tensor<int32, [4]> var_203_begin_0 = const()[name = tensor<string, []>("op_203_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_203_end_0 = const()[name = tensor<string, []>("op_203_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_203_end_mask_0 = const()[name = tensor<string, []>("op_203_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_203_cast_fp16 = slice_by_index(begin = var_203_begin_0, end = var_203_end_0, end_mask = var_203_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_203_cast_fp16")];
+            tensor<int32, [4]> var_207_begin_0 = const()[name = tensor<string, []>("op_207_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_207_end_0 = const()[name = tensor<string, []>("op_207_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_207_end_mask_0 = const()[name = tensor<string, []>("op_207_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_207_cast_fp16 = slice_by_index(begin = var_207_begin_0, end = var_207_end_0, end_mask = var_207_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_207_cast_fp16")];
+            tensor<int32, [4]> var_211_begin_0 = const()[name = tensor<string, []>("op_211_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_211_end_0 = const()[name = tensor<string, []>("op_211_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_211_end_mask_0 = const()[name = tensor<string, []>("op_211_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_211_cast_fp16 = slice_by_index(begin = var_211_begin_0, end = var_211_end_0, end_mask = var_211_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_211_cast_fp16")];
+            tensor<int32, [4]> var_215_begin_0 = const()[name = tensor<string, []>("op_215_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_215_end_0 = const()[name = tensor<string, []>("op_215_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_215_end_mask_0 = const()[name = tensor<string, []>("op_215_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_215_cast_fp16 = slice_by_index(begin = var_215_begin_0, end = var_215_end_0, end_mask = var_215_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_215_cast_fp16")];
+            tensor<int32, [4]> var_219_begin_0 = const()[name = tensor<string, []>("op_219_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_219_end_0 = const()[name = tensor<string, []>("op_219_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_219_end_mask_0 = const()[name = tensor<string, []>("op_219_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_219_cast_fp16 = slice_by_index(begin = var_219_begin_0, end = var_219_end_0, end_mask = var_219_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_219_cast_fp16")];
+            tensor<int32, [4]> var_223_begin_0 = const()[name = tensor<string, []>("op_223_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_223_end_0 = const()[name = tensor<string, []>("op_223_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_223_end_mask_0 = const()[name = tensor<string, []>("op_223_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_223_cast_fp16 = slice_by_index(begin = var_223_begin_0, end = var_223_end_0, end_mask = var_223_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_223_cast_fp16")];
+            tensor<int32, [4]> var_227_begin_0 = const()[name = tensor<string, []>("op_227_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_227_end_0 = const()[name = tensor<string, []>("op_227_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_227_end_mask_0 = const()[name = tensor<string, []>("op_227_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_227_cast_fp16 = slice_by_index(begin = var_227_begin_0, end = var_227_end_0, end_mask = var_227_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_227_cast_fp16")];
+            tensor<int32, [4]> var_231_begin_0 = const()[name = tensor<string, []>("op_231_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_231_end_0 = const()[name = tensor<string, []>("op_231_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_231_end_mask_0 = const()[name = tensor<string, []>("op_231_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_231_cast_fp16 = slice_by_index(begin = var_231_begin_0, end = var_231_end_0, end_mask = var_231_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_231_cast_fp16")];
+            tensor<int32, [4]> var_235_begin_0 = const()[name = tensor<string, []>("op_235_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_235_end_0 = const()[name = tensor<string, []>("op_235_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_235_end_mask_0 = const()[name = tensor<string, []>("op_235_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_235_cast_fp16 = slice_by_index(begin = var_235_begin_0, end = var_235_end_0, end_mask = var_235_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_235_cast_fp16")];
+            tensor<int32, [4]> var_239_begin_0 = const()[name = tensor<string, []>("op_239_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_239_end_0 = const()[name = tensor<string, []>("op_239_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_239_end_mask_0 = const()[name = tensor<string, []>("op_239_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_239_cast_fp16 = slice_by_index(begin = var_239_begin_0, end = var_239_end_0, end_mask = var_239_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_239_cast_fp16")];
+            tensor<int32, [4]> var_243_begin_0 = const()[name = tensor<string, []>("op_243_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_243_end_0 = const()[name = tensor<string, []>("op_243_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_243_end_mask_0 = const()[name = tensor<string, []>("op_243_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_243_cast_fp16 = slice_by_index(begin = var_243_begin_0, end = var_243_end_0, end_mask = var_243_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_243_cast_fp16")];
+            tensor<int32, [4]> var_247_begin_0 = const()[name = tensor<string, []>("op_247_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_247_end_0 = const()[name = tensor<string, []>("op_247_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_247_end_mask_0 = const()[name = tensor<string, []>("op_247_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_247_cast_fp16 = slice_by_index(begin = var_247_begin_0, end = var_247_end_0, end_mask = var_247_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_247_cast_fp16")];
+            tensor<int32, [4]> var_251_begin_0 = const()[name = tensor<string, []>("op_251_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_251_end_0 = const()[name = tensor<string, []>("op_251_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_251_end_mask_0 = const()[name = tensor<string, []>("op_251_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_251_cast_fp16 = slice_by_index(begin = var_251_begin_0, end = var_251_end_0, end_mask = var_251_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_251_cast_fp16")];
+            tensor<int32, [4]> var_255_begin_0 = const()[name = tensor<string, []>("op_255_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_255_end_0 = const()[name = tensor<string, []>("op_255_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_255_end_mask_0 = const()[name = tensor<string, []>("op_255_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_255_cast_fp16 = slice_by_index(begin = var_255_begin_0, end = var_255_end_0, end_mask = var_255_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_255_cast_fp16")];
+            tensor<int32, [4]> var_259_begin_0 = const()[name = tensor<string, []>("op_259_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_259_end_0 = const()[name = tensor<string, []>("op_259_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_259_end_mask_0 = const()[name = tensor<string, []>("op_259_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_259_cast_fp16 = slice_by_index(begin = var_259_begin_0, end = var_259_end_0, end_mask = var_259_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_259_cast_fp16")];
+            tensor<int32, [4]> var_263_begin_0 = const()[name = tensor<string, []>("op_263_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_263_end_0 = const()[name = tensor<string, []>("op_263_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_263_end_mask_0 = const()[name = tensor<string, []>("op_263_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_263_cast_fp16 = slice_by_index(begin = var_263_begin_0, end = var_263_end_0, end_mask = var_263_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_263_cast_fp16")];
+            tensor<int32, [4]> var_267_begin_0 = const()[name = tensor<string, []>("op_267_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_267_end_0 = const()[name = tensor<string, []>("op_267_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_267_end_mask_0 = const()[name = tensor<string, []>("op_267_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_267_cast_fp16 = slice_by_index(begin = var_267_begin_0, end = var_267_end_0, end_mask = var_267_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_267_cast_fp16")];
+            tensor<int32, [4]> var_271_begin_0 = const()[name = tensor<string, []>("op_271_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_271_end_0 = const()[name = tensor<string, []>("op_271_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_271_end_mask_0 = const()[name = tensor<string, []>("op_271_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_271_cast_fp16 = slice_by_index(begin = var_271_begin_0, end = var_271_end_0, end_mask = var_271_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_271_cast_fp16")];
+            tensor<int32, [4]> var_275_begin_0 = const()[name = tensor<string, []>("op_275_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_275_end_0 = const()[name = tensor<string, []>("op_275_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_275_end_mask_0 = const()[name = tensor<string, []>("op_275_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_275_cast_fp16 = slice_by_index(begin = var_275_begin_0, end = var_275_end_0, end_mask = var_275_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_275_cast_fp16")];
+            tensor<int32, [4]> var_279_begin_0 = const()[name = tensor<string, []>("op_279_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_279_end_0 = const()[name = tensor<string, []>("op_279_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_279_end_mask_0 = const()[name = tensor<string, []>("op_279_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_279_cast_fp16 = slice_by_index(begin = var_279_begin_0, end = var_279_end_0, end_mask = var_279_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_279_cast_fp16")];
+            tensor<int32, [4]> var_283_begin_0 = const()[name = tensor<string, []>("op_283_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_283_end_0 = const()[name = tensor<string, []>("op_283_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_283_end_mask_0 = const()[name = tensor<string, []>("op_283_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_283_cast_fp16 = slice_by_index(begin = var_283_begin_0, end = var_283_end_0, end_mask = var_283_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_283_cast_fp16")];
+            tensor<int32, [4]> var_289_begin_0 = const()[name = tensor<string, []>("op_289_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_289_end_0 = const()[name = tensor<string, []>("op_289_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_289_end_mask_0 = const()[name = tensor<string, []>("op_289_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_289_cast_fp16 = slice_by_index(begin = var_289_begin_0, end = var_289_end_0, end_mask = var_289_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_289_cast_fp16")];
+            tensor<int32, [4]> var_301_begin_0 = const()[name = tensor<string, []>("op_301_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_301_end_0 = const()[name = tensor<string, []>("op_301_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_301_end_mask_0 = const()[name = tensor<string, []>("op_301_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_301_cast_fp16 = slice_by_index(begin = var_301_begin_0, end = var_301_end_0, end_mask = var_301_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_301_cast_fp16")];
+            tensor<int32, [4]> var_313_begin_0 = const()[name = tensor<string, []>("op_313_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_313_end_0 = const()[name = tensor<string, []>("op_313_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_313_end_mask_0 = const()[name = tensor<string, []>("op_313_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_313_cast_fp16 = slice_by_index(begin = var_313_begin_0, end = var_313_end_0, end_mask = var_313_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_313_cast_fp16")];
+            tensor<int32, [4]> var_325_begin_0 = const()[name = tensor<string, []>("op_325_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_325_end_0 = const()[name = tensor<string, []>("op_325_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_325_end_mask_0 = const()[name = tensor<string, []>("op_325_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_325_cast_fp16 = slice_by_index(begin = var_325_begin_0, end = var_325_end_0, end_mask = var_325_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_325_cast_fp16")];
+            tensor<int32, [4]> var_337_begin_0 = const()[name = tensor<string, []>("op_337_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_337_end_0 = const()[name = tensor<string, []>("op_337_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_337_end_mask_0 = const()[name = tensor<string, []>("op_337_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_337_cast_fp16 = slice_by_index(begin = var_337_begin_0, end = var_337_end_0, end_mask = var_337_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_337_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = tensor<string, []>("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = tensor<string, []>("op_349_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = tensor<string, []>("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_349_cast_fp16")];
+            tensor<int32, [4]> var_361_begin_0 = const()[name = tensor<string, []>("op_361_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_361_end_0 = const()[name = tensor<string, []>("op_361_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_361_end_mask_0 = const()[name = tensor<string, []>("op_361_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_361_cast_fp16 = slice_by_index(begin = var_361_begin_0, end = var_361_end_0, end_mask = var_361_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_361_cast_fp16")];
+            tensor<int32, [4]> var_373_begin_0 = const()[name = tensor<string, []>("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_373_end_0 = const()[name = tensor<string, []>("op_373_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_373_end_mask_0 = const()[name = tensor<string, []>("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_373_cast_fp16")];
+            tensor<int32, [4]> var_383_begin_0 = const()[name = tensor<string, []>("op_383_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_383_end_0 = const()[name = tensor<string, []>("op_383_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_383_end_mask_0 = const()[name = tensor<string, []>("op_383_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_383_cast_fp16 = slice_by_index(begin = var_383_begin_0, end = var_383_end_0, end_mask = var_383_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_383_cast_fp16")];
+            tensor<int32, [4]> var_395_begin_0 = const()[name = tensor<string, []>("op_395_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_395_end_0 = const()[name = tensor<string, []>("op_395_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_395_end_mask_0 = const()[name = tensor<string, []>("op_395_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_395_cast_fp16 = slice_by_index(begin = var_395_begin_0, end = var_395_end_0, end_mask = var_395_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_395_cast_fp16")];
+            tensor<int32, [4]> var_407_begin_0 = const()[name = tensor<string, []>("op_407_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_407_end_0 = const()[name = tensor<string, []>("op_407_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_407_end_mask_0 = const()[name = tensor<string, []>("op_407_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_407_cast_fp16 = slice_by_index(begin = var_407_begin_0, end = var_407_end_0, end_mask = var_407_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_407_cast_fp16")];
+            tensor<int32, [4]> var_419_begin_0 = const()[name = tensor<string, []>("op_419_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_419_end_0 = const()[name = tensor<string, []>("op_419_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_419_end_mask_0 = const()[name = tensor<string, []>("op_419_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_419_cast_fp16 = slice_by_index(begin = var_419_begin_0, end = var_419_end_0, end_mask = var_419_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_419_cast_fp16")];
+            tensor<int32, [4]> var_431_begin_0 = const()[name = tensor<string, []>("op_431_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_431_end_0 = const()[name = tensor<string, []>("op_431_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_431_end_mask_0 = const()[name = tensor<string, []>("op_431_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_431_cast_fp16 = slice_by_index(begin = var_431_begin_0, end = var_431_end_0, end_mask = var_431_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_431_cast_fp16")];
+            tensor<int32, [4]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_443_cast_fp16 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_443_cast_fp16")];
+            tensor<int32, [4]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_455_cast_fp16 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_455_cast_fp16")];
+            tensor<int32, [4]> var_467_begin_0 = const()[name = tensor<string, []>("op_467_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_467_end_0 = const()[name = tensor<string, []>("op_467_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_467_end_mask_0 = const()[name = tensor<string, []>("op_467_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_467_cast_fp16 = slice_by_index(begin = var_467_begin_0, end = var_467_end_0, end_mask = var_467_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_467_cast_fp16")];
+            tensor<string, []> var_479_equation_0 = const()[name = tensor<string, []>("op_479_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_479_cast_fp16 = einsum(equation = var_479_equation_0, values = (var_289_cast_fp16, var_191_cast_fp16))[name = tensor<string, []>("op_479_cast_fp16")];
+            tensor<fp16, []> var_480_to_fp16 = const()[name = tensor<string, []>("op_480_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_481_cast_fp16 = mul(x = var_479_cast_fp16, y = var_480_to_fp16)[name = tensor<string, []>("op_481_cast_fp16")];
+            tensor<string, []> var_483_equation_0 = const()[name = tensor<string, []>("op_483_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_483_cast_fp16 = einsum(equation = var_483_equation_0, values = (var_289_cast_fp16, var_195_cast_fp16))[name = tensor<string, []>("op_483_cast_fp16")];
+            tensor<fp16, []> var_484_to_fp16 = const()[name = tensor<string, []>("op_484_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_485_cast_fp16 = mul(x = var_483_cast_fp16, y = var_484_to_fp16)[name = tensor<string, []>("op_485_cast_fp16")];
+            tensor<string, []> var_487_equation_0 = const()[name = tensor<string, []>("op_487_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_487_cast_fp16 = einsum(equation = var_487_equation_0, values = (var_289_cast_fp16, var_199_cast_fp16))[name = tensor<string, []>("op_487_cast_fp16")];
+            tensor<fp16, []> var_488_to_fp16 = const()[name = tensor<string, []>("op_488_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_489_cast_fp16 = mul(x = var_487_cast_fp16, y = var_488_to_fp16)[name = tensor<string, []>("op_489_cast_fp16")];
+            tensor<string, []> var_491_equation_0 = const()[name = tensor<string, []>("op_491_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_491_cast_fp16 = einsum(equation = var_491_equation_0, values = (var_301_cast_fp16, var_203_cast_fp16))[name = tensor<string, []>("op_491_cast_fp16")];
+            tensor<fp16, []> var_492_to_fp16 = const()[name = tensor<string, []>("op_492_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_493_cast_fp16 = mul(x = var_491_cast_fp16, y = var_492_to_fp16)[name = tensor<string, []>("op_493_cast_fp16")];
+            tensor<string, []> var_495_equation_0 = const()[name = tensor<string, []>("op_495_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_495_cast_fp16 = einsum(equation = var_495_equation_0, values = (var_301_cast_fp16, var_207_cast_fp16))[name = tensor<string, []>("op_495_cast_fp16")];
+            tensor<fp16, []> var_496_to_fp16 = const()[name = tensor<string, []>("op_496_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_497_cast_fp16 = mul(x = var_495_cast_fp16, y = var_496_to_fp16)[name = tensor<string, []>("op_497_cast_fp16")];
+            tensor<string, []> var_499_equation_0 = const()[name = tensor<string, []>("op_499_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_499_cast_fp16 = einsum(equation = var_499_equation_0, values = (var_301_cast_fp16, var_211_cast_fp16))[name = tensor<string, []>("op_499_cast_fp16")];
+            tensor<fp16, []> var_500_to_fp16 = const()[name = tensor<string, []>("op_500_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_501_cast_fp16 = mul(x = var_499_cast_fp16, y = var_500_to_fp16)[name = tensor<string, []>("op_501_cast_fp16")];
+            tensor<string, []> var_503_equation_0 = const()[name = tensor<string, []>("op_503_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_503_cast_fp16 = einsum(equation = var_503_equation_0, values = (var_313_cast_fp16, var_215_cast_fp16))[name = tensor<string, []>("op_503_cast_fp16")];
+            tensor<fp16, []> var_504_to_fp16 = const()[name = tensor<string, []>("op_504_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_505_cast_fp16 = mul(x = var_503_cast_fp16, y = var_504_to_fp16)[name = tensor<string, []>("op_505_cast_fp16")];
+            tensor<string, []> var_507_equation_0 = const()[name = tensor<string, []>("op_507_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_507_cast_fp16 = einsum(equation = var_507_equation_0, values = (var_313_cast_fp16, var_219_cast_fp16))[name = tensor<string, []>("op_507_cast_fp16")];
+            tensor<fp16, []> var_508_to_fp16 = const()[name = tensor<string, []>("op_508_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_509_cast_fp16 = mul(x = var_507_cast_fp16, y = var_508_to_fp16)[name = tensor<string, []>("op_509_cast_fp16")];
+            tensor<string, []> var_511_equation_0 = const()[name = tensor<string, []>("op_511_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_511_cast_fp16 = einsum(equation = var_511_equation_0, values = (var_313_cast_fp16, var_223_cast_fp16))[name = tensor<string, []>("op_511_cast_fp16")];
+            tensor<fp16, []> var_512_to_fp16 = const()[name = tensor<string, []>("op_512_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_513_cast_fp16 = mul(x = var_511_cast_fp16, y = var_512_to_fp16)[name = tensor<string, []>("op_513_cast_fp16")];
+            tensor<string, []> var_515_equation_0 = const()[name = tensor<string, []>("op_515_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_515_cast_fp16 = einsum(equation = var_515_equation_0, values = (var_325_cast_fp16, var_227_cast_fp16))[name = tensor<string, []>("op_515_cast_fp16")];
+            tensor<fp16, []> var_516_to_fp16 = const()[name = tensor<string, []>("op_516_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_517_cast_fp16 = mul(x = var_515_cast_fp16, y = var_516_to_fp16)[name = tensor<string, []>("op_517_cast_fp16")];
+            tensor<string, []> var_519_equation_0 = const()[name = tensor<string, []>("op_519_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_519_cast_fp16 = einsum(equation = var_519_equation_0, values = (var_325_cast_fp16, var_231_cast_fp16))[name = tensor<string, []>("op_519_cast_fp16")];
+            tensor<fp16, []> var_520_to_fp16 = const()[name = tensor<string, []>("op_520_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_521_cast_fp16 = mul(x = var_519_cast_fp16, y = var_520_to_fp16)[name = tensor<string, []>("op_521_cast_fp16")];
+            tensor<string, []> var_523_equation_0 = const()[name = tensor<string, []>("op_523_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_523_cast_fp16 = einsum(equation = var_523_equation_0, values = (var_325_cast_fp16, var_235_cast_fp16))[name = tensor<string, []>("op_523_cast_fp16")];
+            tensor<fp16, []> var_524_to_fp16 = const()[name = tensor<string, []>("op_524_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_525_cast_fp16 = mul(x = var_523_cast_fp16, y = var_524_to_fp16)[name = tensor<string, []>("op_525_cast_fp16")];
+            tensor<string, []> var_527_equation_0 = const()[name = tensor<string, []>("op_527_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_527_cast_fp16 = einsum(equation = var_527_equation_0, values = (var_337_cast_fp16, var_239_cast_fp16))[name = tensor<string, []>("op_527_cast_fp16")];
+            tensor<fp16, []> var_528_to_fp16 = const()[name = tensor<string, []>("op_528_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_529_cast_fp16 = mul(x = var_527_cast_fp16, y = var_528_to_fp16)[name = tensor<string, []>("op_529_cast_fp16")];
+            tensor<string, []> var_531_equation_0 = const()[name = tensor<string, []>("op_531_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_531_cast_fp16 = einsum(equation = var_531_equation_0, values = (var_337_cast_fp16, var_243_cast_fp16))[name = tensor<string, []>("op_531_cast_fp16")];
+            tensor<fp16, []> var_532_to_fp16 = const()[name = tensor<string, []>("op_532_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_533_cast_fp16 = mul(x = var_531_cast_fp16, y = var_532_to_fp16)[name = tensor<string, []>("op_533_cast_fp16")];
+            tensor<string, []> var_535_equation_0 = const()[name = tensor<string, []>("op_535_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_535_cast_fp16 = einsum(equation = var_535_equation_0, values = (var_337_cast_fp16, var_247_cast_fp16))[name = tensor<string, []>("op_535_cast_fp16")];
+            tensor<fp16, []> var_536_to_fp16 = const()[name = tensor<string, []>("op_536_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_537_cast_fp16 = mul(x = var_535_cast_fp16, y = var_536_to_fp16)[name = tensor<string, []>("op_537_cast_fp16")];
+            tensor<string, []> var_539_equation_0 = const()[name = tensor<string, []>("op_539_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_539_cast_fp16 = einsum(equation = var_539_equation_0, values = (var_349_cast_fp16, var_251_cast_fp16))[name = tensor<string, []>("op_539_cast_fp16")];
+            tensor<fp16, []> var_540_to_fp16 = const()[name = tensor<string, []>("op_540_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_541_cast_fp16 = mul(x = var_539_cast_fp16, y = var_540_to_fp16)[name = tensor<string, []>("op_541_cast_fp16")];
+            tensor<string, []> var_543_equation_0 = const()[name = tensor<string, []>("op_543_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_543_cast_fp16 = einsum(equation = var_543_equation_0, values = (var_349_cast_fp16, var_255_cast_fp16))[name = tensor<string, []>("op_543_cast_fp16")];
+            tensor<fp16, []> var_544_to_fp16 = const()[name = tensor<string, []>("op_544_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_545_cast_fp16 = mul(x = var_543_cast_fp16, y = var_544_to_fp16)[name = tensor<string, []>("op_545_cast_fp16")];
+            tensor<string, []> var_547_equation_0 = const()[name = tensor<string, []>("op_547_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_547_cast_fp16 = einsum(equation = var_547_equation_0, values = (var_349_cast_fp16, var_259_cast_fp16))[name = tensor<string, []>("op_547_cast_fp16")];
+            tensor<fp16, []> var_548_to_fp16 = const()[name = tensor<string, []>("op_548_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_549_cast_fp16 = mul(x = var_547_cast_fp16, y = var_548_to_fp16)[name = tensor<string, []>("op_549_cast_fp16")];
+            tensor<string, []> var_551_equation_0 = const()[name = tensor<string, []>("op_551_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_551_cast_fp16 = einsum(equation = var_551_equation_0, values = (var_361_cast_fp16, var_263_cast_fp16))[name = tensor<string, []>("op_551_cast_fp16")];
+            tensor<fp16, []> var_552_to_fp16 = const()[name = tensor<string, []>("op_552_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_553_cast_fp16 = mul(x = var_551_cast_fp16, y = var_552_to_fp16)[name = tensor<string, []>("op_553_cast_fp16")];
+            tensor<string, []> var_555_equation_0 = const()[name = tensor<string, []>("op_555_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_555_cast_fp16 = einsum(equation = var_555_equation_0, values = (var_361_cast_fp16, var_267_cast_fp16))[name = tensor<string, []>("op_555_cast_fp16")];
+            tensor<fp16, []> var_556_to_fp16 = const()[name = tensor<string, []>("op_556_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_557_cast_fp16 = mul(x = var_555_cast_fp16, y = var_556_to_fp16)[name = tensor<string, []>("op_557_cast_fp16")];
+            tensor<string, []> var_559_equation_0 = const()[name = tensor<string, []>("op_559_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_559_cast_fp16 = einsum(equation = var_559_equation_0, values = (var_361_cast_fp16, var_271_cast_fp16))[name = tensor<string, []>("op_559_cast_fp16")];
+            tensor<fp16, []> var_560_to_fp16 = const()[name = tensor<string, []>("op_560_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_561_cast_fp16 = mul(x = var_559_cast_fp16, y = var_560_to_fp16)[name = tensor<string, []>("op_561_cast_fp16")];
+            tensor<string, []> var_563_equation_0 = const()[name = tensor<string, []>("op_563_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_563_cast_fp16 = einsum(equation = var_563_equation_0, values = (var_373_cast_fp16, var_275_cast_fp16))[name = tensor<string, []>("op_563_cast_fp16")];
+            tensor<fp16, []> var_564_to_fp16 = const()[name = tensor<string, []>("op_564_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_565_cast_fp16 = mul(x = var_563_cast_fp16, y = var_564_to_fp16)[name = tensor<string, []>("op_565_cast_fp16")];
+            tensor<string, []> var_567_equation_0 = const()[name = tensor<string, []>("op_567_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_567_cast_fp16 = einsum(equation = var_567_equation_0, values = (var_373_cast_fp16, var_279_cast_fp16))[name = tensor<string, []>("op_567_cast_fp16")];
+            tensor<fp16, []> var_568_to_fp16 = const()[name = tensor<string, []>("op_568_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_569_cast_fp16 = mul(x = var_567_cast_fp16, y = var_568_to_fp16)[name = tensor<string, []>("op_569_cast_fp16")];
+            tensor<string, []> var_571_equation_0 = const()[name = tensor<string, []>("op_571_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_571_cast_fp16 = einsum(equation = var_571_equation_0, values = (var_373_cast_fp16, var_283_cast_fp16))[name = tensor<string, []>("op_571_cast_fp16")];
+            tensor<fp16, []> var_572_to_fp16 = const()[name = tensor<string, []>("op_572_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_573_cast_fp16 = mul(x = var_571_cast_fp16, y = var_572_to_fp16)[name = tensor<string, []>("op_573_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_1_cast_fp16 = add(x = var_481_cast_fp16, y = mask)[name = tensor<string, []>("aw_1_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_3_cast_fp16 = add(x = var_485_cast_fp16, y = mask)[name = tensor<string, []>("aw_3_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_5_cast_fp16 = add(x = var_489_cast_fp16, y = mask)[name = tensor<string, []>("aw_5_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_7_cast_fp16 = add(x = var_493_cast_fp16, y = mask)[name = tensor<string, []>("aw_7_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_9_cast_fp16 = add(x = var_497_cast_fp16, y = mask)[name = tensor<string, []>("aw_9_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_11_cast_fp16 = add(x = var_501_cast_fp16, y = mask)[name = tensor<string, []>("aw_11_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_13_cast_fp16 = add(x = var_505_cast_fp16, y = mask)[name = tensor<string, []>("aw_13_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_15_cast_fp16 = add(x = var_509_cast_fp16, y = mask)[name = tensor<string, []>("aw_15_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_17_cast_fp16 = add(x = var_513_cast_fp16, y = mask)[name = tensor<string, []>("aw_17_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_19_cast_fp16 = add(x = var_517_cast_fp16, y = mask)[name = tensor<string, []>("aw_19_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_21_cast_fp16 = add(x = var_521_cast_fp16, y = mask)[name = tensor<string, []>("aw_21_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_23_cast_fp16 = add(x = var_525_cast_fp16, y = mask)[name = tensor<string, []>("aw_23_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_25_cast_fp16 = add(x = var_529_cast_fp16, y = mask)[name = tensor<string, []>("aw_25_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_27_cast_fp16 = add(x = var_533_cast_fp16, y = mask)[name = tensor<string, []>("aw_27_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_29_cast_fp16 = add(x = var_537_cast_fp16, y = mask)[name = tensor<string, []>("aw_29_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_31_cast_fp16 = add(x = var_541_cast_fp16, y = mask)[name = tensor<string, []>("aw_31_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_33_cast_fp16 = add(x = var_545_cast_fp16, y = mask)[name = tensor<string, []>("aw_33_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_35_cast_fp16 = add(x = var_549_cast_fp16, y = mask)[name = tensor<string, []>("aw_35_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_37_cast_fp16 = add(x = var_553_cast_fp16, y = mask)[name = tensor<string, []>("aw_37_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_39_cast_fp16 = add(x = var_557_cast_fp16, y = mask)[name = tensor<string, []>("aw_39_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_41_cast_fp16 = add(x = var_561_cast_fp16, y = mask)[name = tensor<string, []>("aw_41_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_43_cast_fp16 = add(x = var_565_cast_fp16, y = mask)[name = tensor<string, []>("aw_43_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_45_cast_fp16 = add(x = var_569_cast_fp16, y = mask)[name = tensor<string, []>("aw_45_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_47_cast_fp16 = add(x = var_573_cast_fp16, y = mask)[name = tensor<string, []>("aw_47_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_598_cast_fp16 = softmax(axis = var_52, x = aw_1_cast_fp16)[name = tensor<string, []>("op_598_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_599_cast_fp16 = softmax(axis = var_52, x = aw_3_cast_fp16)[name = tensor<string, []>("op_599_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_600_cast_fp16 = softmax(axis = var_52, x = aw_5_cast_fp16)[name = tensor<string, []>("op_600_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_601_cast_fp16 = softmax(axis = var_52, x = aw_7_cast_fp16)[name = tensor<string, []>("op_601_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_602_cast_fp16 = softmax(axis = var_52, x = aw_9_cast_fp16)[name = tensor<string, []>("op_602_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_603_cast_fp16 = softmax(axis = var_52, x = aw_11_cast_fp16)[name = tensor<string, []>("op_603_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_604_cast_fp16 = softmax(axis = var_52, x = aw_13_cast_fp16)[name = tensor<string, []>("op_604_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_605_cast_fp16 = softmax(axis = var_52, x = aw_15_cast_fp16)[name = tensor<string, []>("op_605_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_606_cast_fp16 = softmax(axis = var_52, x = aw_17_cast_fp16)[name = tensor<string, []>("op_606_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_607_cast_fp16 = softmax(axis = var_52, x = aw_19_cast_fp16)[name = tensor<string, []>("op_607_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_608_cast_fp16 = softmax(axis = var_52, x = aw_21_cast_fp16)[name = tensor<string, []>("op_608_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_609_cast_fp16 = softmax(axis = var_52, x = aw_23_cast_fp16)[name = tensor<string, []>("op_609_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_610_cast_fp16 = softmax(axis = var_52, x = aw_25_cast_fp16)[name = tensor<string, []>("op_610_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_611_cast_fp16 = softmax(axis = var_52, x = aw_27_cast_fp16)[name = tensor<string, []>("op_611_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_612_cast_fp16 = softmax(axis = var_52, x = aw_29_cast_fp16)[name = tensor<string, []>("op_612_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_613_cast_fp16 = softmax(axis = var_52, x = aw_31_cast_fp16)[name = tensor<string, []>("op_613_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_614_cast_fp16 = softmax(axis = var_52, x = aw_33_cast_fp16)[name = tensor<string, []>("op_614_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_615_cast_fp16 = softmax(axis = var_52, x = aw_35_cast_fp16)[name = tensor<string, []>("op_615_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_616_cast_fp16 = softmax(axis = var_52, x = aw_37_cast_fp16)[name = tensor<string, []>("op_616_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_617_cast_fp16 = softmax(axis = var_52, x = aw_39_cast_fp16)[name = tensor<string, []>("op_617_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_618_cast_fp16 = softmax(axis = var_52, x = aw_41_cast_fp16)[name = tensor<string, []>("op_618_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_619_cast_fp16 = softmax(axis = var_52, x = aw_43_cast_fp16)[name = tensor<string, []>("op_619_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_620_cast_fp16 = softmax(axis = var_52, x = aw_45_cast_fp16)[name = tensor<string, []>("op_620_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_621_cast_fp16 = softmax(axis = var_52, x = aw_47_cast_fp16)[name = tensor<string, []>("op_621_cast_fp16")];
+            tensor<string, []> var_623_equation_0 = const()[name = tensor<string, []>("op_623_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_623_cast_fp16 = einsum(equation = var_623_equation_0, values = (var_383_cast_fp16, var_598_cast_fp16))[name = tensor<string, []>("op_623_cast_fp16")];
+            tensor<string, []> var_625_equation_0 = const()[name = tensor<string, []>("op_625_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_625_cast_fp16 = einsum(equation = var_625_equation_0, values = (var_383_cast_fp16, var_599_cast_fp16))[name = tensor<string, []>("op_625_cast_fp16")];
+            tensor<string, []> var_627_equation_0 = const()[name = tensor<string, []>("op_627_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_627_cast_fp16 = einsum(equation = var_627_equation_0, values = (var_383_cast_fp16, var_600_cast_fp16))[name = tensor<string, []>("op_627_cast_fp16")];
+            tensor<string, []> var_629_equation_0 = const()[name = tensor<string, []>("op_629_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_629_cast_fp16 = einsum(equation = var_629_equation_0, values = (var_395_cast_fp16, var_601_cast_fp16))[name = tensor<string, []>("op_629_cast_fp16")];
+            tensor<string, []> var_631_equation_0 = const()[name = tensor<string, []>("op_631_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_631_cast_fp16 = einsum(equation = var_631_equation_0, values = (var_395_cast_fp16, var_602_cast_fp16))[name = tensor<string, []>("op_631_cast_fp16")];
+            tensor<string, []> var_633_equation_0 = const()[name = tensor<string, []>("op_633_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_633_cast_fp16 = einsum(equation = var_633_equation_0, values = (var_395_cast_fp16, var_603_cast_fp16))[name = tensor<string, []>("op_633_cast_fp16")];
+            tensor<string, []> var_635_equation_0 = const()[name = tensor<string, []>("op_635_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_635_cast_fp16 = einsum(equation = var_635_equation_0, values = (var_407_cast_fp16, var_604_cast_fp16))[name = tensor<string, []>("op_635_cast_fp16")];
+            tensor<string, []> var_637_equation_0 = const()[name = tensor<string, []>("op_637_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_637_cast_fp16 = einsum(equation = var_637_equation_0, values = (var_407_cast_fp16, var_605_cast_fp16))[name = tensor<string, []>("op_637_cast_fp16")];
+            tensor<string, []> var_639_equation_0 = const()[name = tensor<string, []>("op_639_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_639_cast_fp16 = einsum(equation = var_639_equation_0, values = (var_407_cast_fp16, var_606_cast_fp16))[name = tensor<string, []>("op_639_cast_fp16")];
+            tensor<string, []> var_641_equation_0 = const()[name = tensor<string, []>("op_641_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_641_cast_fp16 = einsum(equation = var_641_equation_0, values = (var_419_cast_fp16, var_607_cast_fp16))[name = tensor<string, []>("op_641_cast_fp16")];
+            tensor<string, []> var_643_equation_0 = const()[name = tensor<string, []>("op_643_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_643_cast_fp16 = einsum(equation = var_643_equation_0, values = (var_419_cast_fp16, var_608_cast_fp16))[name = tensor<string, []>("op_643_cast_fp16")];
+            tensor<string, []> var_645_equation_0 = const()[name = tensor<string, []>("op_645_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_645_cast_fp16 = einsum(equation = var_645_equation_0, values = (var_419_cast_fp16, var_609_cast_fp16))[name = tensor<string, []>("op_645_cast_fp16")];
+            tensor<string, []> var_647_equation_0 = const()[name = tensor<string, []>("op_647_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_647_cast_fp16 = einsum(equation = var_647_equation_0, values = (var_431_cast_fp16, var_610_cast_fp16))[name = tensor<string, []>("op_647_cast_fp16")];
+            tensor<string, []> var_649_equation_0 = const()[name = tensor<string, []>("op_649_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_649_cast_fp16 = einsum(equation = var_649_equation_0, values = (var_431_cast_fp16, var_611_cast_fp16))[name = tensor<string, []>("op_649_cast_fp16")];
+            tensor<string, []> var_651_equation_0 = const()[name = tensor<string, []>("op_651_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_651_cast_fp16 = einsum(equation = var_651_equation_0, values = (var_431_cast_fp16, var_612_cast_fp16))[name = tensor<string, []>("op_651_cast_fp16")];
+            tensor<string, []> var_653_equation_0 = const()[name = tensor<string, []>("op_653_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_653_cast_fp16 = einsum(equation = var_653_equation_0, values = (var_443_cast_fp16, var_613_cast_fp16))[name = tensor<string, []>("op_653_cast_fp16")];
+            tensor<string, []> var_655_equation_0 = const()[name = tensor<string, []>("op_655_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_655_cast_fp16 = einsum(equation = var_655_equation_0, values = (var_443_cast_fp16, var_614_cast_fp16))[name = tensor<string, []>("op_655_cast_fp16")];
+            tensor<string, []> var_657_equation_0 = const()[name = tensor<string, []>("op_657_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_657_cast_fp16 = einsum(equation = var_657_equation_0, values = (var_443_cast_fp16, var_615_cast_fp16))[name = tensor<string, []>("op_657_cast_fp16")];
+            tensor<string, []> var_659_equation_0 = const()[name = tensor<string, []>("op_659_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_659_cast_fp16 = einsum(equation = var_659_equation_0, values = (var_455_cast_fp16, var_616_cast_fp16))[name = tensor<string, []>("op_659_cast_fp16")];
+            tensor<string, []> var_661_equation_0 = const()[name = tensor<string, []>("op_661_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_661_cast_fp16 = einsum(equation = var_661_equation_0, values = (var_455_cast_fp16, var_617_cast_fp16))[name = tensor<string, []>("op_661_cast_fp16")];
+            tensor<string, []> var_663_equation_0 = const()[name = tensor<string, []>("op_663_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_663_cast_fp16 = einsum(equation = var_663_equation_0, values = (var_455_cast_fp16, var_618_cast_fp16))[name = tensor<string, []>("op_663_cast_fp16")];
+            tensor<string, []> var_665_equation_0 = const()[name = tensor<string, []>("op_665_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_665_cast_fp16 = einsum(equation = var_665_equation_0, values = (var_467_cast_fp16, var_619_cast_fp16))[name = tensor<string, []>("op_665_cast_fp16")];
+            tensor<string, []> var_667_equation_0 = const()[name = tensor<string, []>("op_667_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_667_cast_fp16 = einsum(equation = var_667_equation_0, values = (var_467_cast_fp16, var_620_cast_fp16))[name = tensor<string, []>("op_667_cast_fp16")];
+            tensor<string, []> var_669_equation_0 = const()[name = tensor<string, []>("op_669_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_669_cast_fp16 = einsum(equation = var_669_equation_0, values = (var_467_cast_fp16, var_621_cast_fp16))[name = tensor<string, []>("op_669_cast_fp16")];
+            tensor<bool, []> x_11_interleave_0 = const()[name = tensor<string, []>("x_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_11_cast_fp16 = concat(axis = var_52, interleave = x_11_interleave_0, values = (var_623_cast_fp16, var_625_cast_fp16, var_627_cast_fp16, var_629_cast_fp16, var_631_cast_fp16, var_633_cast_fp16, var_635_cast_fp16, var_637_cast_fp16, var_639_cast_fp16, var_641_cast_fp16, var_643_cast_fp16, var_645_cast_fp16, var_647_cast_fp16, var_649_cast_fp16, var_651_cast_fp16, var_653_cast_fp16, var_655_cast_fp16, var_657_cast_fp16, var_659_cast_fp16, var_661_cast_fp16, var_663_cast_fp16, var_665_cast_fp16, var_667_cast_fp16, var_669_cast_fp16))[name = tensor<string, []>("x_11_cast_fp16")];
+            tensor<int32, [4]> var_674 = const()[name = tensor<string, []>("op_674"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_3_cast_fp16 = reshape(shape = var_674, x = x_11_cast_fp16)[name = tensor<string, []>("input_3_cast_fp16")];
+            tensor<int32, [2]> var_677 = const()[name = tensor<string, []>("op_677"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_679 = const()[name = tensor<string, []>("op_679"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_1_pad_type_0 = const()[name = tensor<string, []>("attention_output_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_1_pad_0 = const()[name = tensor<string, []>("attention_output_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31463936)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_1_cast_fp16 = conv(dilations = var_679, groups = var_52, pad = attention_output_1_pad_0, pad_type = attention_output_1_pad_type_0, strides = var_677, weight = blocks_0_attn_proj_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_13_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = tensor<string, []>("x_13_cast_fp16")];
+            tensor<bool, []> x_eps_3_interleave_0 = const()[name = tensor<string, []>("x_eps_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_3_to_fp16 = const()[name = tensor<string, []>("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338368)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_3_cast_fp16 = concat(axis = var_52, interleave = x_eps_3_interleave_0, values = (x_13_cast_fp16, eps_chan_3_to_fp16))[name = tensor<string, []>("x_eps_3_cast_fp16")];
+            tensor<int32, [1]> norm_x_3_axes_0 = const()[name = tensor<string, []>("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_55, x = x_eps_3_cast_fp16)[name = tensor<string, []>("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_7_cast_fp16 = real_div(x = x_13_cast_fp16, y = norm_x_3_cast_fp16)[name = tensor<string, []>("x_normed_7_cast_fp16")];
+            tensor<fp16, []> var_705_to_fp16 = const()[name = tensor<string, []>("op_705_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_705_to_fp16)[name = tensor<string, []>("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338560)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_5_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
+            tensor<int32, [2]> var_716 = const()[name = tensor<string, []>("op_716"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_718 = const()[name = tensor<string, []>("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_7_pad_type_0 = const()[name = tensor<string, []>("input_7_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_7_pad_0 = const()[name = tensor<string, []>("input_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50344768)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_7_cast_fp16 = conv(dilations = var_718, groups = var_52, pad = input_7_pad_0, pad_type = input_7_pad_type_0, strides = var_716, weight = blocks_0_mlp_fc_1_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
+            tensor<int32, [2]> var_722 = const()[name = tensor<string, []>("op_722"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_724 = const()[name = tensor<string, []>("op_724"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_1_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_1_pad_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(100676480)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_1_cast_fp16 = conv(dilations = var_724, groups = var_52, pad = x_fc_2_1_pad_0, pad_type = x_fc_2_1_pad_type_0, strides = var_722, weight = blocks_0_mlp_fc_2_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_727_cast_fp16 = silu(x = input_7_cast_fp16)[name = tensor<string, []>("op_727_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_9_cast_fp16 = mul(x = var_727_cast_fp16, y = x_fc_2_1_cast_fp16)[name = tensor<string, []>("input_9_cast_fp16")];
+            tensor<int32, [2]> var_730 = const()[name = tensor<string, []>("op_730"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_732 = const()[name = tensor<string, []>("op_732"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_734_pad_type_0 = const()[name = tensor<string, []>("op_734_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_734_pad_0 = const()[name = tensor<string, []>("op_734_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_0_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151008192)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_734_cast_fp16 = conv(dilations = var_732, groups = var_52, pad = var_734_pad_0, pad_type = var_734_pad_type_0, strides = var_730, weight = blocks_0_mlp_proj_weight_to_fp16, x = input_9_cast_fp16)[name = tensor<string, []>("op_734_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_17_cast_fp16 = add(x = var_734_cast_fp16, y = x_13_cast_fp16)[name = tensor<string, []>("x_17_cast_fp16")];
+            tensor<int32, []> var_740 = const()[name = tensor<string, []>("op_740"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_744 = const()[name = tensor<string, []>("op_744"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_746 = const()[name = tensor<string, []>("op_746"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_779 = const()[name = tensor<string, []>("op_779"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_782 = const()[name = tensor<string, []>("op_782"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_5_interleave_0 = const()[name = tensor<string, []>("x_eps_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_5_to_fp16 = const()[name = tensor<string, []>("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201339904)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_5_cast_fp16 = concat(axis = var_779, interleave = x_eps_5_interleave_0, values = (x_17_cast_fp16, eps_chan_5_to_fp16))[name = tensor<string, []>("x_eps_5_cast_fp16")];
+            tensor<int32, [1]> norm_x_5_axes_0 = const()[name = tensor<string, []>("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_782, x = x_eps_5_cast_fp16)[name = tensor<string, []>("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_13_cast_fp16 = real_div(x = x_17_cast_fp16, y = norm_x_5_cast_fp16)[name = tensor<string, []>("x_normed_13_cast_fp16")];
+            tensor<fp16, []> var_805_to_fp16 = const()[name = tensor<string, []>("op_805_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_805_to_fp16)[name = tensor<string, []>("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201340096)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_21_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = tensor<string, []>("x_21_cast_fp16")];
+            tensor<int32, [4]> var_829 = const()[name = tensor<string, []>("op_829"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_11_cast_fp16 = reshape(shape = var_829, x = x_21_cast_fp16)[name = tensor<string, []>("input_11_cast_fp16")];
+            tensor<int32, [2]> var_832 = const()[name = tensor<string, []>("op_832"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_834 = const()[name = tensor<string, []>("op_834"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_9_pad_type_0 = const()[name = tensor<string, []>("q_9_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_9_pad_0 = const()[name = tensor<string, []>("q_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201346304)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_9_cast_fp16 = conv(dilations = var_834, groups = var_779, pad = q_9_pad_0, pad_type = q_9_pad_type_0, strides = var_832, weight = blocks_1_attn_q_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("q_9_cast_fp16")];
+            tensor<int32, [2]> var_838 = const()[name = tensor<string, []>("op_838"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_840 = const()[name = tensor<string, []>("op_840"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_13_pad_type_0 = const()[name = tensor<string, []>("k_13_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_13_pad_0 = const()[name = tensor<string, []>("k_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(220220736)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_13_cast_fp16 = conv(dilations = var_840, groups = var_779, pad = k_13_pad_0, pad_type = k_13_pad_type_0, strides = var_838, weight = blocks_1_attn_k_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("k_13_cast_fp16")];
+            tensor<int32, [2]> var_844 = const()[name = tensor<string, []>("op_844"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_846 = const()[name = tensor<string, []>("op_846"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_11_pad_type_0 = const()[name = tensor<string, []>("v_11_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_11_pad_0 = const()[name = tensor<string, []>("v_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226512256)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_11_cast_fp16 = conv(dilations = var_846, groups = var_779, pad = v_11_pad_0, pad_type = v_11_pad_type_0, strides = var_844, weight = blocks_1_attn_v_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("v_11_cast_fp16")];
+            tensor<int32, [4]> var_849 = const()[name = tensor<string, []>("op_849"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_11_cast_fp16 = reshape(shape = var_849, x = q_9_cast_fp16)[name = tensor<string, []>("q_11_cast_fp16")];
+            tensor<int32, [4]> var_851 = const()[name = tensor<string, []>("op_851"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_15_cast_fp16 = reshape(shape = var_851, x = k_13_cast_fp16)[name = tensor<string, []>("k_15_cast_fp16")];
+            tensor<int32, [4]> var_865_begin_0 = const()[name = tensor<string, []>("op_865_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_865_end_0 = const()[name = tensor<string, []>("op_865_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_865_end_mask_0 = const()[name = tensor<string, []>("op_865_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_865_cast_fp16 = slice_by_index(begin = var_865_begin_0, end = var_865_end_0, end_mask = var_865_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_865_cast_fp16")];
+            tensor<int32, [4]> var_871_begin_0 = const()[name = tensor<string, []>("op_871_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_871_end_0 = const()[name = tensor<string, []>("op_871_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_871_end_mask_0 = const()[name = tensor<string, []>("op_871_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_871_cast_fp16 = slice_by_index(begin = var_871_begin_0, end = var_871_end_0, end_mask = var_871_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_871_cast_fp16")];
+            tensor<fp16, []> const_30_promoted_to_fp16 = const()[name = tensor<string, []>("const_30_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_873_cast_fp16 = mul(x = var_871_cast_fp16, y = const_30_promoted_to_fp16)[name = tensor<string, []>("op_873_cast_fp16")];
+            tensor<bool, []> rotated_5_interleave_0 = const()[name = tensor<string, []>("rotated_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_5_cast_fp16 = concat(axis = var_744, interleave = rotated_5_interleave_0, values = (var_873_cast_fp16, var_865_cast_fp16))[name = tensor<string, []>("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_876_cast_fp16 = mul(x = q_11_cast_fp16, y = cos)[name = tensor<string, []>("op_876_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_877_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = tensor<string, []>("op_877_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_5_cast_fp16 = add(x = var_876_cast_fp16, y = var_877_cast_fp16)[name = tensor<string, []>("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_890_begin_0 = const()[name = tensor<string, []>("op_890_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_890_end_0 = const()[name = tensor<string, []>("op_890_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_890_end_mask_0 = const()[name = tensor<string, []>("op_890_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_890_cast_fp16 = slice_by_index(begin = var_890_begin_0, end = var_890_end_0, end_mask = var_890_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_890_cast_fp16")];
+            tensor<int32, [4]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_896_cast_fp16 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_896_cast_fp16")];
+            tensor<fp16, []> const_32_promoted_to_fp16 = const()[name = tensor<string, []>("const_32_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_898_cast_fp16 = mul(x = var_896_cast_fp16, y = const_32_promoted_to_fp16)[name = tensor<string, []>("op_898_cast_fp16")];
+            tensor<bool, []> rotated_interleave_0 = const()[name = tensor<string, []>("rotated_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_cast_fp16 = concat(axis = var_744, interleave = rotated_interleave_0, values = (var_898_cast_fp16, var_890_cast_fp16))[name = tensor<string, []>("rotated_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_901_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = tensor<string, []>("op_901_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_902_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = tensor<string, []>("op_902_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_cast_fp16 = add(x = var_901_cast_fp16, y = var_902_cast_fp16)[name = tensor<string, []>("roped_cast_fp16")];
+            tensor<int32, [4]> var_905 = const()[name = tensor<string, []>("op_905"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_19_cast_fp16 = reshape(shape = var_905, x = roped_cast_fp16)[name = tensor<string, []>("k_19_cast_fp16")];
+            tensor<int32, [4]> var_907 = const()[name = tensor<string, []>("op_907"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_1 = reshape(shape = var_907, x = v_11_cast_fp16)[name = tensor<string, []>("new_v_cache_1_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_interleave_0 = const()[name = tensor<string, []>("k_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_1 = transpose(perm = k_21_perm_0, x = k_19_cast_fp16)[name = tensor<string, []>("transpose_0")];
+            tensor<fp16, [1, 512, 1, 1024]> k_cast_fp16 = concat(axis = var_746, interleave = k_interleave_0, values = (k_cache_1, new_k_cache_1))[name = tensor<string, []>("k_cast_fp16")];
+            tensor<bool, []> v_17_interleave_0 = const()[name = tensor<string, []>("v_17_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_17_cast_fp16 = concat(axis = var_740, interleave = v_17_interleave_0, values = (v_cache_1, new_v_cache_1))[name = tensor<string, []>("v_17_cast_fp16")];
+            tensor<int32, [4]> var_915 = const()[name = tensor<string, []>("op_915"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_cast_fp16 = reshape(shape = var_915, x = roped_5_cast_fp16)[name = tensor<string, []>("q_cast_fp16")];
+            tensor<int32, [4]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_920_cast_fp16 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_920_cast_fp16")];
+            tensor<int32, [4]> var_924_begin_0 = const()[name = tensor<string, []>("op_924_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_924_end_0 = const()[name = tensor<string, []>("op_924_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_924_end_mask_0 = const()[name = tensor<string, []>("op_924_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_924_cast_fp16 = slice_by_index(begin = var_924_begin_0, end = var_924_end_0, end_mask = var_924_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_924_cast_fp16")];
+            tensor<int32, [4]> var_928_begin_0 = const()[name = tensor<string, []>("op_928_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_928_end_0 = const()[name = tensor<string, []>("op_928_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_928_end_mask_0 = const()[name = tensor<string, []>("op_928_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_928_cast_fp16 = slice_by_index(begin = var_928_begin_0, end = var_928_end_0, end_mask = var_928_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_928_cast_fp16")];
+            tensor<int32, [4]> var_932_begin_0 = const()[name = tensor<string, []>("op_932_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_932_end_0 = const()[name = tensor<string, []>("op_932_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_932_end_mask_0 = const()[name = tensor<string, []>("op_932_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_932_cast_fp16 = slice_by_index(begin = var_932_begin_0, end = var_932_end_0, end_mask = var_932_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_932_cast_fp16")];
+            tensor<int32, [4]> var_936_begin_0 = const()[name = tensor<string, []>("op_936_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_936_end_0 = const()[name = tensor<string, []>("op_936_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_936_end_mask_0 = const()[name = tensor<string, []>("op_936_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_936_cast_fp16 = slice_by_index(begin = var_936_begin_0, end = var_936_end_0, end_mask = var_936_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_936_cast_fp16")];
+            tensor<int32, [4]> var_940_begin_0 = const()[name = tensor<string, []>("op_940_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_940_end_0 = const()[name = tensor<string, []>("op_940_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_940_end_mask_0 = const()[name = tensor<string, []>("op_940_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_940_cast_fp16 = slice_by_index(begin = var_940_begin_0, end = var_940_end_0, end_mask = var_940_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_940_cast_fp16")];
+            tensor<int32, [4]> var_944_begin_0 = const()[name = tensor<string, []>("op_944_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_944_end_0 = const()[name = tensor<string, []>("op_944_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_944_end_mask_0 = const()[name = tensor<string, []>("op_944_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_944_cast_fp16 = slice_by_index(begin = var_944_begin_0, end = var_944_end_0, end_mask = var_944_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_944_cast_fp16")];
+            tensor<int32, [4]> var_948_begin_0 = const()[name = tensor<string, []>("op_948_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_948_end_0 = const()[name = tensor<string, []>("op_948_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_948_end_mask_0 = const()[name = tensor<string, []>("op_948_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_948_cast_fp16 = slice_by_index(begin = var_948_begin_0, end = var_948_end_0, end_mask = var_948_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_948_cast_fp16")];
+            tensor<int32, [4]> var_952_begin_0 = const()[name = tensor<string, []>("op_952_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_952_end_0 = const()[name = tensor<string, []>("op_952_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_952_end_mask_0 = const()[name = tensor<string, []>("op_952_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_952_cast_fp16 = slice_by_index(begin = var_952_begin_0, end = var_952_end_0, end_mask = var_952_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_952_cast_fp16")];
+            tensor<int32, [4]> var_956_begin_0 = const()[name = tensor<string, []>("op_956_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_956_end_0 = const()[name = tensor<string, []>("op_956_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_956_end_mask_0 = const()[name = tensor<string, []>("op_956_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_956_cast_fp16 = slice_by_index(begin = var_956_begin_0, end = var_956_end_0, end_mask = var_956_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_956_cast_fp16")];
+            tensor<int32, [4]> var_960_begin_0 = const()[name = tensor<string, []>("op_960_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_960_end_0 = const()[name = tensor<string, []>("op_960_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_960_end_mask_0 = const()[name = tensor<string, []>("op_960_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_960_cast_fp16 = slice_by_index(begin = var_960_begin_0, end = var_960_end_0, end_mask = var_960_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_960_cast_fp16")];
+            tensor<int32, [4]> var_964_begin_0 = const()[name = tensor<string, []>("op_964_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_964_end_0 = const()[name = tensor<string, []>("op_964_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_964_end_mask_0 = const()[name = tensor<string, []>("op_964_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_964_cast_fp16 = slice_by_index(begin = var_964_begin_0, end = var_964_end_0, end_mask = var_964_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_964_cast_fp16")];
+            tensor<int32, [4]> var_968_begin_0 = const()[name = tensor<string, []>("op_968_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_968_end_0 = const()[name = tensor<string, []>("op_968_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_968_end_mask_0 = const()[name = tensor<string, []>("op_968_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_968_cast_fp16 = slice_by_index(begin = var_968_begin_0, end = var_968_end_0, end_mask = var_968_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_968_cast_fp16")];
+            tensor<int32, [4]> var_972_begin_0 = const()[name = tensor<string, []>("op_972_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_972_end_0 = const()[name = tensor<string, []>("op_972_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_972_end_mask_0 = const()[name = tensor<string, []>("op_972_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_972_cast_fp16 = slice_by_index(begin = var_972_begin_0, end = var_972_end_0, end_mask = var_972_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_972_cast_fp16")];
+            tensor<int32, [4]> var_976_begin_0 = const()[name = tensor<string, []>("op_976_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_976_end_0 = const()[name = tensor<string, []>("op_976_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_976_end_mask_0 = const()[name = tensor<string, []>("op_976_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_976_cast_fp16 = slice_by_index(begin = var_976_begin_0, end = var_976_end_0, end_mask = var_976_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_976_cast_fp16")];
+            tensor<int32, [4]> var_980_begin_0 = const()[name = tensor<string, []>("op_980_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_980_end_0 = const()[name = tensor<string, []>("op_980_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_980_end_mask_0 = const()[name = tensor<string, []>("op_980_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_980_cast_fp16 = slice_by_index(begin = var_980_begin_0, end = var_980_end_0, end_mask = var_980_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_980_cast_fp16")];
+            tensor<int32, [4]> var_984_begin_0 = const()[name = tensor<string, []>("op_984_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_984_end_0 = const()[name = tensor<string, []>("op_984_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_984_end_mask_0 = const()[name = tensor<string, []>("op_984_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_984_cast_fp16 = slice_by_index(begin = var_984_begin_0, end = var_984_end_0, end_mask = var_984_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_984_cast_fp16")];
+            tensor<int32, [4]> var_988_begin_0 = const()[name = tensor<string, []>("op_988_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_988_end_0 = const()[name = tensor<string, []>("op_988_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_988_end_mask_0 = const()[name = tensor<string, []>("op_988_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_988_cast_fp16 = slice_by_index(begin = var_988_begin_0, end = var_988_end_0, end_mask = var_988_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_988_cast_fp16")];
+            tensor<int32, [4]> var_992_begin_0 = const()[name = tensor<string, []>("op_992_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_992_end_0 = const()[name = tensor<string, []>("op_992_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_992_end_mask_0 = const()[name = tensor<string, []>("op_992_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_992_cast_fp16 = slice_by_index(begin = var_992_begin_0, end = var_992_end_0, end_mask = var_992_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_992_cast_fp16")];
+            tensor<int32, [4]> var_996_begin_0 = const()[name = tensor<string, []>("op_996_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_996_end_0 = const()[name = tensor<string, []>("op_996_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_996_end_mask_0 = const()[name = tensor<string, []>("op_996_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_996_cast_fp16 = slice_by_index(begin = var_996_begin_0, end = var_996_end_0, end_mask = var_996_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_996_cast_fp16")];
+            tensor<int32, [4]> var_1000_begin_0 = const()[name = tensor<string, []>("op_1000_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_1000_end_0 = const()[name = tensor<string, []>("op_1000_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_1000_end_mask_0 = const()[name = tensor<string, []>("op_1000_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1000_cast_fp16 = slice_by_index(begin = var_1000_begin_0, end = var_1000_end_0, end_mask = var_1000_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1000_cast_fp16")];
+            tensor<int32, [4]> var_1004_begin_0 = const()[name = tensor<string, []>("op_1004_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_1004_end_0 = const()[name = tensor<string, []>("op_1004_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_1004_end_mask_0 = const()[name = tensor<string, []>("op_1004_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1004_cast_fp16 = slice_by_index(begin = var_1004_begin_0, end = var_1004_end_0, end_mask = var_1004_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1004_cast_fp16")];
+            tensor<int32, [4]> var_1008_begin_0 = const()[name = tensor<string, []>("op_1008_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_1008_end_0 = const()[name = tensor<string, []>("op_1008_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_1008_end_mask_0 = const()[name = tensor<string, []>("op_1008_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1008_cast_fp16 = slice_by_index(begin = var_1008_begin_0, end = var_1008_end_0, end_mask = var_1008_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1008_cast_fp16")];
+            tensor<int32, [4]> var_1012_begin_0 = const()[name = tensor<string, []>("op_1012_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_1012_end_0 = const()[name = tensor<string, []>("op_1012_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_1012_end_mask_0 = const()[name = tensor<string, []>("op_1012_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1012_cast_fp16 = slice_by_index(begin = var_1012_begin_0, end = var_1012_end_0, end_mask = var_1012_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1012_cast_fp16")];
+            tensor<int32, [4]> var_1018_begin_0 = const()[name = tensor<string, []>("op_1018_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1018_end_0 = const()[name = tensor<string, []>("op_1018_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_1018_end_mask_0 = const()[name = tensor<string, []>("op_1018_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1018_cast_fp16 = slice_by_index(begin = var_1018_begin_0, end = var_1018_end_0, end_mask = var_1018_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1018_cast_fp16")];
+            tensor<int32, [4]> var_1030_begin_0 = const()[name = tensor<string, []>("op_1030_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_1030_end_0 = const()[name = tensor<string, []>("op_1030_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_1030_end_mask_0 = const()[name = tensor<string, []>("op_1030_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1030_cast_fp16 = slice_by_index(begin = var_1030_begin_0, end = var_1030_end_0, end_mask = var_1030_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1030_cast_fp16")];
+            tensor<int32, [4]> var_1042_begin_0 = const()[name = tensor<string, []>("op_1042_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_1042_end_0 = const()[name = tensor<string, []>("op_1042_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_1042_end_mask_0 = const()[name = tensor<string, []>("op_1042_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1042_cast_fp16 = slice_by_index(begin = var_1042_begin_0, end = var_1042_end_0, end_mask = var_1042_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1042_cast_fp16")];
+            tensor<int32, [4]> var_1054_begin_0 = const()[name = tensor<string, []>("op_1054_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_1054_end_0 = const()[name = tensor<string, []>("op_1054_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1054_end_mask_0 = const()[name = tensor<string, []>("op_1054_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1054_cast_fp16 = slice_by_index(begin = var_1054_begin_0, end = var_1054_end_0, end_mask = var_1054_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1054_cast_fp16")];
+            tensor<int32, [4]> var_1066_begin_0 = const()[name = tensor<string, []>("op_1066_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_1066_end_0 = const()[name = tensor<string, []>("op_1066_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_1066_end_mask_0 = const()[name = tensor<string, []>("op_1066_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1066_cast_fp16 = slice_by_index(begin = var_1066_begin_0, end = var_1066_end_0, end_mask = var_1066_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1066_cast_fp16")];
+            tensor<int32, [4]> var_1078_begin_0 = const()[name = tensor<string, []>("op_1078_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_1078_end_0 = const()[name = tensor<string, []>("op_1078_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_1078_end_mask_0 = const()[name = tensor<string, []>("op_1078_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1078_cast_fp16 = slice_by_index(begin = var_1078_begin_0, end = var_1078_end_0, end_mask = var_1078_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1078_cast_fp16")];
+            tensor<int32, [4]> var_1090_begin_0 = const()[name = tensor<string, []>("op_1090_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_1090_end_0 = const()[name = tensor<string, []>("op_1090_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_1090_end_mask_0 = const()[name = tensor<string, []>("op_1090_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1090_cast_fp16 = slice_by_index(begin = var_1090_begin_0, end = var_1090_end_0, end_mask = var_1090_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1090_cast_fp16")];
+            tensor<int32, [4]> var_1102_begin_0 = const()[name = tensor<string, []>("op_1102_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_1102_end_0 = const()[name = tensor<string, []>("op_1102_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_1102_end_mask_0 = const()[name = tensor<string, []>("op_1102_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1102_cast_fp16 = slice_by_index(begin = var_1102_begin_0, end = var_1102_end_0, end_mask = var_1102_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1102_cast_fp16")];
+            tensor<int32, [4]> var_1112_begin_0 = const()[name = tensor<string, []>("op_1112_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1112_end_0 = const()[name = tensor<string, []>("op_1112_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_1112_end_mask_0 = const()[name = tensor<string, []>("op_1112_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1112_cast_fp16 = slice_by_index(begin = var_1112_begin_0, end = var_1112_end_0, end_mask = var_1112_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1112_cast_fp16")];
+            tensor<int32, [4]> var_1124_begin_0 = const()[name = tensor<string, []>("op_1124_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_1124_end_0 = const()[name = tensor<string, []>("op_1124_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_1124_end_mask_0 = const()[name = tensor<string, []>("op_1124_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1124_cast_fp16 = slice_by_index(begin = var_1124_begin_0, end = var_1124_end_0, end_mask = var_1124_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1124_cast_fp16")];
+            tensor<int32, [4]> var_1136_begin_0 = const()[name = tensor<string, []>("op_1136_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_1136_end_0 = const()[name = tensor<string, []>("op_1136_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_1136_end_mask_0 = const()[name = tensor<string, []>("op_1136_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1136_cast_fp16 = slice_by_index(begin = var_1136_begin_0, end = var_1136_end_0, end_mask = var_1136_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1136_cast_fp16")];
+            tensor<int32, [4]> var_1148_begin_0 = const()[name = tensor<string, []>("op_1148_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_1148_end_0 = const()[name = tensor<string, []>("op_1148_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1148_end_mask_0 = const()[name = tensor<string, []>("op_1148_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1148_cast_fp16 = slice_by_index(begin = var_1148_begin_0, end = var_1148_end_0, end_mask = var_1148_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1148_cast_fp16")];
+            tensor<int32, [4]> var_1160_begin_0 = const()[name = tensor<string, []>("op_1160_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_1160_end_0 = const()[name = tensor<string, []>("op_1160_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_1160_end_mask_0 = const()[name = tensor<string, []>("op_1160_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1160_cast_fp16 = slice_by_index(begin = var_1160_begin_0, end = var_1160_end_0, end_mask = var_1160_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1160_cast_fp16")];
+            tensor<int32, [4]> var_1172_begin_0 = const()[name = tensor<string, []>("op_1172_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_1172_end_0 = const()[name = tensor<string, []>("op_1172_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_1172_end_mask_0 = const()[name = tensor<string, []>("op_1172_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1172_cast_fp16 = slice_by_index(begin = var_1172_begin_0, end = var_1172_end_0, end_mask = var_1172_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1172_cast_fp16")];
+            tensor<int32, [4]> var_1184_begin_0 = const()[name = tensor<string, []>("op_1184_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_1184_end_0 = const()[name = tensor<string, []>("op_1184_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_1184_end_mask_0 = const()[name = tensor<string, []>("op_1184_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1184_cast_fp16 = slice_by_index(begin = var_1184_begin_0, end = var_1184_end_0, end_mask = var_1184_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1184_cast_fp16")];
+            tensor<int32, [4]> var_1196_begin_0 = const()[name = tensor<string, []>("op_1196_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_1196_end_0 = const()[name = tensor<string, []>("op_1196_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_1196_end_mask_0 = const()[name = tensor<string, []>("op_1196_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1196_cast_fp16 = slice_by_index(begin = var_1196_begin_0, end = var_1196_end_0, end_mask = var_1196_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1196_cast_fp16")];
+            tensor<string, []> var_1208_equation_0 = const()[name = tensor<string, []>("op_1208_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1208_cast_fp16 = einsum(equation = var_1208_equation_0, values = (var_1018_cast_fp16, var_920_cast_fp16))[name = tensor<string, []>("op_1208_cast_fp16")];
+            tensor<fp16, []> var_1209_to_fp16 = const()[name = tensor<string, []>("op_1209_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1210_cast_fp16 = mul(x = var_1208_cast_fp16, y = var_1209_to_fp16)[name = tensor<string, []>("op_1210_cast_fp16")];
+            tensor<string, []> var_1212_equation_0 = const()[name = tensor<string, []>("op_1212_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1212_cast_fp16 = einsum(equation = var_1212_equation_0, values = (var_1018_cast_fp16, var_924_cast_fp16))[name = tensor<string, []>("op_1212_cast_fp16")];
+            tensor<fp16, []> var_1213_to_fp16 = const()[name = tensor<string, []>("op_1213_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1214_cast_fp16 = mul(x = var_1212_cast_fp16, y = var_1213_to_fp16)[name = tensor<string, []>("op_1214_cast_fp16")];
+            tensor<string, []> var_1216_equation_0 = const()[name = tensor<string, []>("op_1216_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1216_cast_fp16 = einsum(equation = var_1216_equation_0, values = (var_1018_cast_fp16, var_928_cast_fp16))[name = tensor<string, []>("op_1216_cast_fp16")];
+            tensor<fp16, []> var_1217_to_fp16 = const()[name = tensor<string, []>("op_1217_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1218_cast_fp16 = mul(x = var_1216_cast_fp16, y = var_1217_to_fp16)[name = tensor<string, []>("op_1218_cast_fp16")];
+            tensor<string, []> var_1220_equation_0 = const()[name = tensor<string, []>("op_1220_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1220_cast_fp16 = einsum(equation = var_1220_equation_0, values = (var_1030_cast_fp16, var_932_cast_fp16))[name = tensor<string, []>("op_1220_cast_fp16")];
+            tensor<fp16, []> var_1221_to_fp16 = const()[name = tensor<string, []>("op_1221_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1222_cast_fp16 = mul(x = var_1220_cast_fp16, y = var_1221_to_fp16)[name = tensor<string, []>("op_1222_cast_fp16")];
+            tensor<string, []> var_1224_equation_0 = const()[name = tensor<string, []>("op_1224_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1224_cast_fp16 = einsum(equation = var_1224_equation_0, values = (var_1030_cast_fp16, var_936_cast_fp16))[name = tensor<string, []>("op_1224_cast_fp16")];
+            tensor<fp16, []> var_1225_to_fp16 = const()[name = tensor<string, []>("op_1225_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1226_cast_fp16 = mul(x = var_1224_cast_fp16, y = var_1225_to_fp16)[name = tensor<string, []>("op_1226_cast_fp16")];
+            tensor<string, []> var_1228_equation_0 = const()[name = tensor<string, []>("op_1228_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1228_cast_fp16 = einsum(equation = var_1228_equation_0, values = (var_1030_cast_fp16, var_940_cast_fp16))[name = tensor<string, []>("op_1228_cast_fp16")];
+            tensor<fp16, []> var_1229_to_fp16 = const()[name = tensor<string, []>("op_1229_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1230_cast_fp16 = mul(x = var_1228_cast_fp16, y = var_1229_to_fp16)[name = tensor<string, []>("op_1230_cast_fp16")];
+            tensor<string, []> var_1232_equation_0 = const()[name = tensor<string, []>("op_1232_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1232_cast_fp16 = einsum(equation = var_1232_equation_0, values = (var_1042_cast_fp16, var_944_cast_fp16))[name = tensor<string, []>("op_1232_cast_fp16")];
+            tensor<fp16, []> var_1233_to_fp16 = const()[name = tensor<string, []>("op_1233_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1234_cast_fp16 = mul(x = var_1232_cast_fp16, y = var_1233_to_fp16)[name = tensor<string, []>("op_1234_cast_fp16")];
+            tensor<string, []> var_1236_equation_0 = const()[name = tensor<string, []>("op_1236_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1236_cast_fp16 = einsum(equation = var_1236_equation_0, values = (var_1042_cast_fp16, var_948_cast_fp16))[name = tensor<string, []>("op_1236_cast_fp16")];
+            tensor<fp16, []> var_1237_to_fp16 = const()[name = tensor<string, []>("op_1237_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1238_cast_fp16 = mul(x = var_1236_cast_fp16, y = var_1237_to_fp16)[name = tensor<string, []>("op_1238_cast_fp16")];
+            tensor<string, []> var_1240_equation_0 = const()[name = tensor<string, []>("op_1240_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1240_cast_fp16 = einsum(equation = var_1240_equation_0, values = (var_1042_cast_fp16, var_952_cast_fp16))[name = tensor<string, []>("op_1240_cast_fp16")];
+            tensor<fp16, []> var_1241_to_fp16 = const()[name = tensor<string, []>("op_1241_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1242_cast_fp16 = mul(x = var_1240_cast_fp16, y = var_1241_to_fp16)[name = tensor<string, []>("op_1242_cast_fp16")];
+            tensor<string, []> var_1244_equation_0 = const()[name = tensor<string, []>("op_1244_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1244_cast_fp16 = einsum(equation = var_1244_equation_0, values = (var_1054_cast_fp16, var_956_cast_fp16))[name = tensor<string, []>("op_1244_cast_fp16")];
+            tensor<fp16, []> var_1245_to_fp16 = const()[name = tensor<string, []>("op_1245_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1246_cast_fp16 = mul(x = var_1244_cast_fp16, y = var_1245_to_fp16)[name = tensor<string, []>("op_1246_cast_fp16")];
+            tensor<string, []> var_1248_equation_0 = const()[name = tensor<string, []>("op_1248_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1248_cast_fp16 = einsum(equation = var_1248_equation_0, values = (var_1054_cast_fp16, var_960_cast_fp16))[name = tensor<string, []>("op_1248_cast_fp16")];
+            tensor<fp16, []> var_1249_to_fp16 = const()[name = tensor<string, []>("op_1249_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1250_cast_fp16 = mul(x = var_1248_cast_fp16, y = var_1249_to_fp16)[name = tensor<string, []>("op_1250_cast_fp16")];
+            tensor<string, []> var_1252_equation_0 = const()[name = tensor<string, []>("op_1252_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1252_cast_fp16 = einsum(equation = var_1252_equation_0, values = (var_1054_cast_fp16, var_964_cast_fp16))[name = tensor<string, []>("op_1252_cast_fp16")];
+            tensor<fp16, []> var_1253_to_fp16 = const()[name = tensor<string, []>("op_1253_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1254_cast_fp16 = mul(x = var_1252_cast_fp16, y = var_1253_to_fp16)[name = tensor<string, []>("op_1254_cast_fp16")];
+            tensor<string, []> var_1256_equation_0 = const()[name = tensor<string, []>("op_1256_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1256_cast_fp16 = einsum(equation = var_1256_equation_0, values = (var_1066_cast_fp16, var_968_cast_fp16))[name = tensor<string, []>("op_1256_cast_fp16")];
+            tensor<fp16, []> var_1257_to_fp16 = const()[name = tensor<string, []>("op_1257_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1258_cast_fp16 = mul(x = var_1256_cast_fp16, y = var_1257_to_fp16)[name = tensor<string, []>("op_1258_cast_fp16")];
+            tensor<string, []> var_1260_equation_0 = const()[name = tensor<string, []>("op_1260_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1260_cast_fp16 = einsum(equation = var_1260_equation_0, values = (var_1066_cast_fp16, var_972_cast_fp16))[name = tensor<string, []>("op_1260_cast_fp16")];
+            tensor<fp16, []> var_1261_to_fp16 = const()[name = tensor<string, []>("op_1261_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1262_cast_fp16 = mul(x = var_1260_cast_fp16, y = var_1261_to_fp16)[name = tensor<string, []>("op_1262_cast_fp16")];
+            tensor<string, []> var_1264_equation_0 = const()[name = tensor<string, []>("op_1264_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1264_cast_fp16 = einsum(equation = var_1264_equation_0, values = (var_1066_cast_fp16, var_976_cast_fp16))[name = tensor<string, []>("op_1264_cast_fp16")];
+            tensor<fp16, []> var_1265_to_fp16 = const()[name = tensor<string, []>("op_1265_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1266_cast_fp16 = mul(x = var_1264_cast_fp16, y = var_1265_to_fp16)[name = tensor<string, []>("op_1266_cast_fp16")];
+            tensor<string, []> var_1268_equation_0 = const()[name = tensor<string, []>("op_1268_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1268_cast_fp16 = einsum(equation = var_1268_equation_0, values = (var_1078_cast_fp16, var_980_cast_fp16))[name = tensor<string, []>("op_1268_cast_fp16")];
+            tensor<fp16, []> var_1269_to_fp16 = const()[name = tensor<string, []>("op_1269_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1270_cast_fp16 = mul(x = var_1268_cast_fp16, y = var_1269_to_fp16)[name = tensor<string, []>("op_1270_cast_fp16")];
+            tensor<string, []> var_1272_equation_0 = const()[name = tensor<string, []>("op_1272_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1272_cast_fp16 = einsum(equation = var_1272_equation_0, values = (var_1078_cast_fp16, var_984_cast_fp16))[name = tensor<string, []>("op_1272_cast_fp16")];
+            tensor<fp16, []> var_1273_to_fp16 = const()[name = tensor<string, []>("op_1273_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1274_cast_fp16 = mul(x = var_1272_cast_fp16, y = var_1273_to_fp16)[name = tensor<string, []>("op_1274_cast_fp16")];
+            tensor<string, []> var_1276_equation_0 = const()[name = tensor<string, []>("op_1276_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1276_cast_fp16 = einsum(equation = var_1276_equation_0, values = (var_1078_cast_fp16, var_988_cast_fp16))[name = tensor<string, []>("op_1276_cast_fp16")];
+            tensor<fp16, []> var_1277_to_fp16 = const()[name = tensor<string, []>("op_1277_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1278_cast_fp16 = mul(x = var_1276_cast_fp16, y = var_1277_to_fp16)[name = tensor<string, []>("op_1278_cast_fp16")];
+            tensor<string, []> var_1280_equation_0 = const()[name = tensor<string, []>("op_1280_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1280_cast_fp16 = einsum(equation = var_1280_equation_0, values = (var_1090_cast_fp16, var_992_cast_fp16))[name = tensor<string, []>("op_1280_cast_fp16")];
+            tensor<fp16, []> var_1281_to_fp16 = const()[name = tensor<string, []>("op_1281_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1282_cast_fp16 = mul(x = var_1280_cast_fp16, y = var_1281_to_fp16)[name = tensor<string, []>("op_1282_cast_fp16")];
+            tensor<string, []> var_1284_equation_0 = const()[name = tensor<string, []>("op_1284_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1284_cast_fp16 = einsum(equation = var_1284_equation_0, values = (var_1090_cast_fp16, var_996_cast_fp16))[name = tensor<string, []>("op_1284_cast_fp16")];
+            tensor<fp16, []> var_1285_to_fp16 = const()[name = tensor<string, []>("op_1285_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1286_cast_fp16 = mul(x = var_1284_cast_fp16, y = var_1285_to_fp16)[name = tensor<string, []>("op_1286_cast_fp16")];
+            tensor<string, []> var_1288_equation_0 = const()[name = tensor<string, []>("op_1288_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1288_cast_fp16 = einsum(equation = var_1288_equation_0, values = (var_1090_cast_fp16, var_1000_cast_fp16))[name = tensor<string, []>("op_1288_cast_fp16")];
+            tensor<fp16, []> var_1289_to_fp16 = const()[name = tensor<string, []>("op_1289_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1290_cast_fp16 = mul(x = var_1288_cast_fp16, y = var_1289_to_fp16)[name = tensor<string, []>("op_1290_cast_fp16")];
+            tensor<string, []> var_1292_equation_0 = const()[name = tensor<string, []>("op_1292_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1292_cast_fp16 = einsum(equation = var_1292_equation_0, values = (var_1102_cast_fp16, var_1004_cast_fp16))[name = tensor<string, []>("op_1292_cast_fp16")];
+            tensor<fp16, []> var_1293_to_fp16 = const()[name = tensor<string, []>("op_1293_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1294_cast_fp16 = mul(x = var_1292_cast_fp16, y = var_1293_to_fp16)[name = tensor<string, []>("op_1294_cast_fp16")];
+            tensor<string, []> var_1296_equation_0 = const()[name = tensor<string, []>("op_1296_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1296_cast_fp16 = einsum(equation = var_1296_equation_0, values = (var_1102_cast_fp16, var_1008_cast_fp16))[name = tensor<string, []>("op_1296_cast_fp16")];
+            tensor<fp16, []> var_1297_to_fp16 = const()[name = tensor<string, []>("op_1297_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1298_cast_fp16 = mul(x = var_1296_cast_fp16, y = var_1297_to_fp16)[name = tensor<string, []>("op_1298_cast_fp16")];
+            tensor<string, []> var_1300_equation_0 = const()[name = tensor<string, []>("op_1300_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1300_cast_fp16 = einsum(equation = var_1300_equation_0, values = (var_1102_cast_fp16, var_1012_cast_fp16))[name = tensor<string, []>("op_1300_cast_fp16")];
+            tensor<fp16, []> var_1301_to_fp16 = const()[name = tensor<string, []>("op_1301_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1302_cast_fp16 = mul(x = var_1300_cast_fp16, y = var_1301_to_fp16)[name = tensor<string, []>("op_1302_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_49_cast_fp16 = add(x = var_1210_cast_fp16, y = mask)[name = tensor<string, []>("aw_49_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_51_cast_fp16 = add(x = var_1214_cast_fp16, y = mask)[name = tensor<string, []>("aw_51_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_53_cast_fp16 = add(x = var_1218_cast_fp16, y = mask)[name = tensor<string, []>("aw_53_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_55_cast_fp16 = add(x = var_1222_cast_fp16, y = mask)[name = tensor<string, []>("aw_55_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_57_cast_fp16 = add(x = var_1226_cast_fp16, y = mask)[name = tensor<string, []>("aw_57_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_59_cast_fp16 = add(x = var_1230_cast_fp16, y = mask)[name = tensor<string, []>("aw_59_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_61_cast_fp16 = add(x = var_1234_cast_fp16, y = mask)[name = tensor<string, []>("aw_61_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_63_cast_fp16 = add(x = var_1238_cast_fp16, y = mask)[name = tensor<string, []>("aw_63_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_65_cast_fp16 = add(x = var_1242_cast_fp16, y = mask)[name = tensor<string, []>("aw_65_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_67_cast_fp16 = add(x = var_1246_cast_fp16, y = mask)[name = tensor<string, []>("aw_67_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_69_cast_fp16 = add(x = var_1250_cast_fp16, y = mask)[name = tensor<string, []>("aw_69_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_71_cast_fp16 = add(x = var_1254_cast_fp16, y = mask)[name = tensor<string, []>("aw_71_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_73_cast_fp16 = add(x = var_1258_cast_fp16, y = mask)[name = tensor<string, []>("aw_73_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_75_cast_fp16 = add(x = var_1262_cast_fp16, y = mask)[name = tensor<string, []>("aw_75_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_77_cast_fp16 = add(x = var_1266_cast_fp16, y = mask)[name = tensor<string, []>("aw_77_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_79_cast_fp16 = add(x = var_1270_cast_fp16, y = mask)[name = tensor<string, []>("aw_79_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_81_cast_fp16 = add(x = var_1274_cast_fp16, y = mask)[name = tensor<string, []>("aw_81_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_83_cast_fp16 = add(x = var_1278_cast_fp16, y = mask)[name = tensor<string, []>("aw_83_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_85_cast_fp16 = add(x = var_1282_cast_fp16, y = mask)[name = tensor<string, []>("aw_85_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_87_cast_fp16 = add(x = var_1286_cast_fp16, y = mask)[name = tensor<string, []>("aw_87_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_89_cast_fp16 = add(x = var_1290_cast_fp16, y = mask)[name = tensor<string, []>("aw_89_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_91_cast_fp16 = add(x = var_1294_cast_fp16, y = mask)[name = tensor<string, []>("aw_91_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_93_cast_fp16 = add(x = var_1298_cast_fp16, y = mask)[name = tensor<string, []>("aw_93_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_cast_fp16 = add(x = var_1302_cast_fp16, y = mask)[name = tensor<string, []>("aw_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1327_cast_fp16 = softmax(axis = var_779, x = aw_49_cast_fp16)[name = tensor<string, []>("op_1327_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1328_cast_fp16 = softmax(axis = var_779, x = aw_51_cast_fp16)[name = tensor<string, []>("op_1328_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1329_cast_fp16 = softmax(axis = var_779, x = aw_53_cast_fp16)[name = tensor<string, []>("op_1329_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1330_cast_fp16 = softmax(axis = var_779, x = aw_55_cast_fp16)[name = tensor<string, []>("op_1330_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1331_cast_fp16 = softmax(axis = var_779, x = aw_57_cast_fp16)[name = tensor<string, []>("op_1331_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1332_cast_fp16 = softmax(axis = var_779, x = aw_59_cast_fp16)[name = tensor<string, []>("op_1332_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1333_cast_fp16 = softmax(axis = var_779, x = aw_61_cast_fp16)[name = tensor<string, []>("op_1333_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1334_cast_fp16 = softmax(axis = var_779, x = aw_63_cast_fp16)[name = tensor<string, []>("op_1334_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1335_cast_fp16 = softmax(axis = var_779, x = aw_65_cast_fp16)[name = tensor<string, []>("op_1335_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1336_cast_fp16 = softmax(axis = var_779, x = aw_67_cast_fp16)[name = tensor<string, []>("op_1336_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1337_cast_fp16 = softmax(axis = var_779, x = aw_69_cast_fp16)[name = tensor<string, []>("op_1337_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1338_cast_fp16 = softmax(axis = var_779, x = aw_71_cast_fp16)[name = tensor<string, []>("op_1338_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1339_cast_fp16 = softmax(axis = var_779, x = aw_73_cast_fp16)[name = tensor<string, []>("op_1339_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1340_cast_fp16 = softmax(axis = var_779, x = aw_75_cast_fp16)[name = tensor<string, []>("op_1340_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1341_cast_fp16 = softmax(axis = var_779, x = aw_77_cast_fp16)[name = tensor<string, []>("op_1341_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1342_cast_fp16 = softmax(axis = var_779, x = aw_79_cast_fp16)[name = tensor<string, []>("op_1342_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1343_cast_fp16 = softmax(axis = var_779, x = aw_81_cast_fp16)[name = tensor<string, []>("op_1343_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1344_cast_fp16 = softmax(axis = var_779, x = aw_83_cast_fp16)[name = tensor<string, []>("op_1344_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1345_cast_fp16 = softmax(axis = var_779, x = aw_85_cast_fp16)[name = tensor<string, []>("op_1345_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1346_cast_fp16 = softmax(axis = var_779, x = aw_87_cast_fp16)[name = tensor<string, []>("op_1346_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1347_cast_fp16 = softmax(axis = var_779, x = aw_89_cast_fp16)[name = tensor<string, []>("op_1347_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1348_cast_fp16 = softmax(axis = var_779, x = aw_91_cast_fp16)[name = tensor<string, []>("op_1348_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1349_cast_fp16 = softmax(axis = var_779, x = aw_93_cast_fp16)[name = tensor<string, []>("op_1349_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1350_cast_fp16 = softmax(axis = var_779, x = aw_cast_fp16)[name = tensor<string, []>("op_1350_cast_fp16")];
+            tensor<string, []> var_1352_equation_0 = const()[name = tensor<string, []>("op_1352_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1352_cast_fp16 = einsum(equation = var_1352_equation_0, values = (var_1112_cast_fp16, var_1327_cast_fp16))[name = tensor<string, []>("op_1352_cast_fp16")];
+            tensor<string, []> var_1354_equation_0 = const()[name = tensor<string, []>("op_1354_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1354_cast_fp16 = einsum(equation = var_1354_equation_0, values = (var_1112_cast_fp16, var_1328_cast_fp16))[name = tensor<string, []>("op_1354_cast_fp16")];
+            tensor<string, []> var_1356_equation_0 = const()[name = tensor<string, []>("op_1356_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1356_cast_fp16 = einsum(equation = var_1356_equation_0, values = (var_1112_cast_fp16, var_1329_cast_fp16))[name = tensor<string, []>("op_1356_cast_fp16")];
+            tensor<string, []> var_1358_equation_0 = const()[name = tensor<string, []>("op_1358_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1358_cast_fp16 = einsum(equation = var_1358_equation_0, values = (var_1124_cast_fp16, var_1330_cast_fp16))[name = tensor<string, []>("op_1358_cast_fp16")];
+            tensor<string, []> var_1360_equation_0 = const()[name = tensor<string, []>("op_1360_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1360_cast_fp16 = einsum(equation = var_1360_equation_0, values = (var_1124_cast_fp16, var_1331_cast_fp16))[name = tensor<string, []>("op_1360_cast_fp16")];
+            tensor<string, []> var_1362_equation_0 = const()[name = tensor<string, []>("op_1362_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1362_cast_fp16 = einsum(equation = var_1362_equation_0, values = (var_1124_cast_fp16, var_1332_cast_fp16))[name = tensor<string, []>("op_1362_cast_fp16")];
+            tensor<string, []> var_1364_equation_0 = const()[name = tensor<string, []>("op_1364_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1364_cast_fp16 = einsum(equation = var_1364_equation_0, values = (var_1136_cast_fp16, var_1333_cast_fp16))[name = tensor<string, []>("op_1364_cast_fp16")];
+            tensor<string, []> var_1366_equation_0 = const()[name = tensor<string, []>("op_1366_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1366_cast_fp16 = einsum(equation = var_1366_equation_0, values = (var_1136_cast_fp16, var_1334_cast_fp16))[name = tensor<string, []>("op_1366_cast_fp16")];
+            tensor<string, []> var_1368_equation_0 = const()[name = tensor<string, []>("op_1368_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1368_cast_fp16 = einsum(equation = var_1368_equation_0, values = (var_1136_cast_fp16, var_1335_cast_fp16))[name = tensor<string, []>("op_1368_cast_fp16")];
+            tensor<string, []> var_1370_equation_0 = const()[name = tensor<string, []>("op_1370_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1370_cast_fp16 = einsum(equation = var_1370_equation_0, values = (var_1148_cast_fp16, var_1336_cast_fp16))[name = tensor<string, []>("op_1370_cast_fp16")];
+            tensor<string, []> var_1372_equation_0 = const()[name = tensor<string, []>("op_1372_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1372_cast_fp16 = einsum(equation = var_1372_equation_0, values = (var_1148_cast_fp16, var_1337_cast_fp16))[name = tensor<string, []>("op_1372_cast_fp16")];
+            tensor<string, []> var_1374_equation_0 = const()[name = tensor<string, []>("op_1374_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1374_cast_fp16 = einsum(equation = var_1374_equation_0, values = (var_1148_cast_fp16, var_1338_cast_fp16))[name = tensor<string, []>("op_1374_cast_fp16")];
+            tensor<string, []> var_1376_equation_0 = const()[name = tensor<string, []>("op_1376_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1376_cast_fp16 = einsum(equation = var_1376_equation_0, values = (var_1160_cast_fp16, var_1339_cast_fp16))[name = tensor<string, []>("op_1376_cast_fp16")];
+            tensor<string, []> var_1378_equation_0 = const()[name = tensor<string, []>("op_1378_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1378_cast_fp16 = einsum(equation = var_1378_equation_0, values = (var_1160_cast_fp16, var_1340_cast_fp16))[name = tensor<string, []>("op_1378_cast_fp16")];
+            tensor<string, []> var_1380_equation_0 = const()[name = tensor<string, []>("op_1380_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1380_cast_fp16 = einsum(equation = var_1380_equation_0, values = (var_1160_cast_fp16, var_1341_cast_fp16))[name = tensor<string, []>("op_1380_cast_fp16")];
+            tensor<string, []> var_1382_equation_0 = const()[name = tensor<string, []>("op_1382_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1382_cast_fp16 = einsum(equation = var_1382_equation_0, values = (var_1172_cast_fp16, var_1342_cast_fp16))[name = tensor<string, []>("op_1382_cast_fp16")];
+            tensor<string, []> var_1384_equation_0 = const()[name = tensor<string, []>("op_1384_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1384_cast_fp16 = einsum(equation = var_1384_equation_0, values = (var_1172_cast_fp16, var_1343_cast_fp16))[name = tensor<string, []>("op_1384_cast_fp16")];
+            tensor<string, []> var_1386_equation_0 = const()[name = tensor<string, []>("op_1386_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1386_cast_fp16 = einsum(equation = var_1386_equation_0, values = (var_1172_cast_fp16, var_1344_cast_fp16))[name = tensor<string, []>("op_1386_cast_fp16")];
+            tensor<string, []> var_1388_equation_0 = const()[name = tensor<string, []>("op_1388_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1388_cast_fp16 = einsum(equation = var_1388_equation_0, values = (var_1184_cast_fp16, var_1345_cast_fp16))[name = tensor<string, []>("op_1388_cast_fp16")];
+            tensor<string, []> var_1390_equation_0 = const()[name = tensor<string, []>("op_1390_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1390_cast_fp16 = einsum(equation = var_1390_equation_0, values = (var_1184_cast_fp16, var_1346_cast_fp16))[name = tensor<string, []>("op_1390_cast_fp16")];
+            tensor<string, []> var_1392_equation_0 = const()[name = tensor<string, []>("op_1392_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1392_cast_fp16 = einsum(equation = var_1392_equation_0, values = (var_1184_cast_fp16, var_1347_cast_fp16))[name = tensor<string, []>("op_1392_cast_fp16")];
+            tensor<string, []> var_1394_equation_0 = const()[name = tensor<string, []>("op_1394_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1394_cast_fp16 = einsum(equation = var_1394_equation_0, values = (var_1196_cast_fp16, var_1348_cast_fp16))[name = tensor<string, []>("op_1394_cast_fp16")];
+            tensor<string, []> var_1396_equation_0 = const()[name = tensor<string, []>("op_1396_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1396_cast_fp16 = einsum(equation = var_1396_equation_0, values = (var_1196_cast_fp16, var_1349_cast_fp16))[name = tensor<string, []>("op_1396_cast_fp16")];
+            tensor<string, []> var_1398_equation_0 = const()[name = tensor<string, []>("op_1398_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1398_cast_fp16 = einsum(equation = var_1398_equation_0, values = (var_1196_cast_fp16, var_1350_cast_fp16))[name = tensor<string, []>("op_1398_cast_fp16")];
+            tensor<bool, []> x_27_interleave_0 = const()[name = tensor<string, []>("x_27_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_27_cast_fp16 = concat(axis = var_779, interleave = x_27_interleave_0, values = (var_1352_cast_fp16, var_1354_cast_fp16, var_1356_cast_fp16, var_1358_cast_fp16, var_1360_cast_fp16, var_1362_cast_fp16, var_1364_cast_fp16, var_1366_cast_fp16, var_1368_cast_fp16, var_1370_cast_fp16, var_1372_cast_fp16, var_1374_cast_fp16, var_1376_cast_fp16, var_1378_cast_fp16, var_1380_cast_fp16, var_1382_cast_fp16, var_1384_cast_fp16, var_1386_cast_fp16, var_1388_cast_fp16, var_1390_cast_fp16, var_1392_cast_fp16, var_1394_cast_fp16, var_1396_cast_fp16, var_1398_cast_fp16))[name = tensor<string, []>("x_27_cast_fp16")];
+            tensor<int32, [4]> var_1403 = const()[name = tensor<string, []>("op_1403"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_13_cast_fp16 = reshape(shape = var_1403, x = x_27_cast_fp16)[name = tensor<string, []>("input_13_cast_fp16")];
+            tensor<int32, [2]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1408 = const()[name = tensor<string, []>("op_1408"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_pad_type_0 = const()[name = tensor<string, []>("attention_output_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_pad_0 = const()[name = tensor<string, []>("attention_output_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(232803776)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_cast_fp16 = conv(dilations = var_1408, groups = var_779, pad = attention_output_pad_0, pad_type = attention_output_pad_type_0, strides = var_1406, weight = blocks_1_attn_proj_weight_to_fp16, x = input_13_cast_fp16)[name = tensor<string, []>("attention_output_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_29_cast_fp16 = add(x = attention_output_cast_fp16, y = x_17_cast_fp16)[name = tensor<string, []>("x_29_cast_fp16")];
+            tensor<bool, []> x_eps_interleave_0 = const()[name = tensor<string, []>("x_eps_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_to_fp16 = const()[name = tensor<string, []>("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678208)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_cast_fp16 = concat(axis = var_779, interleave = x_eps_interleave_0, values = (x_29_cast_fp16, eps_chan_to_fp16))[name = tensor<string, []>("x_eps_cast_fp16")];
+            tensor<int32, [1]> norm_x_axes_0 = const()[name = tensor<string, []>("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_782, x = x_eps_cast_fp16)[name = tensor<string, []>("norm_x_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_19_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_cast_fp16)[name = tensor<string, []>("x_normed_19_cast_fp16")];
+            tensor<fp16, []> var_1434_to_fp16 = const()[name = tensor<string, []>("op_1434_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_1434_to_fp16)[name = tensor<string, []>("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678400)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_15_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = tensor<string, []>("input_15_cast_fp16")];
+            tensor<int32, [2]> var_1445 = const()[name = tensor<string, []>("op_1445"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1447 = const()[name = tensor<string, []>("op_1447"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_17_pad_type_0 = const()[name = tensor<string, []>("input_17_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_17_pad_0 = const()[name = tensor<string, []>("input_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251684608)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_17_cast_fp16 = conv(dilations = var_1447, groups = var_779, pad = input_17_pad_0, pad_type = input_17_pad_type_0, strides = var_1445, weight = blocks_1_mlp_fc_1_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("input_17_cast_fp16")];
+            tensor<int32, [2]> var_1451 = const()[name = tensor<string, []>("op_1451"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1453 = const()[name = tensor<string, []>("op_1453"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_pad_0 = const()[name = tensor<string, []>("x_fc_2_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(302016320)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_cast_fp16 = conv(dilations = var_1453, groups = var_779, pad = x_fc_2_pad_0, pad_type = x_fc_2_pad_type_0, strides = var_1451, weight = blocks_1_mlp_fc_2_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_1456_cast_fp16 = silu(x = input_17_cast_fp16)[name = tensor<string, []>("op_1456_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_cast_fp16 = mul(x = var_1456_cast_fp16, y = x_fc_2_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
+            tensor<int32, [2]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1463_pad_type_0 = const()[name = tensor<string, []>("op_1463_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1463_pad_0 = const()[name = tensor<string, []>("op_1463_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_1_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(352348032)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_1463_cast_fp16 = conv(dilations = var_1461, groups = var_779, pad = var_1463_pad_0, pad_type = var_1463_pad_type_0, strides = var_1459, weight = blocks_1_mlp_proj_weight_to_fp16, x = input_cast_fp16)[name = tensor<string, []>("op_1463_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> new_x = add(x = var_1463_cast_fp16, y = x_29_cast_fp16)[name = tensor<string, []>("op_1464_cast_fp16")];
+        } -> (new_x, new_k_cache_0, new_v_cache_0, new_k_cache_1, new_v_cache_1);
+}
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk15.mlmodelc/weights/weight.bin b/Llama-3.2-3B-Instruct_chunk15.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9fb4cdb8f6ee656a85f4d01e3c9a1c1ca0e66302
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk15.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2cb9d6d8288946add36334ed30dca14912099e940d2f8989248bac042f9112d1
+size 402679744
diff --git a/Llama-3.2-3B-Instruct_chunk16.mlmodelc/analytics/coremldata.bin b/Llama-3.2-3B-Instruct_chunk16.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cbeeb3500b890cb12054b285c60ccf293726d4b5
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk16.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:099d892d9754805b9d755f8a563efcf8322cc8319dee028d51b62ca558115cb5
+size 243
diff --git a/Llama-3.2-3B-Instruct_chunk16.mlmodelc/coremldata.bin b/Llama-3.2-3B-Instruct_chunk16.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..baa88d80a77c4c03df2caf2dc6b2b21a213555e2
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk16.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:351176279ef47a4a1ba4a1c69135c7c59e5181f28a16fe3d47f04bd2a80c5863
+size 501
diff --git a/Llama-3.2-3B-Instruct_chunk16.mlmodelc/metadata.json b/Llama-3.2-3B-Instruct_chunk16.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..2ad09b4845167243f7fc4b556669fb7d84f08c3b
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk16.mlmodelc/metadata.json
@@ -0,0 +1,134 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 16384)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 16384]",
+        "name" : "logits_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 16384)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 16384]",
+        "name" : "logits_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 16384)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 16384]",
+        "name" : "logits_2",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 16384)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 16384]",
+        "name" : "logits_3",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 16384)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 16384]",
+        "name" : "logits_4",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 16384)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 16384]",
+        "name" : "logits_5",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 16384)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 16384]",
+        "name" : "logits_6",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 13568)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 13568]",
+        "name" : "logits_7",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 7,
+    "mlProgramOperationTypeHistogram" : {
+      "Concat" : 1,
+      "Ios16.mul" : 2,
+      "Squeeze" : 1,
+      "Transpose" : 1,
+      "Ios16.reshape" : 10,
+      "Ios16.matmul" : 8,
+      "Ios16.realDiv" : 1,
+      "Ios16.reduceL2Norm" : 1
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "13.0",
+      "tvOS" : "16.0",
+      "visionOS" : "1.0",
+      "watchOS" : "9.0",
+      "iOS" : "16.0",
+      "macCatalyst" : "16.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.source" : "torch==2.1.0",
+      "com.github.apple.coremltools.version" : "8.0b1"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "x",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "Llama_3_2_3B_Instruct_2024_11_09_16_14_37_chunk16",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk16.mlmodelc/model.mil b/Llama-3.2-3B-Instruct_chunk16.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..42266367a5b8a44218c5b7d2e5cc7770217a6081
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk16.mlmodelc/model.mil
@@ -0,0 +1,74 @@
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3304.5.2"}, {"coremlc-version", "3304.6.2"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
+{
+    func main<ios16>(tensor<fp16, [1, 3072, 8, 8]> x) {
+            tensor<bool, []> var_6 = const()[name = tensor<string, []>("op_6"), val = tensor<bool, []>(true)];
+            tensor<int32, []> var_9 = const()[name = tensor<string, []>("op_9"), val = tensor<int32, []>(1)];
+            tensor<bool, []> x_eps_interleave_0 = const()[name = tensor<string, []>("x_eps_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_to_fp16 = const()[name = tensor<string, []>("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_cast_fp16 = concat(axis = var_9, interleave = x_eps_interleave_0, values = (x, eps_chan_to_fp16))[name = tensor<string, []>("x_eps_cast_fp16")];
+            tensor<int32, [1]> norm_x_axes_0 = const()[name = tensor<string, []>("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_6, x = x_eps_cast_fp16)[name = tensor<string, []>("norm_x_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_cast_fp16)[name = tensor<string, []>("x_normed_1_cast_fp16")];
+            tensor<fp16, []> var_34_to_fp16 = const()[name = tensor<string, []>("op_34_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_34_to_fp16)[name = tensor<string, []>("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> ln_f_weight_to_fp16 = const()[name = tensor<string, []>("ln_f_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(256)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = ln_f_weight_to_fp16)[name = tensor<string, []>("x_5_cast_fp16")];
+            tensor<int32, [4]> var_48 = const()[name = tensor<string, []>("op_48"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> x_cast_fp16 = reshape(shape = var_48, x = x_5_cast_fp16)[name = tensor<string, []>("x_cast_fp16")];
+            tensor<int32, [1]> var_51_axes_0 = const()[name = tensor<string, []>("op_51_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 3072, 64]> var_51_cast_fp16 = squeeze(axes = var_51_axes_0, x = x_cast_fp16)[name = tensor<string, []>("op_51_cast_fp16")];
+            tensor<int32, [3]> var_54_perm_0 = const()[name = tensor<string, []>("op_54_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [2]> concat_4 = const()[name = tensor<string, []>("concat_4"), val = tensor<int32, [2]>([64, 3072])];
+            tensor<fp16, [1, 64, 3072]> var_54_cast_fp16 = transpose(perm = var_54_perm_0, x = var_51_cast_fp16)[name = tensor<string, []>("transpose_16")];
+            tensor<fp16, [64, 3072]> reshape_0_cast_fp16 = reshape(shape = concat_4, x = var_54_cast_fp16)[name = tensor<string, []>("reshape_0_cast_fp16")];
+            tensor<bool, []> matmul_0_transpose_x_0 = const()[name = tensor<string, []>("matmul_0_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> matmul_0_transpose_y_0 = const()[name = tensor<string, []>("matmul_0_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [3072, 16384]> transpose_1_to_fp16 = const()[name = tensor<string, []>("transpose_1_to_fp16"), val = tensor<fp16, [3072, 16384]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6464)))];
+            tensor<fp16, [64, 16384]> matmul_0_cast_fp16 = matmul(transpose_x = matmul_0_transpose_x_0, transpose_y = matmul_0_transpose_y_0, x = reshape_0_cast_fp16, y = transpose_1_to_fp16)[name = tensor<string, []>("matmul_0_cast_fp16")];
+            tensor<int32, [3]> concat_8 = const()[name = tensor<string, []>("concat_8"), val = tensor<int32, [3]>([1, 64, 16384])];
+            tensor<fp16, [1, 64, 16384]> logits_0 = reshape(shape = concat_8, x = matmul_0_cast_fp16)[name = tensor<string, []>("reshape_2_cast_fp16")];
+            tensor<bool, []> matmul_1_transpose_x_0 = const()[name = tensor<string, []>("matmul_1_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> matmul_1_transpose_y_0 = const()[name = tensor<string, []>("matmul_1_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [3072, 16384]> transpose_3_to_fp16 = const()[name = tensor<string, []>("transpose_3_to_fp16"), val = tensor<fp16, [3072, 16384]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(100669824)))];
+            tensor<fp16, [64, 16384]> matmul_1_cast_fp16 = matmul(transpose_x = matmul_1_transpose_x_0, transpose_y = matmul_1_transpose_y_0, x = reshape_0_cast_fp16, y = transpose_3_to_fp16)[name = tensor<string, []>("matmul_1_cast_fp16")];
+            tensor<int32, [3]> concat_16 = const()[name = tensor<string, []>("concat_16"), val = tensor<int32, [3]>([1, 64, 16384])];
+            tensor<fp16, [1, 64, 16384]> logits_1 = reshape(shape = concat_16, x = matmul_1_cast_fp16)[name = tensor<string, []>("reshape_5_cast_fp16")];
+            tensor<bool, []> matmul_2_transpose_x_0 = const()[name = tensor<string, []>("matmul_2_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> matmul_2_transpose_y_0 = const()[name = tensor<string, []>("matmul_2_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [3072, 16384]> transpose_5_to_fp16 = const()[name = tensor<string, []>("transpose_5_to_fp16"), val = tensor<fp16, [3072, 16384]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201333184)))];
+            tensor<fp16, [64, 16384]> matmul_2_cast_fp16 = matmul(transpose_x = matmul_2_transpose_x_0, transpose_y = matmul_2_transpose_y_0, x = reshape_0_cast_fp16, y = transpose_5_to_fp16)[name = tensor<string, []>("matmul_2_cast_fp16")];
+            tensor<int32, [3]> concat_24 = const()[name = tensor<string, []>("concat_24"), val = tensor<int32, [3]>([1, 64, 16384])];
+            tensor<fp16, [1, 64, 16384]> logits_2 = reshape(shape = concat_24, x = matmul_2_cast_fp16)[name = tensor<string, []>("reshape_8_cast_fp16")];
+            tensor<bool, []> matmul_3_transpose_x_0 = const()[name = tensor<string, []>("matmul_3_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> matmul_3_transpose_y_0 = const()[name = tensor<string, []>("matmul_3_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [3072, 16384]> transpose_7_to_fp16 = const()[name = tensor<string, []>("transpose_7_to_fp16"), val = tensor<fp16, [3072, 16384]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(301996544)))];
+            tensor<fp16, [64, 16384]> matmul_3_cast_fp16 = matmul(transpose_x = matmul_3_transpose_x_0, transpose_y = matmul_3_transpose_y_0, x = reshape_0_cast_fp16, y = transpose_7_to_fp16)[name = tensor<string, []>("matmul_3_cast_fp16")];
+            tensor<int32, [3]> concat_32 = const()[name = tensor<string, []>("concat_32"), val = tensor<int32, [3]>([1, 64, 16384])];
+            tensor<fp16, [1, 64, 16384]> logits_3 = reshape(shape = concat_32, x = matmul_3_cast_fp16)[name = tensor<string, []>("reshape_11_cast_fp16")];
+            tensor<bool, []> matmul_4_transpose_x_0 = const()[name = tensor<string, []>("matmul_4_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> matmul_4_transpose_y_0 = const()[name = tensor<string, []>("matmul_4_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [3072, 16384]> transpose_9_to_fp16 = const()[name = tensor<string, []>("transpose_9_to_fp16"), val = tensor<fp16, [3072, 16384]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(402659904)))];
+            tensor<fp16, [64, 16384]> matmul_4_cast_fp16 = matmul(transpose_x = matmul_4_transpose_x_0, transpose_y = matmul_4_transpose_y_0, x = reshape_0_cast_fp16, y = transpose_9_to_fp16)[name = tensor<string, []>("matmul_4_cast_fp16")];
+            tensor<int32, [3]> concat_40 = const()[name = tensor<string, []>("concat_40"), val = tensor<int32, [3]>([1, 64, 16384])];
+            tensor<fp16, [1, 64, 16384]> logits_4 = reshape(shape = concat_40, x = matmul_4_cast_fp16)[name = tensor<string, []>("reshape_14_cast_fp16")];
+            tensor<bool, []> matmul_5_transpose_x_0 = const()[name = tensor<string, []>("matmul_5_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> matmul_5_transpose_y_0 = const()[name = tensor<string, []>("matmul_5_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [3072, 16384]> transpose_11_to_fp16 = const()[name = tensor<string, []>("transpose_11_to_fp16"), val = tensor<fp16, [3072, 16384]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(503323264)))];
+            tensor<fp16, [64, 16384]> matmul_5_cast_fp16 = matmul(transpose_x = matmul_5_transpose_x_0, transpose_y = matmul_5_transpose_y_0, x = reshape_0_cast_fp16, y = transpose_11_to_fp16)[name = tensor<string, []>("matmul_5_cast_fp16")];
+            tensor<int32, [3]> concat_48 = const()[name = tensor<string, []>("concat_48"), val = tensor<int32, [3]>([1, 64, 16384])];
+            tensor<fp16, [1, 64, 16384]> logits_5 = reshape(shape = concat_48, x = matmul_5_cast_fp16)[name = tensor<string, []>("reshape_17_cast_fp16")];
+            tensor<bool, []> matmul_6_transpose_x_0 = const()[name = tensor<string, []>("matmul_6_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> matmul_6_transpose_y_0 = const()[name = tensor<string, []>("matmul_6_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [3072, 16384]> transpose_13_to_fp16 = const()[name = tensor<string, []>("transpose_13_to_fp16"), val = tensor<fp16, [3072, 16384]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(603986624)))];
+            tensor<fp16, [64, 16384]> matmul_6_cast_fp16 = matmul(transpose_x = matmul_6_transpose_x_0, transpose_y = matmul_6_transpose_y_0, x = reshape_0_cast_fp16, y = transpose_13_to_fp16)[name = tensor<string, []>("matmul_6_cast_fp16")];
+            tensor<int32, [3]> concat_56 = const()[name = tensor<string, []>("concat_56"), val = tensor<int32, [3]>([1, 64, 16384])];
+            tensor<fp16, [1, 64, 16384]> logits_6 = reshape(shape = concat_56, x = matmul_6_cast_fp16)[name = tensor<string, []>("reshape_20_cast_fp16")];
+            tensor<bool, []> matmul_7_transpose_x_0 = const()[name = tensor<string, []>("matmul_7_transpose_x_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> matmul_7_transpose_y_0 = const()[name = tensor<string, []>("matmul_7_transpose_y_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [3072, 13568]> transpose_15_to_fp16 = const()[name = tensor<string, []>("transpose_15_to_fp16"), val = tensor<fp16, [3072, 13568]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(704649984)))];
+            tensor<fp16, [64, 13568]> matmul_7_cast_fp16 = matmul(transpose_x = matmul_7_transpose_x_0, transpose_y = matmul_7_transpose_y_0, x = reshape_0_cast_fp16, y = transpose_15_to_fp16)[name = tensor<string, []>("matmul_7_cast_fp16")];
+            tensor<int32, [3]> concat_64 = const()[name = tensor<string, []>("concat_64"), val = tensor<int32, [3]>([1, 64, 13568])];
+            tensor<fp16, [1, 64, 13568]> logits_7 = reshape(shape = concat_64, x = matmul_7_cast_fp16)[name = tensor<string, []>("reshape_23_cast_fp16")];
+        } -> (logits_0, logits_1, logits_2, logits_3, logits_4, logits_5, logits_6, logits_7);
+}
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk16.mlmodelc/weights/weight.bin b/Llama-3.2-3B-Instruct_chunk16.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6c17545ff590d111e236c588b117c45f2c80f0dc
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk16.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5fd01e5605ef4a5bebc9bcafc5514e012c4c605231c065a1d634fd9fc66df411
+size 788011840
diff --git a/Llama-3.2-3B-Instruct_chunk2.mlmodelc/analytics/coremldata.bin b/Llama-3.2-3B-Instruct_chunk2.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a63af39cde8e590e41fffd270ab8aede737490d
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk2.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf21e446e7587de3fd840eae95f3e79729298df568725552f7ef5fd8f954e58c
+size 243
diff --git a/Llama-3.2-3B-Instruct_chunk2.mlmodelc/coremldata.bin b/Llama-3.2-3B-Instruct_chunk2.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3fed05170d981b8582c9421ec7550f748512caf2
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk2.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55b45f96f9ba201e16f197a78412041f41d2ac869df9ad95ef03af7662e7d940
+size 653
diff --git a/Llama-3.2-3B-Instruct_chunk2.mlmodelc/metadata.json b/Llama-3.2-3B-Instruct_chunk2.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..d412b526b988fdf6d178a4c78fced9a569ebadcf
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk2.mlmodelc/metadata.json
@@ -0,0 +1,178 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "new_x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 7,
+    "mlProgramOperationTypeHistogram" : {
+      "Concat" : 14,
+      "Ios16.mul" : 70,
+      "SliceByIndex" : 88,
+      "Transpose" : 2,
+      "Ios16.einsum" : 96,
+      "Ios16.conv" : 14,
+      "Ios16.add" : 56,
+      "Ios16.realDiv" : 4,
+      "Ios16.softmax" : 48,
+      "Ios16.reduceL2Norm" : 4,
+      "Ios16.reshape" : 14,
+      "Ios16.silu" : 2
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "13.0",
+      "tvOS" : "16.0",
+      "visionOS" : "1.0",
+      "watchOS" : "9.0",
+      "iOS" : "16.0",
+      "macCatalyst" : "16.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.source" : "torch==2.1.0",
+      "com.github.apple.coremltools.version" : "8.0b1"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "cos",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "sin",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 512 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 512, 1, 64]",
+        "name" : "mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "Llama_3_2_3B_Instruct_2024_11_09_16_14_37_chunk2",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk2.mlmodelc/model.mil b/Llama-3.2-3B-Instruct_chunk2.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..78594b4291dc45ae43652f9a31200581b19ad3c6
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk2.mlmodelc/model.mil
@@ -0,0 +1,956 @@
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3304.5.2"}, {"coremlc-version", "3304.6.2"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
+{
+    func main<ios16>(tensor<fp16, [128, 64]> cos, tensor<fp16, [1, 448, 1, 1024]> k_cache_0, tensor<fp16, [1, 448, 1, 1024]> k_cache_1, tensor<fp16, [1, 512, 1, 64]> mask, tensor<fp16, [128, 64]> sin, tensor<fp16, [1, 1024, 1, 448]> v_cache_0, tensor<fp16, [1, 1024, 1, 448]> v_cache_1, tensor<fp16, [1, 3072, 8, 8]> x) [CoreML_InputDefaultValues = dict<tensor<string, []>, tensor<fp32, []>>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}})] {
+            tensor<int32, []> var_13 = const()[name = tensor<string, []>("op_13"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_19 = const()[name = tensor<string, []>("op_19"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_55 = const()[name = tensor<string, []>("op_55"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_1_interleave_0 = const()[name = tensor<string, []>("x_eps_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_1_to_fp16 = const()[name = tensor<string, []>("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_1_cast_fp16 = concat(axis = var_52, interleave = x_eps_1_interleave_0, values = (x, eps_chan_1_to_fp16))[name = tensor<string, []>("x_eps_1_cast_fp16")];
+            tensor<int32, [1]> norm_x_1_axes_0 = const()[name = tensor<string, []>("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_55, x = x_eps_1_cast_fp16)[name = tensor<string, []>("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = tensor<string, []>("x_normed_1_cast_fp16")];
+            tensor<fp16, []> var_79_to_fp16 = const()[name = tensor<string, []>("op_79_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_79_to_fp16)[name = tensor<string, []>("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(256)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = tensor<string, []>("x_5_cast_fp16")];
+            tensor<int32, [4]> var_100 = const()[name = tensor<string, []>("op_100"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_1_cast_fp16 = reshape(shape = var_100, x = x_5_cast_fp16)[name = tensor<string, []>("input_1_cast_fp16")];
+            tensor<int32, [2]> var_103 = const()[name = tensor<string, []>("op_103"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_105 = const()[name = tensor<string, []>("op_105"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_1_pad_type_0 = const()[name = tensor<string, []>("q_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_1_pad_0 = const()[name = tensor<string, []>("q_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6464)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_1_cast_fp16 = conv(dilations = var_105, groups = var_52, pad = q_1_pad_0, pad_type = q_1_pad_type_0, strides = var_103, weight = blocks_0_attn_q_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("q_1_cast_fp16")];
+            tensor<int32, [2]> var_109 = const()[name = tensor<string, []>("op_109"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_111 = const()[name = tensor<string, []>("op_111"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_1_pad_type_0 = const()[name = tensor<string, []>("k_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_1_pad_0 = const()[name = tensor<string, []>("k_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18880896)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_1_cast_fp16 = conv(dilations = var_111, groups = var_52, pad = k_1_pad_0, pad_type = k_1_pad_type_0, strides = var_109, weight = blocks_0_attn_k_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("k_1_cast_fp16")];
+            tensor<int32, [2]> var_115 = const()[name = tensor<string, []>("op_115"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_117 = const()[name = tensor<string, []>("op_117"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_1_pad_type_0 = const()[name = tensor<string, []>("v_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_1_pad_0 = const()[name = tensor<string, []>("v_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25172416)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_1_cast_fp16 = conv(dilations = var_117, groups = var_52, pad = v_1_pad_0, pad_type = v_1_pad_type_0, strides = var_115, weight = blocks_0_attn_v_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("v_1_cast_fp16")];
+            tensor<int32, [4]> var_120 = const()[name = tensor<string, []>("op_120"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_3_cast_fp16 = reshape(shape = var_120, x = q_1_cast_fp16)[name = tensor<string, []>("q_3_cast_fp16")];
+            tensor<int32, [4]> var_122 = const()[name = tensor<string, []>("op_122"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_3_cast_fp16 = reshape(shape = var_122, x = k_1_cast_fp16)[name = tensor<string, []>("k_3_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = tensor<string, []>("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = tensor<string, []>("op_136_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = tensor<string, []>("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = tensor<string, []>("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = tensor<string, []>("op_142_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = tensor<string, []>("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_142_cast_fp16")];
+            tensor<fp16, []> const_10_promoted_to_fp16 = const()[name = tensor<string, []>("const_10_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_10_promoted_to_fp16)[name = tensor<string, []>("op_144_cast_fp16")];
+            tensor<bool, []> rotated_1_interleave_0 = const()[name = tensor<string, []>("rotated_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_1_cast_fp16 = concat(axis = var_17, interleave = rotated_1_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = tensor<string, []>("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_147_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = tensor<string, []>("op_147_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_148_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = tensor<string, []>("op_148_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_1_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = tensor<string, []>("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_161_begin_0 = const()[name = tensor<string, []>("op_161_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_161_end_0 = const()[name = tensor<string, []>("op_161_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_161_end_mask_0 = const()[name = tensor<string, []>("op_161_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_161_cast_fp16 = slice_by_index(begin = var_161_begin_0, end = var_161_end_0, end_mask = var_161_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_161_cast_fp16")];
+            tensor<int32, [4]> var_167_begin_0 = const()[name = tensor<string, []>("op_167_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_167_end_0 = const()[name = tensor<string, []>("op_167_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_167_end_mask_0 = const()[name = tensor<string, []>("op_167_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_167_cast_fp16 = slice_by_index(begin = var_167_begin_0, end = var_167_end_0, end_mask = var_167_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_167_cast_fp16")];
+            tensor<fp16, []> const_12_promoted_to_fp16 = const()[name = tensor<string, []>("const_12_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_169_cast_fp16 = mul(x = var_167_cast_fp16, y = const_12_promoted_to_fp16)[name = tensor<string, []>("op_169_cast_fp16")];
+            tensor<bool, []> rotated_3_interleave_0 = const()[name = tensor<string, []>("rotated_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_3_cast_fp16 = concat(axis = var_17, interleave = rotated_3_interleave_0, values = (var_169_cast_fp16, var_161_cast_fp16))[name = tensor<string, []>("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_172_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = tensor<string, []>("op_172_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_173_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = tensor<string, []>("op_173_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_3_cast_fp16 = add(x = var_172_cast_fp16, y = var_173_cast_fp16)[name = tensor<string, []>("roped_3_cast_fp16")];
+            tensor<int32, [4]> var_176 = const()[name = tensor<string, []>("op_176"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_7_cast_fp16 = reshape(shape = var_176, x = roped_3_cast_fp16)[name = tensor<string, []>("k_7_cast_fp16")];
+            tensor<int32, [4]> var_178 = const()[name = tensor<string, []>("op_178"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_0 = reshape(shape = var_178, x = v_1_cast_fp16)[name = tensor<string, []>("new_v_cache_0_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_11_interleave_0 = const()[name = tensor<string, []>("k_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_0 = transpose(perm = k_9_perm_0, x = k_7_cast_fp16)[name = tensor<string, []>("transpose_1")];
+            tensor<fp16, [1, 512, 1, 1024]> k_11_cast_fp16 = concat(axis = var_19, interleave = k_11_interleave_0, values = (k_cache_0, new_k_cache_0))[name = tensor<string, []>("k_11_cast_fp16")];
+            tensor<bool, []> v_7_interleave_0 = const()[name = tensor<string, []>("v_7_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_7_cast_fp16 = concat(axis = var_13, interleave = v_7_interleave_0, values = (v_cache_0, new_v_cache_0))[name = tensor<string, []>("v_7_cast_fp16")];
+            tensor<int32, [4]> var_186 = const()[name = tensor<string, []>("op_186"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_7_cast_fp16 = reshape(shape = var_186, x = roped_1_cast_fp16)[name = tensor<string, []>("q_7_cast_fp16")];
+            tensor<int32, [4]> var_191_begin_0 = const()[name = tensor<string, []>("op_191_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_191_end_0 = const()[name = tensor<string, []>("op_191_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_191_end_mask_0 = const()[name = tensor<string, []>("op_191_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_191_cast_fp16 = slice_by_index(begin = var_191_begin_0, end = var_191_end_0, end_mask = var_191_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_191_cast_fp16")];
+            tensor<int32, [4]> var_195_begin_0 = const()[name = tensor<string, []>("op_195_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_195_end_0 = const()[name = tensor<string, []>("op_195_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_195_end_mask_0 = const()[name = tensor<string, []>("op_195_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_195_cast_fp16 = slice_by_index(begin = var_195_begin_0, end = var_195_end_0, end_mask = var_195_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_195_cast_fp16")];
+            tensor<int32, [4]> var_199_begin_0 = const()[name = tensor<string, []>("op_199_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_199_end_0 = const()[name = tensor<string, []>("op_199_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_199_end_mask_0 = const()[name = tensor<string, []>("op_199_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_199_cast_fp16 = slice_by_index(begin = var_199_begin_0, end = var_199_end_0, end_mask = var_199_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_199_cast_fp16")];
+            tensor<int32, [4]> var_203_begin_0 = const()[name = tensor<string, []>("op_203_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_203_end_0 = const()[name = tensor<string, []>("op_203_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_203_end_mask_0 = const()[name = tensor<string, []>("op_203_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_203_cast_fp16 = slice_by_index(begin = var_203_begin_0, end = var_203_end_0, end_mask = var_203_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_203_cast_fp16")];
+            tensor<int32, [4]> var_207_begin_0 = const()[name = tensor<string, []>("op_207_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_207_end_0 = const()[name = tensor<string, []>("op_207_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_207_end_mask_0 = const()[name = tensor<string, []>("op_207_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_207_cast_fp16 = slice_by_index(begin = var_207_begin_0, end = var_207_end_0, end_mask = var_207_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_207_cast_fp16")];
+            tensor<int32, [4]> var_211_begin_0 = const()[name = tensor<string, []>("op_211_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_211_end_0 = const()[name = tensor<string, []>("op_211_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_211_end_mask_0 = const()[name = tensor<string, []>("op_211_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_211_cast_fp16 = slice_by_index(begin = var_211_begin_0, end = var_211_end_0, end_mask = var_211_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_211_cast_fp16")];
+            tensor<int32, [4]> var_215_begin_0 = const()[name = tensor<string, []>("op_215_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_215_end_0 = const()[name = tensor<string, []>("op_215_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_215_end_mask_0 = const()[name = tensor<string, []>("op_215_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_215_cast_fp16 = slice_by_index(begin = var_215_begin_0, end = var_215_end_0, end_mask = var_215_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_215_cast_fp16")];
+            tensor<int32, [4]> var_219_begin_0 = const()[name = tensor<string, []>("op_219_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_219_end_0 = const()[name = tensor<string, []>("op_219_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_219_end_mask_0 = const()[name = tensor<string, []>("op_219_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_219_cast_fp16 = slice_by_index(begin = var_219_begin_0, end = var_219_end_0, end_mask = var_219_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_219_cast_fp16")];
+            tensor<int32, [4]> var_223_begin_0 = const()[name = tensor<string, []>("op_223_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_223_end_0 = const()[name = tensor<string, []>("op_223_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_223_end_mask_0 = const()[name = tensor<string, []>("op_223_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_223_cast_fp16 = slice_by_index(begin = var_223_begin_0, end = var_223_end_0, end_mask = var_223_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_223_cast_fp16")];
+            tensor<int32, [4]> var_227_begin_0 = const()[name = tensor<string, []>("op_227_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_227_end_0 = const()[name = tensor<string, []>("op_227_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_227_end_mask_0 = const()[name = tensor<string, []>("op_227_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_227_cast_fp16 = slice_by_index(begin = var_227_begin_0, end = var_227_end_0, end_mask = var_227_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_227_cast_fp16")];
+            tensor<int32, [4]> var_231_begin_0 = const()[name = tensor<string, []>("op_231_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_231_end_0 = const()[name = tensor<string, []>("op_231_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_231_end_mask_0 = const()[name = tensor<string, []>("op_231_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_231_cast_fp16 = slice_by_index(begin = var_231_begin_0, end = var_231_end_0, end_mask = var_231_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_231_cast_fp16")];
+            tensor<int32, [4]> var_235_begin_0 = const()[name = tensor<string, []>("op_235_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_235_end_0 = const()[name = tensor<string, []>("op_235_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_235_end_mask_0 = const()[name = tensor<string, []>("op_235_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_235_cast_fp16 = slice_by_index(begin = var_235_begin_0, end = var_235_end_0, end_mask = var_235_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_235_cast_fp16")];
+            tensor<int32, [4]> var_239_begin_0 = const()[name = tensor<string, []>("op_239_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_239_end_0 = const()[name = tensor<string, []>("op_239_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_239_end_mask_0 = const()[name = tensor<string, []>("op_239_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_239_cast_fp16 = slice_by_index(begin = var_239_begin_0, end = var_239_end_0, end_mask = var_239_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_239_cast_fp16")];
+            tensor<int32, [4]> var_243_begin_0 = const()[name = tensor<string, []>("op_243_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_243_end_0 = const()[name = tensor<string, []>("op_243_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_243_end_mask_0 = const()[name = tensor<string, []>("op_243_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_243_cast_fp16 = slice_by_index(begin = var_243_begin_0, end = var_243_end_0, end_mask = var_243_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_243_cast_fp16")];
+            tensor<int32, [4]> var_247_begin_0 = const()[name = tensor<string, []>("op_247_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_247_end_0 = const()[name = tensor<string, []>("op_247_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_247_end_mask_0 = const()[name = tensor<string, []>("op_247_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_247_cast_fp16 = slice_by_index(begin = var_247_begin_0, end = var_247_end_0, end_mask = var_247_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_247_cast_fp16")];
+            tensor<int32, [4]> var_251_begin_0 = const()[name = tensor<string, []>("op_251_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_251_end_0 = const()[name = tensor<string, []>("op_251_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_251_end_mask_0 = const()[name = tensor<string, []>("op_251_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_251_cast_fp16 = slice_by_index(begin = var_251_begin_0, end = var_251_end_0, end_mask = var_251_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_251_cast_fp16")];
+            tensor<int32, [4]> var_255_begin_0 = const()[name = tensor<string, []>("op_255_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_255_end_0 = const()[name = tensor<string, []>("op_255_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_255_end_mask_0 = const()[name = tensor<string, []>("op_255_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_255_cast_fp16 = slice_by_index(begin = var_255_begin_0, end = var_255_end_0, end_mask = var_255_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_255_cast_fp16")];
+            tensor<int32, [4]> var_259_begin_0 = const()[name = tensor<string, []>("op_259_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_259_end_0 = const()[name = tensor<string, []>("op_259_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_259_end_mask_0 = const()[name = tensor<string, []>("op_259_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_259_cast_fp16 = slice_by_index(begin = var_259_begin_0, end = var_259_end_0, end_mask = var_259_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_259_cast_fp16")];
+            tensor<int32, [4]> var_263_begin_0 = const()[name = tensor<string, []>("op_263_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_263_end_0 = const()[name = tensor<string, []>("op_263_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_263_end_mask_0 = const()[name = tensor<string, []>("op_263_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_263_cast_fp16 = slice_by_index(begin = var_263_begin_0, end = var_263_end_0, end_mask = var_263_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_263_cast_fp16")];
+            tensor<int32, [4]> var_267_begin_0 = const()[name = tensor<string, []>("op_267_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_267_end_0 = const()[name = tensor<string, []>("op_267_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_267_end_mask_0 = const()[name = tensor<string, []>("op_267_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_267_cast_fp16 = slice_by_index(begin = var_267_begin_0, end = var_267_end_0, end_mask = var_267_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_267_cast_fp16")];
+            tensor<int32, [4]> var_271_begin_0 = const()[name = tensor<string, []>("op_271_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_271_end_0 = const()[name = tensor<string, []>("op_271_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_271_end_mask_0 = const()[name = tensor<string, []>("op_271_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_271_cast_fp16 = slice_by_index(begin = var_271_begin_0, end = var_271_end_0, end_mask = var_271_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_271_cast_fp16")];
+            tensor<int32, [4]> var_275_begin_0 = const()[name = tensor<string, []>("op_275_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_275_end_0 = const()[name = tensor<string, []>("op_275_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_275_end_mask_0 = const()[name = tensor<string, []>("op_275_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_275_cast_fp16 = slice_by_index(begin = var_275_begin_0, end = var_275_end_0, end_mask = var_275_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_275_cast_fp16")];
+            tensor<int32, [4]> var_279_begin_0 = const()[name = tensor<string, []>("op_279_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_279_end_0 = const()[name = tensor<string, []>("op_279_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_279_end_mask_0 = const()[name = tensor<string, []>("op_279_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_279_cast_fp16 = slice_by_index(begin = var_279_begin_0, end = var_279_end_0, end_mask = var_279_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_279_cast_fp16")];
+            tensor<int32, [4]> var_283_begin_0 = const()[name = tensor<string, []>("op_283_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_283_end_0 = const()[name = tensor<string, []>("op_283_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_283_end_mask_0 = const()[name = tensor<string, []>("op_283_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_283_cast_fp16 = slice_by_index(begin = var_283_begin_0, end = var_283_end_0, end_mask = var_283_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_283_cast_fp16")];
+            tensor<int32, [4]> var_289_begin_0 = const()[name = tensor<string, []>("op_289_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_289_end_0 = const()[name = tensor<string, []>("op_289_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_289_end_mask_0 = const()[name = tensor<string, []>("op_289_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_289_cast_fp16 = slice_by_index(begin = var_289_begin_0, end = var_289_end_0, end_mask = var_289_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_289_cast_fp16")];
+            tensor<int32, [4]> var_301_begin_0 = const()[name = tensor<string, []>("op_301_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_301_end_0 = const()[name = tensor<string, []>("op_301_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_301_end_mask_0 = const()[name = tensor<string, []>("op_301_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_301_cast_fp16 = slice_by_index(begin = var_301_begin_0, end = var_301_end_0, end_mask = var_301_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_301_cast_fp16")];
+            tensor<int32, [4]> var_313_begin_0 = const()[name = tensor<string, []>("op_313_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_313_end_0 = const()[name = tensor<string, []>("op_313_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_313_end_mask_0 = const()[name = tensor<string, []>("op_313_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_313_cast_fp16 = slice_by_index(begin = var_313_begin_0, end = var_313_end_0, end_mask = var_313_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_313_cast_fp16")];
+            tensor<int32, [4]> var_325_begin_0 = const()[name = tensor<string, []>("op_325_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_325_end_0 = const()[name = tensor<string, []>("op_325_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_325_end_mask_0 = const()[name = tensor<string, []>("op_325_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_325_cast_fp16 = slice_by_index(begin = var_325_begin_0, end = var_325_end_0, end_mask = var_325_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_325_cast_fp16")];
+            tensor<int32, [4]> var_337_begin_0 = const()[name = tensor<string, []>("op_337_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_337_end_0 = const()[name = tensor<string, []>("op_337_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_337_end_mask_0 = const()[name = tensor<string, []>("op_337_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_337_cast_fp16 = slice_by_index(begin = var_337_begin_0, end = var_337_end_0, end_mask = var_337_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_337_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = tensor<string, []>("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = tensor<string, []>("op_349_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = tensor<string, []>("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_349_cast_fp16")];
+            tensor<int32, [4]> var_361_begin_0 = const()[name = tensor<string, []>("op_361_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_361_end_0 = const()[name = tensor<string, []>("op_361_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_361_end_mask_0 = const()[name = tensor<string, []>("op_361_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_361_cast_fp16 = slice_by_index(begin = var_361_begin_0, end = var_361_end_0, end_mask = var_361_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_361_cast_fp16")];
+            tensor<int32, [4]> var_373_begin_0 = const()[name = tensor<string, []>("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_373_end_0 = const()[name = tensor<string, []>("op_373_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_373_end_mask_0 = const()[name = tensor<string, []>("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_373_cast_fp16")];
+            tensor<int32, [4]> var_383_begin_0 = const()[name = tensor<string, []>("op_383_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_383_end_0 = const()[name = tensor<string, []>("op_383_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_383_end_mask_0 = const()[name = tensor<string, []>("op_383_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_383_cast_fp16 = slice_by_index(begin = var_383_begin_0, end = var_383_end_0, end_mask = var_383_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_383_cast_fp16")];
+            tensor<int32, [4]> var_395_begin_0 = const()[name = tensor<string, []>("op_395_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_395_end_0 = const()[name = tensor<string, []>("op_395_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_395_end_mask_0 = const()[name = tensor<string, []>("op_395_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_395_cast_fp16 = slice_by_index(begin = var_395_begin_0, end = var_395_end_0, end_mask = var_395_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_395_cast_fp16")];
+            tensor<int32, [4]> var_407_begin_0 = const()[name = tensor<string, []>("op_407_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_407_end_0 = const()[name = tensor<string, []>("op_407_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_407_end_mask_0 = const()[name = tensor<string, []>("op_407_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_407_cast_fp16 = slice_by_index(begin = var_407_begin_0, end = var_407_end_0, end_mask = var_407_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_407_cast_fp16")];
+            tensor<int32, [4]> var_419_begin_0 = const()[name = tensor<string, []>("op_419_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_419_end_0 = const()[name = tensor<string, []>("op_419_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_419_end_mask_0 = const()[name = tensor<string, []>("op_419_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_419_cast_fp16 = slice_by_index(begin = var_419_begin_0, end = var_419_end_0, end_mask = var_419_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_419_cast_fp16")];
+            tensor<int32, [4]> var_431_begin_0 = const()[name = tensor<string, []>("op_431_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_431_end_0 = const()[name = tensor<string, []>("op_431_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_431_end_mask_0 = const()[name = tensor<string, []>("op_431_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_431_cast_fp16 = slice_by_index(begin = var_431_begin_0, end = var_431_end_0, end_mask = var_431_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_431_cast_fp16")];
+            tensor<int32, [4]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_443_cast_fp16 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_443_cast_fp16")];
+            tensor<int32, [4]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_455_cast_fp16 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_455_cast_fp16")];
+            tensor<int32, [4]> var_467_begin_0 = const()[name = tensor<string, []>("op_467_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_467_end_0 = const()[name = tensor<string, []>("op_467_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_467_end_mask_0 = const()[name = tensor<string, []>("op_467_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_467_cast_fp16 = slice_by_index(begin = var_467_begin_0, end = var_467_end_0, end_mask = var_467_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_467_cast_fp16")];
+            tensor<string, []> var_479_equation_0 = const()[name = tensor<string, []>("op_479_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_479_cast_fp16 = einsum(equation = var_479_equation_0, values = (var_289_cast_fp16, var_191_cast_fp16))[name = tensor<string, []>("op_479_cast_fp16")];
+            tensor<fp16, []> var_480_to_fp16 = const()[name = tensor<string, []>("op_480_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_481_cast_fp16 = mul(x = var_479_cast_fp16, y = var_480_to_fp16)[name = tensor<string, []>("op_481_cast_fp16")];
+            tensor<string, []> var_483_equation_0 = const()[name = tensor<string, []>("op_483_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_483_cast_fp16 = einsum(equation = var_483_equation_0, values = (var_289_cast_fp16, var_195_cast_fp16))[name = tensor<string, []>("op_483_cast_fp16")];
+            tensor<fp16, []> var_484_to_fp16 = const()[name = tensor<string, []>("op_484_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_485_cast_fp16 = mul(x = var_483_cast_fp16, y = var_484_to_fp16)[name = tensor<string, []>("op_485_cast_fp16")];
+            tensor<string, []> var_487_equation_0 = const()[name = tensor<string, []>("op_487_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_487_cast_fp16 = einsum(equation = var_487_equation_0, values = (var_289_cast_fp16, var_199_cast_fp16))[name = tensor<string, []>("op_487_cast_fp16")];
+            tensor<fp16, []> var_488_to_fp16 = const()[name = tensor<string, []>("op_488_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_489_cast_fp16 = mul(x = var_487_cast_fp16, y = var_488_to_fp16)[name = tensor<string, []>("op_489_cast_fp16")];
+            tensor<string, []> var_491_equation_0 = const()[name = tensor<string, []>("op_491_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_491_cast_fp16 = einsum(equation = var_491_equation_0, values = (var_301_cast_fp16, var_203_cast_fp16))[name = tensor<string, []>("op_491_cast_fp16")];
+            tensor<fp16, []> var_492_to_fp16 = const()[name = tensor<string, []>("op_492_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_493_cast_fp16 = mul(x = var_491_cast_fp16, y = var_492_to_fp16)[name = tensor<string, []>("op_493_cast_fp16")];
+            tensor<string, []> var_495_equation_0 = const()[name = tensor<string, []>("op_495_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_495_cast_fp16 = einsum(equation = var_495_equation_0, values = (var_301_cast_fp16, var_207_cast_fp16))[name = tensor<string, []>("op_495_cast_fp16")];
+            tensor<fp16, []> var_496_to_fp16 = const()[name = tensor<string, []>("op_496_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_497_cast_fp16 = mul(x = var_495_cast_fp16, y = var_496_to_fp16)[name = tensor<string, []>("op_497_cast_fp16")];
+            tensor<string, []> var_499_equation_0 = const()[name = tensor<string, []>("op_499_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_499_cast_fp16 = einsum(equation = var_499_equation_0, values = (var_301_cast_fp16, var_211_cast_fp16))[name = tensor<string, []>("op_499_cast_fp16")];
+            tensor<fp16, []> var_500_to_fp16 = const()[name = tensor<string, []>("op_500_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_501_cast_fp16 = mul(x = var_499_cast_fp16, y = var_500_to_fp16)[name = tensor<string, []>("op_501_cast_fp16")];
+            tensor<string, []> var_503_equation_0 = const()[name = tensor<string, []>("op_503_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_503_cast_fp16 = einsum(equation = var_503_equation_0, values = (var_313_cast_fp16, var_215_cast_fp16))[name = tensor<string, []>("op_503_cast_fp16")];
+            tensor<fp16, []> var_504_to_fp16 = const()[name = tensor<string, []>("op_504_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_505_cast_fp16 = mul(x = var_503_cast_fp16, y = var_504_to_fp16)[name = tensor<string, []>("op_505_cast_fp16")];
+            tensor<string, []> var_507_equation_0 = const()[name = tensor<string, []>("op_507_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_507_cast_fp16 = einsum(equation = var_507_equation_0, values = (var_313_cast_fp16, var_219_cast_fp16))[name = tensor<string, []>("op_507_cast_fp16")];
+            tensor<fp16, []> var_508_to_fp16 = const()[name = tensor<string, []>("op_508_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_509_cast_fp16 = mul(x = var_507_cast_fp16, y = var_508_to_fp16)[name = tensor<string, []>("op_509_cast_fp16")];
+            tensor<string, []> var_511_equation_0 = const()[name = tensor<string, []>("op_511_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_511_cast_fp16 = einsum(equation = var_511_equation_0, values = (var_313_cast_fp16, var_223_cast_fp16))[name = tensor<string, []>("op_511_cast_fp16")];
+            tensor<fp16, []> var_512_to_fp16 = const()[name = tensor<string, []>("op_512_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_513_cast_fp16 = mul(x = var_511_cast_fp16, y = var_512_to_fp16)[name = tensor<string, []>("op_513_cast_fp16")];
+            tensor<string, []> var_515_equation_0 = const()[name = tensor<string, []>("op_515_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_515_cast_fp16 = einsum(equation = var_515_equation_0, values = (var_325_cast_fp16, var_227_cast_fp16))[name = tensor<string, []>("op_515_cast_fp16")];
+            tensor<fp16, []> var_516_to_fp16 = const()[name = tensor<string, []>("op_516_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_517_cast_fp16 = mul(x = var_515_cast_fp16, y = var_516_to_fp16)[name = tensor<string, []>("op_517_cast_fp16")];
+            tensor<string, []> var_519_equation_0 = const()[name = tensor<string, []>("op_519_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_519_cast_fp16 = einsum(equation = var_519_equation_0, values = (var_325_cast_fp16, var_231_cast_fp16))[name = tensor<string, []>("op_519_cast_fp16")];
+            tensor<fp16, []> var_520_to_fp16 = const()[name = tensor<string, []>("op_520_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_521_cast_fp16 = mul(x = var_519_cast_fp16, y = var_520_to_fp16)[name = tensor<string, []>("op_521_cast_fp16")];
+            tensor<string, []> var_523_equation_0 = const()[name = tensor<string, []>("op_523_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_523_cast_fp16 = einsum(equation = var_523_equation_0, values = (var_325_cast_fp16, var_235_cast_fp16))[name = tensor<string, []>("op_523_cast_fp16")];
+            tensor<fp16, []> var_524_to_fp16 = const()[name = tensor<string, []>("op_524_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_525_cast_fp16 = mul(x = var_523_cast_fp16, y = var_524_to_fp16)[name = tensor<string, []>("op_525_cast_fp16")];
+            tensor<string, []> var_527_equation_0 = const()[name = tensor<string, []>("op_527_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_527_cast_fp16 = einsum(equation = var_527_equation_0, values = (var_337_cast_fp16, var_239_cast_fp16))[name = tensor<string, []>("op_527_cast_fp16")];
+            tensor<fp16, []> var_528_to_fp16 = const()[name = tensor<string, []>("op_528_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_529_cast_fp16 = mul(x = var_527_cast_fp16, y = var_528_to_fp16)[name = tensor<string, []>("op_529_cast_fp16")];
+            tensor<string, []> var_531_equation_0 = const()[name = tensor<string, []>("op_531_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_531_cast_fp16 = einsum(equation = var_531_equation_0, values = (var_337_cast_fp16, var_243_cast_fp16))[name = tensor<string, []>("op_531_cast_fp16")];
+            tensor<fp16, []> var_532_to_fp16 = const()[name = tensor<string, []>("op_532_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_533_cast_fp16 = mul(x = var_531_cast_fp16, y = var_532_to_fp16)[name = tensor<string, []>("op_533_cast_fp16")];
+            tensor<string, []> var_535_equation_0 = const()[name = tensor<string, []>("op_535_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_535_cast_fp16 = einsum(equation = var_535_equation_0, values = (var_337_cast_fp16, var_247_cast_fp16))[name = tensor<string, []>("op_535_cast_fp16")];
+            tensor<fp16, []> var_536_to_fp16 = const()[name = tensor<string, []>("op_536_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_537_cast_fp16 = mul(x = var_535_cast_fp16, y = var_536_to_fp16)[name = tensor<string, []>("op_537_cast_fp16")];
+            tensor<string, []> var_539_equation_0 = const()[name = tensor<string, []>("op_539_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_539_cast_fp16 = einsum(equation = var_539_equation_0, values = (var_349_cast_fp16, var_251_cast_fp16))[name = tensor<string, []>("op_539_cast_fp16")];
+            tensor<fp16, []> var_540_to_fp16 = const()[name = tensor<string, []>("op_540_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_541_cast_fp16 = mul(x = var_539_cast_fp16, y = var_540_to_fp16)[name = tensor<string, []>("op_541_cast_fp16")];
+            tensor<string, []> var_543_equation_0 = const()[name = tensor<string, []>("op_543_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_543_cast_fp16 = einsum(equation = var_543_equation_0, values = (var_349_cast_fp16, var_255_cast_fp16))[name = tensor<string, []>("op_543_cast_fp16")];
+            tensor<fp16, []> var_544_to_fp16 = const()[name = tensor<string, []>("op_544_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_545_cast_fp16 = mul(x = var_543_cast_fp16, y = var_544_to_fp16)[name = tensor<string, []>("op_545_cast_fp16")];
+            tensor<string, []> var_547_equation_0 = const()[name = tensor<string, []>("op_547_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_547_cast_fp16 = einsum(equation = var_547_equation_0, values = (var_349_cast_fp16, var_259_cast_fp16))[name = tensor<string, []>("op_547_cast_fp16")];
+            tensor<fp16, []> var_548_to_fp16 = const()[name = tensor<string, []>("op_548_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_549_cast_fp16 = mul(x = var_547_cast_fp16, y = var_548_to_fp16)[name = tensor<string, []>("op_549_cast_fp16")];
+            tensor<string, []> var_551_equation_0 = const()[name = tensor<string, []>("op_551_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_551_cast_fp16 = einsum(equation = var_551_equation_0, values = (var_361_cast_fp16, var_263_cast_fp16))[name = tensor<string, []>("op_551_cast_fp16")];
+            tensor<fp16, []> var_552_to_fp16 = const()[name = tensor<string, []>("op_552_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_553_cast_fp16 = mul(x = var_551_cast_fp16, y = var_552_to_fp16)[name = tensor<string, []>("op_553_cast_fp16")];
+            tensor<string, []> var_555_equation_0 = const()[name = tensor<string, []>("op_555_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_555_cast_fp16 = einsum(equation = var_555_equation_0, values = (var_361_cast_fp16, var_267_cast_fp16))[name = tensor<string, []>("op_555_cast_fp16")];
+            tensor<fp16, []> var_556_to_fp16 = const()[name = tensor<string, []>("op_556_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_557_cast_fp16 = mul(x = var_555_cast_fp16, y = var_556_to_fp16)[name = tensor<string, []>("op_557_cast_fp16")];
+            tensor<string, []> var_559_equation_0 = const()[name = tensor<string, []>("op_559_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_559_cast_fp16 = einsum(equation = var_559_equation_0, values = (var_361_cast_fp16, var_271_cast_fp16))[name = tensor<string, []>("op_559_cast_fp16")];
+            tensor<fp16, []> var_560_to_fp16 = const()[name = tensor<string, []>("op_560_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_561_cast_fp16 = mul(x = var_559_cast_fp16, y = var_560_to_fp16)[name = tensor<string, []>("op_561_cast_fp16")];
+            tensor<string, []> var_563_equation_0 = const()[name = tensor<string, []>("op_563_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_563_cast_fp16 = einsum(equation = var_563_equation_0, values = (var_373_cast_fp16, var_275_cast_fp16))[name = tensor<string, []>("op_563_cast_fp16")];
+            tensor<fp16, []> var_564_to_fp16 = const()[name = tensor<string, []>("op_564_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_565_cast_fp16 = mul(x = var_563_cast_fp16, y = var_564_to_fp16)[name = tensor<string, []>("op_565_cast_fp16")];
+            tensor<string, []> var_567_equation_0 = const()[name = tensor<string, []>("op_567_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_567_cast_fp16 = einsum(equation = var_567_equation_0, values = (var_373_cast_fp16, var_279_cast_fp16))[name = tensor<string, []>("op_567_cast_fp16")];
+            tensor<fp16, []> var_568_to_fp16 = const()[name = tensor<string, []>("op_568_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_569_cast_fp16 = mul(x = var_567_cast_fp16, y = var_568_to_fp16)[name = tensor<string, []>("op_569_cast_fp16")];
+            tensor<string, []> var_571_equation_0 = const()[name = tensor<string, []>("op_571_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_571_cast_fp16 = einsum(equation = var_571_equation_0, values = (var_373_cast_fp16, var_283_cast_fp16))[name = tensor<string, []>("op_571_cast_fp16")];
+            tensor<fp16, []> var_572_to_fp16 = const()[name = tensor<string, []>("op_572_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_573_cast_fp16 = mul(x = var_571_cast_fp16, y = var_572_to_fp16)[name = tensor<string, []>("op_573_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_1_cast_fp16 = add(x = var_481_cast_fp16, y = mask)[name = tensor<string, []>("aw_1_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_3_cast_fp16 = add(x = var_485_cast_fp16, y = mask)[name = tensor<string, []>("aw_3_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_5_cast_fp16 = add(x = var_489_cast_fp16, y = mask)[name = tensor<string, []>("aw_5_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_7_cast_fp16 = add(x = var_493_cast_fp16, y = mask)[name = tensor<string, []>("aw_7_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_9_cast_fp16 = add(x = var_497_cast_fp16, y = mask)[name = tensor<string, []>("aw_9_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_11_cast_fp16 = add(x = var_501_cast_fp16, y = mask)[name = tensor<string, []>("aw_11_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_13_cast_fp16 = add(x = var_505_cast_fp16, y = mask)[name = tensor<string, []>("aw_13_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_15_cast_fp16 = add(x = var_509_cast_fp16, y = mask)[name = tensor<string, []>("aw_15_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_17_cast_fp16 = add(x = var_513_cast_fp16, y = mask)[name = tensor<string, []>("aw_17_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_19_cast_fp16 = add(x = var_517_cast_fp16, y = mask)[name = tensor<string, []>("aw_19_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_21_cast_fp16 = add(x = var_521_cast_fp16, y = mask)[name = tensor<string, []>("aw_21_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_23_cast_fp16 = add(x = var_525_cast_fp16, y = mask)[name = tensor<string, []>("aw_23_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_25_cast_fp16 = add(x = var_529_cast_fp16, y = mask)[name = tensor<string, []>("aw_25_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_27_cast_fp16 = add(x = var_533_cast_fp16, y = mask)[name = tensor<string, []>("aw_27_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_29_cast_fp16 = add(x = var_537_cast_fp16, y = mask)[name = tensor<string, []>("aw_29_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_31_cast_fp16 = add(x = var_541_cast_fp16, y = mask)[name = tensor<string, []>("aw_31_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_33_cast_fp16 = add(x = var_545_cast_fp16, y = mask)[name = tensor<string, []>("aw_33_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_35_cast_fp16 = add(x = var_549_cast_fp16, y = mask)[name = tensor<string, []>("aw_35_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_37_cast_fp16 = add(x = var_553_cast_fp16, y = mask)[name = tensor<string, []>("aw_37_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_39_cast_fp16 = add(x = var_557_cast_fp16, y = mask)[name = tensor<string, []>("aw_39_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_41_cast_fp16 = add(x = var_561_cast_fp16, y = mask)[name = tensor<string, []>("aw_41_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_43_cast_fp16 = add(x = var_565_cast_fp16, y = mask)[name = tensor<string, []>("aw_43_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_45_cast_fp16 = add(x = var_569_cast_fp16, y = mask)[name = tensor<string, []>("aw_45_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_47_cast_fp16 = add(x = var_573_cast_fp16, y = mask)[name = tensor<string, []>("aw_47_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_598_cast_fp16 = softmax(axis = var_52, x = aw_1_cast_fp16)[name = tensor<string, []>("op_598_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_599_cast_fp16 = softmax(axis = var_52, x = aw_3_cast_fp16)[name = tensor<string, []>("op_599_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_600_cast_fp16 = softmax(axis = var_52, x = aw_5_cast_fp16)[name = tensor<string, []>("op_600_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_601_cast_fp16 = softmax(axis = var_52, x = aw_7_cast_fp16)[name = tensor<string, []>("op_601_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_602_cast_fp16 = softmax(axis = var_52, x = aw_9_cast_fp16)[name = tensor<string, []>("op_602_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_603_cast_fp16 = softmax(axis = var_52, x = aw_11_cast_fp16)[name = tensor<string, []>("op_603_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_604_cast_fp16 = softmax(axis = var_52, x = aw_13_cast_fp16)[name = tensor<string, []>("op_604_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_605_cast_fp16 = softmax(axis = var_52, x = aw_15_cast_fp16)[name = tensor<string, []>("op_605_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_606_cast_fp16 = softmax(axis = var_52, x = aw_17_cast_fp16)[name = tensor<string, []>("op_606_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_607_cast_fp16 = softmax(axis = var_52, x = aw_19_cast_fp16)[name = tensor<string, []>("op_607_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_608_cast_fp16 = softmax(axis = var_52, x = aw_21_cast_fp16)[name = tensor<string, []>("op_608_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_609_cast_fp16 = softmax(axis = var_52, x = aw_23_cast_fp16)[name = tensor<string, []>("op_609_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_610_cast_fp16 = softmax(axis = var_52, x = aw_25_cast_fp16)[name = tensor<string, []>("op_610_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_611_cast_fp16 = softmax(axis = var_52, x = aw_27_cast_fp16)[name = tensor<string, []>("op_611_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_612_cast_fp16 = softmax(axis = var_52, x = aw_29_cast_fp16)[name = tensor<string, []>("op_612_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_613_cast_fp16 = softmax(axis = var_52, x = aw_31_cast_fp16)[name = tensor<string, []>("op_613_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_614_cast_fp16 = softmax(axis = var_52, x = aw_33_cast_fp16)[name = tensor<string, []>("op_614_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_615_cast_fp16 = softmax(axis = var_52, x = aw_35_cast_fp16)[name = tensor<string, []>("op_615_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_616_cast_fp16 = softmax(axis = var_52, x = aw_37_cast_fp16)[name = tensor<string, []>("op_616_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_617_cast_fp16 = softmax(axis = var_52, x = aw_39_cast_fp16)[name = tensor<string, []>("op_617_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_618_cast_fp16 = softmax(axis = var_52, x = aw_41_cast_fp16)[name = tensor<string, []>("op_618_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_619_cast_fp16 = softmax(axis = var_52, x = aw_43_cast_fp16)[name = tensor<string, []>("op_619_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_620_cast_fp16 = softmax(axis = var_52, x = aw_45_cast_fp16)[name = tensor<string, []>("op_620_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_621_cast_fp16 = softmax(axis = var_52, x = aw_47_cast_fp16)[name = tensor<string, []>("op_621_cast_fp16")];
+            tensor<string, []> var_623_equation_0 = const()[name = tensor<string, []>("op_623_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_623_cast_fp16 = einsum(equation = var_623_equation_0, values = (var_383_cast_fp16, var_598_cast_fp16))[name = tensor<string, []>("op_623_cast_fp16")];
+            tensor<string, []> var_625_equation_0 = const()[name = tensor<string, []>("op_625_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_625_cast_fp16 = einsum(equation = var_625_equation_0, values = (var_383_cast_fp16, var_599_cast_fp16))[name = tensor<string, []>("op_625_cast_fp16")];
+            tensor<string, []> var_627_equation_0 = const()[name = tensor<string, []>("op_627_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_627_cast_fp16 = einsum(equation = var_627_equation_0, values = (var_383_cast_fp16, var_600_cast_fp16))[name = tensor<string, []>("op_627_cast_fp16")];
+            tensor<string, []> var_629_equation_0 = const()[name = tensor<string, []>("op_629_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_629_cast_fp16 = einsum(equation = var_629_equation_0, values = (var_395_cast_fp16, var_601_cast_fp16))[name = tensor<string, []>("op_629_cast_fp16")];
+            tensor<string, []> var_631_equation_0 = const()[name = tensor<string, []>("op_631_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_631_cast_fp16 = einsum(equation = var_631_equation_0, values = (var_395_cast_fp16, var_602_cast_fp16))[name = tensor<string, []>("op_631_cast_fp16")];
+            tensor<string, []> var_633_equation_0 = const()[name = tensor<string, []>("op_633_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_633_cast_fp16 = einsum(equation = var_633_equation_0, values = (var_395_cast_fp16, var_603_cast_fp16))[name = tensor<string, []>("op_633_cast_fp16")];
+            tensor<string, []> var_635_equation_0 = const()[name = tensor<string, []>("op_635_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_635_cast_fp16 = einsum(equation = var_635_equation_0, values = (var_407_cast_fp16, var_604_cast_fp16))[name = tensor<string, []>("op_635_cast_fp16")];
+            tensor<string, []> var_637_equation_0 = const()[name = tensor<string, []>("op_637_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_637_cast_fp16 = einsum(equation = var_637_equation_0, values = (var_407_cast_fp16, var_605_cast_fp16))[name = tensor<string, []>("op_637_cast_fp16")];
+            tensor<string, []> var_639_equation_0 = const()[name = tensor<string, []>("op_639_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_639_cast_fp16 = einsum(equation = var_639_equation_0, values = (var_407_cast_fp16, var_606_cast_fp16))[name = tensor<string, []>("op_639_cast_fp16")];
+            tensor<string, []> var_641_equation_0 = const()[name = tensor<string, []>("op_641_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_641_cast_fp16 = einsum(equation = var_641_equation_0, values = (var_419_cast_fp16, var_607_cast_fp16))[name = tensor<string, []>("op_641_cast_fp16")];
+            tensor<string, []> var_643_equation_0 = const()[name = tensor<string, []>("op_643_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_643_cast_fp16 = einsum(equation = var_643_equation_0, values = (var_419_cast_fp16, var_608_cast_fp16))[name = tensor<string, []>("op_643_cast_fp16")];
+            tensor<string, []> var_645_equation_0 = const()[name = tensor<string, []>("op_645_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_645_cast_fp16 = einsum(equation = var_645_equation_0, values = (var_419_cast_fp16, var_609_cast_fp16))[name = tensor<string, []>("op_645_cast_fp16")];
+            tensor<string, []> var_647_equation_0 = const()[name = tensor<string, []>("op_647_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_647_cast_fp16 = einsum(equation = var_647_equation_0, values = (var_431_cast_fp16, var_610_cast_fp16))[name = tensor<string, []>("op_647_cast_fp16")];
+            tensor<string, []> var_649_equation_0 = const()[name = tensor<string, []>("op_649_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_649_cast_fp16 = einsum(equation = var_649_equation_0, values = (var_431_cast_fp16, var_611_cast_fp16))[name = tensor<string, []>("op_649_cast_fp16")];
+            tensor<string, []> var_651_equation_0 = const()[name = tensor<string, []>("op_651_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_651_cast_fp16 = einsum(equation = var_651_equation_0, values = (var_431_cast_fp16, var_612_cast_fp16))[name = tensor<string, []>("op_651_cast_fp16")];
+            tensor<string, []> var_653_equation_0 = const()[name = tensor<string, []>("op_653_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_653_cast_fp16 = einsum(equation = var_653_equation_0, values = (var_443_cast_fp16, var_613_cast_fp16))[name = tensor<string, []>("op_653_cast_fp16")];
+            tensor<string, []> var_655_equation_0 = const()[name = tensor<string, []>("op_655_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_655_cast_fp16 = einsum(equation = var_655_equation_0, values = (var_443_cast_fp16, var_614_cast_fp16))[name = tensor<string, []>("op_655_cast_fp16")];
+            tensor<string, []> var_657_equation_0 = const()[name = tensor<string, []>("op_657_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_657_cast_fp16 = einsum(equation = var_657_equation_0, values = (var_443_cast_fp16, var_615_cast_fp16))[name = tensor<string, []>("op_657_cast_fp16")];
+            tensor<string, []> var_659_equation_0 = const()[name = tensor<string, []>("op_659_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_659_cast_fp16 = einsum(equation = var_659_equation_0, values = (var_455_cast_fp16, var_616_cast_fp16))[name = tensor<string, []>("op_659_cast_fp16")];
+            tensor<string, []> var_661_equation_0 = const()[name = tensor<string, []>("op_661_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_661_cast_fp16 = einsum(equation = var_661_equation_0, values = (var_455_cast_fp16, var_617_cast_fp16))[name = tensor<string, []>("op_661_cast_fp16")];
+            tensor<string, []> var_663_equation_0 = const()[name = tensor<string, []>("op_663_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_663_cast_fp16 = einsum(equation = var_663_equation_0, values = (var_455_cast_fp16, var_618_cast_fp16))[name = tensor<string, []>("op_663_cast_fp16")];
+            tensor<string, []> var_665_equation_0 = const()[name = tensor<string, []>("op_665_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_665_cast_fp16 = einsum(equation = var_665_equation_0, values = (var_467_cast_fp16, var_619_cast_fp16))[name = tensor<string, []>("op_665_cast_fp16")];
+            tensor<string, []> var_667_equation_0 = const()[name = tensor<string, []>("op_667_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_667_cast_fp16 = einsum(equation = var_667_equation_0, values = (var_467_cast_fp16, var_620_cast_fp16))[name = tensor<string, []>("op_667_cast_fp16")];
+            tensor<string, []> var_669_equation_0 = const()[name = tensor<string, []>("op_669_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_669_cast_fp16 = einsum(equation = var_669_equation_0, values = (var_467_cast_fp16, var_621_cast_fp16))[name = tensor<string, []>("op_669_cast_fp16")];
+            tensor<bool, []> x_11_interleave_0 = const()[name = tensor<string, []>("x_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_11_cast_fp16 = concat(axis = var_52, interleave = x_11_interleave_0, values = (var_623_cast_fp16, var_625_cast_fp16, var_627_cast_fp16, var_629_cast_fp16, var_631_cast_fp16, var_633_cast_fp16, var_635_cast_fp16, var_637_cast_fp16, var_639_cast_fp16, var_641_cast_fp16, var_643_cast_fp16, var_645_cast_fp16, var_647_cast_fp16, var_649_cast_fp16, var_651_cast_fp16, var_653_cast_fp16, var_655_cast_fp16, var_657_cast_fp16, var_659_cast_fp16, var_661_cast_fp16, var_663_cast_fp16, var_665_cast_fp16, var_667_cast_fp16, var_669_cast_fp16))[name = tensor<string, []>("x_11_cast_fp16")];
+            tensor<int32, [4]> var_674 = const()[name = tensor<string, []>("op_674"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_3_cast_fp16 = reshape(shape = var_674, x = x_11_cast_fp16)[name = tensor<string, []>("input_3_cast_fp16")];
+            tensor<int32, [2]> var_677 = const()[name = tensor<string, []>("op_677"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_679 = const()[name = tensor<string, []>("op_679"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_1_pad_type_0 = const()[name = tensor<string, []>("attention_output_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_1_pad_0 = const()[name = tensor<string, []>("attention_output_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31463936)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_1_cast_fp16 = conv(dilations = var_679, groups = var_52, pad = attention_output_1_pad_0, pad_type = attention_output_1_pad_type_0, strides = var_677, weight = blocks_0_attn_proj_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_13_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = tensor<string, []>("x_13_cast_fp16")];
+            tensor<bool, []> x_eps_3_interleave_0 = const()[name = tensor<string, []>("x_eps_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_3_to_fp16 = const()[name = tensor<string, []>("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338368)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_3_cast_fp16 = concat(axis = var_52, interleave = x_eps_3_interleave_0, values = (x_13_cast_fp16, eps_chan_3_to_fp16))[name = tensor<string, []>("x_eps_3_cast_fp16")];
+            tensor<int32, [1]> norm_x_3_axes_0 = const()[name = tensor<string, []>("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_55, x = x_eps_3_cast_fp16)[name = tensor<string, []>("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_7_cast_fp16 = real_div(x = x_13_cast_fp16, y = norm_x_3_cast_fp16)[name = tensor<string, []>("x_normed_7_cast_fp16")];
+            tensor<fp16, []> var_705_to_fp16 = const()[name = tensor<string, []>("op_705_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_705_to_fp16)[name = tensor<string, []>("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338560)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_5_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
+            tensor<int32, [2]> var_716 = const()[name = tensor<string, []>("op_716"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_718 = const()[name = tensor<string, []>("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_7_pad_type_0 = const()[name = tensor<string, []>("input_7_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_7_pad_0 = const()[name = tensor<string, []>("input_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50344768)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_7_cast_fp16 = conv(dilations = var_718, groups = var_52, pad = input_7_pad_0, pad_type = input_7_pad_type_0, strides = var_716, weight = blocks_0_mlp_fc_1_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
+            tensor<int32, [2]> var_722 = const()[name = tensor<string, []>("op_722"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_724 = const()[name = tensor<string, []>("op_724"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_1_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_1_pad_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(100676480)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_1_cast_fp16 = conv(dilations = var_724, groups = var_52, pad = x_fc_2_1_pad_0, pad_type = x_fc_2_1_pad_type_0, strides = var_722, weight = blocks_0_mlp_fc_2_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_727_cast_fp16 = silu(x = input_7_cast_fp16)[name = tensor<string, []>("op_727_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_9_cast_fp16 = mul(x = var_727_cast_fp16, y = x_fc_2_1_cast_fp16)[name = tensor<string, []>("input_9_cast_fp16")];
+            tensor<int32, [2]> var_730 = const()[name = tensor<string, []>("op_730"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_732 = const()[name = tensor<string, []>("op_732"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_734_pad_type_0 = const()[name = tensor<string, []>("op_734_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_734_pad_0 = const()[name = tensor<string, []>("op_734_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_0_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151008192)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_734_cast_fp16 = conv(dilations = var_732, groups = var_52, pad = var_734_pad_0, pad_type = var_734_pad_type_0, strides = var_730, weight = blocks_0_mlp_proj_weight_to_fp16, x = input_9_cast_fp16)[name = tensor<string, []>("op_734_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_17_cast_fp16 = add(x = var_734_cast_fp16, y = x_13_cast_fp16)[name = tensor<string, []>("x_17_cast_fp16")];
+            tensor<int32, []> var_740 = const()[name = tensor<string, []>("op_740"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_744 = const()[name = tensor<string, []>("op_744"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_746 = const()[name = tensor<string, []>("op_746"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_779 = const()[name = tensor<string, []>("op_779"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_782 = const()[name = tensor<string, []>("op_782"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_5_interleave_0 = const()[name = tensor<string, []>("x_eps_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_5_to_fp16 = const()[name = tensor<string, []>("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201339904)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_5_cast_fp16 = concat(axis = var_779, interleave = x_eps_5_interleave_0, values = (x_17_cast_fp16, eps_chan_5_to_fp16))[name = tensor<string, []>("x_eps_5_cast_fp16")];
+            tensor<int32, [1]> norm_x_5_axes_0 = const()[name = tensor<string, []>("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_782, x = x_eps_5_cast_fp16)[name = tensor<string, []>("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_13_cast_fp16 = real_div(x = x_17_cast_fp16, y = norm_x_5_cast_fp16)[name = tensor<string, []>("x_normed_13_cast_fp16")];
+            tensor<fp16, []> var_805_to_fp16 = const()[name = tensor<string, []>("op_805_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_805_to_fp16)[name = tensor<string, []>("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201340096)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_21_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = tensor<string, []>("x_21_cast_fp16")];
+            tensor<int32, [4]> var_829 = const()[name = tensor<string, []>("op_829"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_11_cast_fp16 = reshape(shape = var_829, x = x_21_cast_fp16)[name = tensor<string, []>("input_11_cast_fp16")];
+            tensor<int32, [2]> var_832 = const()[name = tensor<string, []>("op_832"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_834 = const()[name = tensor<string, []>("op_834"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_9_pad_type_0 = const()[name = tensor<string, []>("q_9_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_9_pad_0 = const()[name = tensor<string, []>("q_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201346304)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_9_cast_fp16 = conv(dilations = var_834, groups = var_779, pad = q_9_pad_0, pad_type = q_9_pad_type_0, strides = var_832, weight = blocks_1_attn_q_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("q_9_cast_fp16")];
+            tensor<int32, [2]> var_838 = const()[name = tensor<string, []>("op_838"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_840 = const()[name = tensor<string, []>("op_840"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_13_pad_type_0 = const()[name = tensor<string, []>("k_13_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_13_pad_0 = const()[name = tensor<string, []>("k_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(220220736)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_13_cast_fp16 = conv(dilations = var_840, groups = var_779, pad = k_13_pad_0, pad_type = k_13_pad_type_0, strides = var_838, weight = blocks_1_attn_k_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("k_13_cast_fp16")];
+            tensor<int32, [2]> var_844 = const()[name = tensor<string, []>("op_844"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_846 = const()[name = tensor<string, []>("op_846"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_11_pad_type_0 = const()[name = tensor<string, []>("v_11_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_11_pad_0 = const()[name = tensor<string, []>("v_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226512256)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_11_cast_fp16 = conv(dilations = var_846, groups = var_779, pad = v_11_pad_0, pad_type = v_11_pad_type_0, strides = var_844, weight = blocks_1_attn_v_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("v_11_cast_fp16")];
+            tensor<int32, [4]> var_849 = const()[name = tensor<string, []>("op_849"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_11_cast_fp16 = reshape(shape = var_849, x = q_9_cast_fp16)[name = tensor<string, []>("q_11_cast_fp16")];
+            tensor<int32, [4]> var_851 = const()[name = tensor<string, []>("op_851"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_15_cast_fp16 = reshape(shape = var_851, x = k_13_cast_fp16)[name = tensor<string, []>("k_15_cast_fp16")];
+            tensor<int32, [4]> var_865_begin_0 = const()[name = tensor<string, []>("op_865_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_865_end_0 = const()[name = tensor<string, []>("op_865_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_865_end_mask_0 = const()[name = tensor<string, []>("op_865_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_865_cast_fp16 = slice_by_index(begin = var_865_begin_0, end = var_865_end_0, end_mask = var_865_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_865_cast_fp16")];
+            tensor<int32, [4]> var_871_begin_0 = const()[name = tensor<string, []>("op_871_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_871_end_0 = const()[name = tensor<string, []>("op_871_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_871_end_mask_0 = const()[name = tensor<string, []>("op_871_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_871_cast_fp16 = slice_by_index(begin = var_871_begin_0, end = var_871_end_0, end_mask = var_871_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_871_cast_fp16")];
+            tensor<fp16, []> const_30_promoted_to_fp16 = const()[name = tensor<string, []>("const_30_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_873_cast_fp16 = mul(x = var_871_cast_fp16, y = const_30_promoted_to_fp16)[name = tensor<string, []>("op_873_cast_fp16")];
+            tensor<bool, []> rotated_5_interleave_0 = const()[name = tensor<string, []>("rotated_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_5_cast_fp16 = concat(axis = var_744, interleave = rotated_5_interleave_0, values = (var_873_cast_fp16, var_865_cast_fp16))[name = tensor<string, []>("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_876_cast_fp16 = mul(x = q_11_cast_fp16, y = cos)[name = tensor<string, []>("op_876_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_877_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = tensor<string, []>("op_877_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_5_cast_fp16 = add(x = var_876_cast_fp16, y = var_877_cast_fp16)[name = tensor<string, []>("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_890_begin_0 = const()[name = tensor<string, []>("op_890_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_890_end_0 = const()[name = tensor<string, []>("op_890_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_890_end_mask_0 = const()[name = tensor<string, []>("op_890_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_890_cast_fp16 = slice_by_index(begin = var_890_begin_0, end = var_890_end_0, end_mask = var_890_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_890_cast_fp16")];
+            tensor<int32, [4]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_896_cast_fp16 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_896_cast_fp16")];
+            tensor<fp16, []> const_32_promoted_to_fp16 = const()[name = tensor<string, []>("const_32_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_898_cast_fp16 = mul(x = var_896_cast_fp16, y = const_32_promoted_to_fp16)[name = tensor<string, []>("op_898_cast_fp16")];
+            tensor<bool, []> rotated_interleave_0 = const()[name = tensor<string, []>("rotated_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_cast_fp16 = concat(axis = var_744, interleave = rotated_interleave_0, values = (var_898_cast_fp16, var_890_cast_fp16))[name = tensor<string, []>("rotated_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_901_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = tensor<string, []>("op_901_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_902_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = tensor<string, []>("op_902_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_cast_fp16 = add(x = var_901_cast_fp16, y = var_902_cast_fp16)[name = tensor<string, []>("roped_cast_fp16")];
+            tensor<int32, [4]> var_905 = const()[name = tensor<string, []>("op_905"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_19_cast_fp16 = reshape(shape = var_905, x = roped_cast_fp16)[name = tensor<string, []>("k_19_cast_fp16")];
+            tensor<int32, [4]> var_907 = const()[name = tensor<string, []>("op_907"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_1 = reshape(shape = var_907, x = v_11_cast_fp16)[name = tensor<string, []>("new_v_cache_1_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_interleave_0 = const()[name = tensor<string, []>("k_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_1 = transpose(perm = k_21_perm_0, x = k_19_cast_fp16)[name = tensor<string, []>("transpose_0")];
+            tensor<fp16, [1, 512, 1, 1024]> k_cast_fp16 = concat(axis = var_746, interleave = k_interleave_0, values = (k_cache_1, new_k_cache_1))[name = tensor<string, []>("k_cast_fp16")];
+            tensor<bool, []> v_17_interleave_0 = const()[name = tensor<string, []>("v_17_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_17_cast_fp16 = concat(axis = var_740, interleave = v_17_interleave_0, values = (v_cache_1, new_v_cache_1))[name = tensor<string, []>("v_17_cast_fp16")];
+            tensor<int32, [4]> var_915 = const()[name = tensor<string, []>("op_915"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_cast_fp16 = reshape(shape = var_915, x = roped_5_cast_fp16)[name = tensor<string, []>("q_cast_fp16")];
+            tensor<int32, [4]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_920_cast_fp16 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_920_cast_fp16")];
+            tensor<int32, [4]> var_924_begin_0 = const()[name = tensor<string, []>("op_924_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_924_end_0 = const()[name = tensor<string, []>("op_924_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_924_end_mask_0 = const()[name = tensor<string, []>("op_924_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_924_cast_fp16 = slice_by_index(begin = var_924_begin_0, end = var_924_end_0, end_mask = var_924_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_924_cast_fp16")];
+            tensor<int32, [4]> var_928_begin_0 = const()[name = tensor<string, []>("op_928_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_928_end_0 = const()[name = tensor<string, []>("op_928_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_928_end_mask_0 = const()[name = tensor<string, []>("op_928_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_928_cast_fp16 = slice_by_index(begin = var_928_begin_0, end = var_928_end_0, end_mask = var_928_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_928_cast_fp16")];
+            tensor<int32, [4]> var_932_begin_0 = const()[name = tensor<string, []>("op_932_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_932_end_0 = const()[name = tensor<string, []>("op_932_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_932_end_mask_0 = const()[name = tensor<string, []>("op_932_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_932_cast_fp16 = slice_by_index(begin = var_932_begin_0, end = var_932_end_0, end_mask = var_932_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_932_cast_fp16")];
+            tensor<int32, [4]> var_936_begin_0 = const()[name = tensor<string, []>("op_936_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_936_end_0 = const()[name = tensor<string, []>("op_936_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_936_end_mask_0 = const()[name = tensor<string, []>("op_936_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_936_cast_fp16 = slice_by_index(begin = var_936_begin_0, end = var_936_end_0, end_mask = var_936_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_936_cast_fp16")];
+            tensor<int32, [4]> var_940_begin_0 = const()[name = tensor<string, []>("op_940_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_940_end_0 = const()[name = tensor<string, []>("op_940_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_940_end_mask_0 = const()[name = tensor<string, []>("op_940_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_940_cast_fp16 = slice_by_index(begin = var_940_begin_0, end = var_940_end_0, end_mask = var_940_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_940_cast_fp16")];
+            tensor<int32, [4]> var_944_begin_0 = const()[name = tensor<string, []>("op_944_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_944_end_0 = const()[name = tensor<string, []>("op_944_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_944_end_mask_0 = const()[name = tensor<string, []>("op_944_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_944_cast_fp16 = slice_by_index(begin = var_944_begin_0, end = var_944_end_0, end_mask = var_944_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_944_cast_fp16")];
+            tensor<int32, [4]> var_948_begin_0 = const()[name = tensor<string, []>("op_948_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_948_end_0 = const()[name = tensor<string, []>("op_948_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_948_end_mask_0 = const()[name = tensor<string, []>("op_948_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_948_cast_fp16 = slice_by_index(begin = var_948_begin_0, end = var_948_end_0, end_mask = var_948_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_948_cast_fp16")];
+            tensor<int32, [4]> var_952_begin_0 = const()[name = tensor<string, []>("op_952_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_952_end_0 = const()[name = tensor<string, []>("op_952_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_952_end_mask_0 = const()[name = tensor<string, []>("op_952_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_952_cast_fp16 = slice_by_index(begin = var_952_begin_0, end = var_952_end_0, end_mask = var_952_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_952_cast_fp16")];
+            tensor<int32, [4]> var_956_begin_0 = const()[name = tensor<string, []>("op_956_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_956_end_0 = const()[name = tensor<string, []>("op_956_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_956_end_mask_0 = const()[name = tensor<string, []>("op_956_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_956_cast_fp16 = slice_by_index(begin = var_956_begin_0, end = var_956_end_0, end_mask = var_956_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_956_cast_fp16")];
+            tensor<int32, [4]> var_960_begin_0 = const()[name = tensor<string, []>("op_960_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_960_end_0 = const()[name = tensor<string, []>("op_960_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_960_end_mask_0 = const()[name = tensor<string, []>("op_960_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_960_cast_fp16 = slice_by_index(begin = var_960_begin_0, end = var_960_end_0, end_mask = var_960_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_960_cast_fp16")];
+            tensor<int32, [4]> var_964_begin_0 = const()[name = tensor<string, []>("op_964_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_964_end_0 = const()[name = tensor<string, []>("op_964_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_964_end_mask_0 = const()[name = tensor<string, []>("op_964_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_964_cast_fp16 = slice_by_index(begin = var_964_begin_0, end = var_964_end_0, end_mask = var_964_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_964_cast_fp16")];
+            tensor<int32, [4]> var_968_begin_0 = const()[name = tensor<string, []>("op_968_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_968_end_0 = const()[name = tensor<string, []>("op_968_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_968_end_mask_0 = const()[name = tensor<string, []>("op_968_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_968_cast_fp16 = slice_by_index(begin = var_968_begin_0, end = var_968_end_0, end_mask = var_968_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_968_cast_fp16")];
+            tensor<int32, [4]> var_972_begin_0 = const()[name = tensor<string, []>("op_972_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_972_end_0 = const()[name = tensor<string, []>("op_972_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_972_end_mask_0 = const()[name = tensor<string, []>("op_972_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_972_cast_fp16 = slice_by_index(begin = var_972_begin_0, end = var_972_end_0, end_mask = var_972_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_972_cast_fp16")];
+            tensor<int32, [4]> var_976_begin_0 = const()[name = tensor<string, []>("op_976_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_976_end_0 = const()[name = tensor<string, []>("op_976_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_976_end_mask_0 = const()[name = tensor<string, []>("op_976_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_976_cast_fp16 = slice_by_index(begin = var_976_begin_0, end = var_976_end_0, end_mask = var_976_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_976_cast_fp16")];
+            tensor<int32, [4]> var_980_begin_0 = const()[name = tensor<string, []>("op_980_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_980_end_0 = const()[name = tensor<string, []>("op_980_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_980_end_mask_0 = const()[name = tensor<string, []>("op_980_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_980_cast_fp16 = slice_by_index(begin = var_980_begin_0, end = var_980_end_0, end_mask = var_980_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_980_cast_fp16")];
+            tensor<int32, [4]> var_984_begin_0 = const()[name = tensor<string, []>("op_984_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_984_end_0 = const()[name = tensor<string, []>("op_984_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_984_end_mask_0 = const()[name = tensor<string, []>("op_984_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_984_cast_fp16 = slice_by_index(begin = var_984_begin_0, end = var_984_end_0, end_mask = var_984_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_984_cast_fp16")];
+            tensor<int32, [4]> var_988_begin_0 = const()[name = tensor<string, []>("op_988_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_988_end_0 = const()[name = tensor<string, []>("op_988_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_988_end_mask_0 = const()[name = tensor<string, []>("op_988_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_988_cast_fp16 = slice_by_index(begin = var_988_begin_0, end = var_988_end_0, end_mask = var_988_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_988_cast_fp16")];
+            tensor<int32, [4]> var_992_begin_0 = const()[name = tensor<string, []>("op_992_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_992_end_0 = const()[name = tensor<string, []>("op_992_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_992_end_mask_0 = const()[name = tensor<string, []>("op_992_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_992_cast_fp16 = slice_by_index(begin = var_992_begin_0, end = var_992_end_0, end_mask = var_992_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_992_cast_fp16")];
+            tensor<int32, [4]> var_996_begin_0 = const()[name = tensor<string, []>("op_996_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_996_end_0 = const()[name = tensor<string, []>("op_996_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_996_end_mask_0 = const()[name = tensor<string, []>("op_996_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_996_cast_fp16 = slice_by_index(begin = var_996_begin_0, end = var_996_end_0, end_mask = var_996_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_996_cast_fp16")];
+            tensor<int32, [4]> var_1000_begin_0 = const()[name = tensor<string, []>("op_1000_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_1000_end_0 = const()[name = tensor<string, []>("op_1000_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_1000_end_mask_0 = const()[name = tensor<string, []>("op_1000_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1000_cast_fp16 = slice_by_index(begin = var_1000_begin_0, end = var_1000_end_0, end_mask = var_1000_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1000_cast_fp16")];
+            tensor<int32, [4]> var_1004_begin_0 = const()[name = tensor<string, []>("op_1004_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_1004_end_0 = const()[name = tensor<string, []>("op_1004_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_1004_end_mask_0 = const()[name = tensor<string, []>("op_1004_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1004_cast_fp16 = slice_by_index(begin = var_1004_begin_0, end = var_1004_end_0, end_mask = var_1004_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1004_cast_fp16")];
+            tensor<int32, [4]> var_1008_begin_0 = const()[name = tensor<string, []>("op_1008_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_1008_end_0 = const()[name = tensor<string, []>("op_1008_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_1008_end_mask_0 = const()[name = tensor<string, []>("op_1008_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1008_cast_fp16 = slice_by_index(begin = var_1008_begin_0, end = var_1008_end_0, end_mask = var_1008_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1008_cast_fp16")];
+            tensor<int32, [4]> var_1012_begin_0 = const()[name = tensor<string, []>("op_1012_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_1012_end_0 = const()[name = tensor<string, []>("op_1012_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_1012_end_mask_0 = const()[name = tensor<string, []>("op_1012_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1012_cast_fp16 = slice_by_index(begin = var_1012_begin_0, end = var_1012_end_0, end_mask = var_1012_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1012_cast_fp16")];
+            tensor<int32, [4]> var_1018_begin_0 = const()[name = tensor<string, []>("op_1018_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1018_end_0 = const()[name = tensor<string, []>("op_1018_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_1018_end_mask_0 = const()[name = tensor<string, []>("op_1018_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1018_cast_fp16 = slice_by_index(begin = var_1018_begin_0, end = var_1018_end_0, end_mask = var_1018_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1018_cast_fp16")];
+            tensor<int32, [4]> var_1030_begin_0 = const()[name = tensor<string, []>("op_1030_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_1030_end_0 = const()[name = tensor<string, []>("op_1030_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_1030_end_mask_0 = const()[name = tensor<string, []>("op_1030_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1030_cast_fp16 = slice_by_index(begin = var_1030_begin_0, end = var_1030_end_0, end_mask = var_1030_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1030_cast_fp16")];
+            tensor<int32, [4]> var_1042_begin_0 = const()[name = tensor<string, []>("op_1042_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_1042_end_0 = const()[name = tensor<string, []>("op_1042_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_1042_end_mask_0 = const()[name = tensor<string, []>("op_1042_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1042_cast_fp16 = slice_by_index(begin = var_1042_begin_0, end = var_1042_end_0, end_mask = var_1042_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1042_cast_fp16")];
+            tensor<int32, [4]> var_1054_begin_0 = const()[name = tensor<string, []>("op_1054_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_1054_end_0 = const()[name = tensor<string, []>("op_1054_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1054_end_mask_0 = const()[name = tensor<string, []>("op_1054_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1054_cast_fp16 = slice_by_index(begin = var_1054_begin_0, end = var_1054_end_0, end_mask = var_1054_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1054_cast_fp16")];
+            tensor<int32, [4]> var_1066_begin_0 = const()[name = tensor<string, []>("op_1066_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_1066_end_0 = const()[name = tensor<string, []>("op_1066_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_1066_end_mask_0 = const()[name = tensor<string, []>("op_1066_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1066_cast_fp16 = slice_by_index(begin = var_1066_begin_0, end = var_1066_end_0, end_mask = var_1066_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1066_cast_fp16")];
+            tensor<int32, [4]> var_1078_begin_0 = const()[name = tensor<string, []>("op_1078_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_1078_end_0 = const()[name = tensor<string, []>("op_1078_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_1078_end_mask_0 = const()[name = tensor<string, []>("op_1078_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1078_cast_fp16 = slice_by_index(begin = var_1078_begin_0, end = var_1078_end_0, end_mask = var_1078_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1078_cast_fp16")];
+            tensor<int32, [4]> var_1090_begin_0 = const()[name = tensor<string, []>("op_1090_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_1090_end_0 = const()[name = tensor<string, []>("op_1090_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_1090_end_mask_0 = const()[name = tensor<string, []>("op_1090_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1090_cast_fp16 = slice_by_index(begin = var_1090_begin_0, end = var_1090_end_0, end_mask = var_1090_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1090_cast_fp16")];
+            tensor<int32, [4]> var_1102_begin_0 = const()[name = tensor<string, []>("op_1102_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_1102_end_0 = const()[name = tensor<string, []>("op_1102_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_1102_end_mask_0 = const()[name = tensor<string, []>("op_1102_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1102_cast_fp16 = slice_by_index(begin = var_1102_begin_0, end = var_1102_end_0, end_mask = var_1102_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1102_cast_fp16")];
+            tensor<int32, [4]> var_1112_begin_0 = const()[name = tensor<string, []>("op_1112_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1112_end_0 = const()[name = tensor<string, []>("op_1112_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_1112_end_mask_0 = const()[name = tensor<string, []>("op_1112_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1112_cast_fp16 = slice_by_index(begin = var_1112_begin_0, end = var_1112_end_0, end_mask = var_1112_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1112_cast_fp16")];
+            tensor<int32, [4]> var_1124_begin_0 = const()[name = tensor<string, []>("op_1124_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_1124_end_0 = const()[name = tensor<string, []>("op_1124_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_1124_end_mask_0 = const()[name = tensor<string, []>("op_1124_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1124_cast_fp16 = slice_by_index(begin = var_1124_begin_0, end = var_1124_end_0, end_mask = var_1124_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1124_cast_fp16")];
+            tensor<int32, [4]> var_1136_begin_0 = const()[name = tensor<string, []>("op_1136_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_1136_end_0 = const()[name = tensor<string, []>("op_1136_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_1136_end_mask_0 = const()[name = tensor<string, []>("op_1136_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1136_cast_fp16 = slice_by_index(begin = var_1136_begin_0, end = var_1136_end_0, end_mask = var_1136_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1136_cast_fp16")];
+            tensor<int32, [4]> var_1148_begin_0 = const()[name = tensor<string, []>("op_1148_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_1148_end_0 = const()[name = tensor<string, []>("op_1148_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1148_end_mask_0 = const()[name = tensor<string, []>("op_1148_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1148_cast_fp16 = slice_by_index(begin = var_1148_begin_0, end = var_1148_end_0, end_mask = var_1148_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1148_cast_fp16")];
+            tensor<int32, [4]> var_1160_begin_0 = const()[name = tensor<string, []>("op_1160_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_1160_end_0 = const()[name = tensor<string, []>("op_1160_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_1160_end_mask_0 = const()[name = tensor<string, []>("op_1160_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1160_cast_fp16 = slice_by_index(begin = var_1160_begin_0, end = var_1160_end_0, end_mask = var_1160_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1160_cast_fp16")];
+            tensor<int32, [4]> var_1172_begin_0 = const()[name = tensor<string, []>("op_1172_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_1172_end_0 = const()[name = tensor<string, []>("op_1172_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_1172_end_mask_0 = const()[name = tensor<string, []>("op_1172_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1172_cast_fp16 = slice_by_index(begin = var_1172_begin_0, end = var_1172_end_0, end_mask = var_1172_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1172_cast_fp16")];
+            tensor<int32, [4]> var_1184_begin_0 = const()[name = tensor<string, []>("op_1184_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_1184_end_0 = const()[name = tensor<string, []>("op_1184_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_1184_end_mask_0 = const()[name = tensor<string, []>("op_1184_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1184_cast_fp16 = slice_by_index(begin = var_1184_begin_0, end = var_1184_end_0, end_mask = var_1184_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1184_cast_fp16")];
+            tensor<int32, [4]> var_1196_begin_0 = const()[name = tensor<string, []>("op_1196_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_1196_end_0 = const()[name = tensor<string, []>("op_1196_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_1196_end_mask_0 = const()[name = tensor<string, []>("op_1196_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1196_cast_fp16 = slice_by_index(begin = var_1196_begin_0, end = var_1196_end_0, end_mask = var_1196_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1196_cast_fp16")];
+            tensor<string, []> var_1208_equation_0 = const()[name = tensor<string, []>("op_1208_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1208_cast_fp16 = einsum(equation = var_1208_equation_0, values = (var_1018_cast_fp16, var_920_cast_fp16))[name = tensor<string, []>("op_1208_cast_fp16")];
+            tensor<fp16, []> var_1209_to_fp16 = const()[name = tensor<string, []>("op_1209_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1210_cast_fp16 = mul(x = var_1208_cast_fp16, y = var_1209_to_fp16)[name = tensor<string, []>("op_1210_cast_fp16")];
+            tensor<string, []> var_1212_equation_0 = const()[name = tensor<string, []>("op_1212_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1212_cast_fp16 = einsum(equation = var_1212_equation_0, values = (var_1018_cast_fp16, var_924_cast_fp16))[name = tensor<string, []>("op_1212_cast_fp16")];
+            tensor<fp16, []> var_1213_to_fp16 = const()[name = tensor<string, []>("op_1213_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1214_cast_fp16 = mul(x = var_1212_cast_fp16, y = var_1213_to_fp16)[name = tensor<string, []>("op_1214_cast_fp16")];
+            tensor<string, []> var_1216_equation_0 = const()[name = tensor<string, []>("op_1216_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1216_cast_fp16 = einsum(equation = var_1216_equation_0, values = (var_1018_cast_fp16, var_928_cast_fp16))[name = tensor<string, []>("op_1216_cast_fp16")];
+            tensor<fp16, []> var_1217_to_fp16 = const()[name = tensor<string, []>("op_1217_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1218_cast_fp16 = mul(x = var_1216_cast_fp16, y = var_1217_to_fp16)[name = tensor<string, []>("op_1218_cast_fp16")];
+            tensor<string, []> var_1220_equation_0 = const()[name = tensor<string, []>("op_1220_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1220_cast_fp16 = einsum(equation = var_1220_equation_0, values = (var_1030_cast_fp16, var_932_cast_fp16))[name = tensor<string, []>("op_1220_cast_fp16")];
+            tensor<fp16, []> var_1221_to_fp16 = const()[name = tensor<string, []>("op_1221_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1222_cast_fp16 = mul(x = var_1220_cast_fp16, y = var_1221_to_fp16)[name = tensor<string, []>("op_1222_cast_fp16")];
+            tensor<string, []> var_1224_equation_0 = const()[name = tensor<string, []>("op_1224_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1224_cast_fp16 = einsum(equation = var_1224_equation_0, values = (var_1030_cast_fp16, var_936_cast_fp16))[name = tensor<string, []>("op_1224_cast_fp16")];
+            tensor<fp16, []> var_1225_to_fp16 = const()[name = tensor<string, []>("op_1225_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1226_cast_fp16 = mul(x = var_1224_cast_fp16, y = var_1225_to_fp16)[name = tensor<string, []>("op_1226_cast_fp16")];
+            tensor<string, []> var_1228_equation_0 = const()[name = tensor<string, []>("op_1228_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1228_cast_fp16 = einsum(equation = var_1228_equation_0, values = (var_1030_cast_fp16, var_940_cast_fp16))[name = tensor<string, []>("op_1228_cast_fp16")];
+            tensor<fp16, []> var_1229_to_fp16 = const()[name = tensor<string, []>("op_1229_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1230_cast_fp16 = mul(x = var_1228_cast_fp16, y = var_1229_to_fp16)[name = tensor<string, []>("op_1230_cast_fp16")];
+            tensor<string, []> var_1232_equation_0 = const()[name = tensor<string, []>("op_1232_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1232_cast_fp16 = einsum(equation = var_1232_equation_0, values = (var_1042_cast_fp16, var_944_cast_fp16))[name = tensor<string, []>("op_1232_cast_fp16")];
+            tensor<fp16, []> var_1233_to_fp16 = const()[name = tensor<string, []>("op_1233_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1234_cast_fp16 = mul(x = var_1232_cast_fp16, y = var_1233_to_fp16)[name = tensor<string, []>("op_1234_cast_fp16")];
+            tensor<string, []> var_1236_equation_0 = const()[name = tensor<string, []>("op_1236_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1236_cast_fp16 = einsum(equation = var_1236_equation_0, values = (var_1042_cast_fp16, var_948_cast_fp16))[name = tensor<string, []>("op_1236_cast_fp16")];
+            tensor<fp16, []> var_1237_to_fp16 = const()[name = tensor<string, []>("op_1237_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1238_cast_fp16 = mul(x = var_1236_cast_fp16, y = var_1237_to_fp16)[name = tensor<string, []>("op_1238_cast_fp16")];
+            tensor<string, []> var_1240_equation_0 = const()[name = tensor<string, []>("op_1240_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1240_cast_fp16 = einsum(equation = var_1240_equation_0, values = (var_1042_cast_fp16, var_952_cast_fp16))[name = tensor<string, []>("op_1240_cast_fp16")];
+            tensor<fp16, []> var_1241_to_fp16 = const()[name = tensor<string, []>("op_1241_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1242_cast_fp16 = mul(x = var_1240_cast_fp16, y = var_1241_to_fp16)[name = tensor<string, []>("op_1242_cast_fp16")];
+            tensor<string, []> var_1244_equation_0 = const()[name = tensor<string, []>("op_1244_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1244_cast_fp16 = einsum(equation = var_1244_equation_0, values = (var_1054_cast_fp16, var_956_cast_fp16))[name = tensor<string, []>("op_1244_cast_fp16")];
+            tensor<fp16, []> var_1245_to_fp16 = const()[name = tensor<string, []>("op_1245_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1246_cast_fp16 = mul(x = var_1244_cast_fp16, y = var_1245_to_fp16)[name = tensor<string, []>("op_1246_cast_fp16")];
+            tensor<string, []> var_1248_equation_0 = const()[name = tensor<string, []>("op_1248_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1248_cast_fp16 = einsum(equation = var_1248_equation_0, values = (var_1054_cast_fp16, var_960_cast_fp16))[name = tensor<string, []>("op_1248_cast_fp16")];
+            tensor<fp16, []> var_1249_to_fp16 = const()[name = tensor<string, []>("op_1249_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1250_cast_fp16 = mul(x = var_1248_cast_fp16, y = var_1249_to_fp16)[name = tensor<string, []>("op_1250_cast_fp16")];
+            tensor<string, []> var_1252_equation_0 = const()[name = tensor<string, []>("op_1252_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1252_cast_fp16 = einsum(equation = var_1252_equation_0, values = (var_1054_cast_fp16, var_964_cast_fp16))[name = tensor<string, []>("op_1252_cast_fp16")];
+            tensor<fp16, []> var_1253_to_fp16 = const()[name = tensor<string, []>("op_1253_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1254_cast_fp16 = mul(x = var_1252_cast_fp16, y = var_1253_to_fp16)[name = tensor<string, []>("op_1254_cast_fp16")];
+            tensor<string, []> var_1256_equation_0 = const()[name = tensor<string, []>("op_1256_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1256_cast_fp16 = einsum(equation = var_1256_equation_0, values = (var_1066_cast_fp16, var_968_cast_fp16))[name = tensor<string, []>("op_1256_cast_fp16")];
+            tensor<fp16, []> var_1257_to_fp16 = const()[name = tensor<string, []>("op_1257_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1258_cast_fp16 = mul(x = var_1256_cast_fp16, y = var_1257_to_fp16)[name = tensor<string, []>("op_1258_cast_fp16")];
+            tensor<string, []> var_1260_equation_0 = const()[name = tensor<string, []>("op_1260_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1260_cast_fp16 = einsum(equation = var_1260_equation_0, values = (var_1066_cast_fp16, var_972_cast_fp16))[name = tensor<string, []>("op_1260_cast_fp16")];
+            tensor<fp16, []> var_1261_to_fp16 = const()[name = tensor<string, []>("op_1261_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1262_cast_fp16 = mul(x = var_1260_cast_fp16, y = var_1261_to_fp16)[name = tensor<string, []>("op_1262_cast_fp16")];
+            tensor<string, []> var_1264_equation_0 = const()[name = tensor<string, []>("op_1264_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1264_cast_fp16 = einsum(equation = var_1264_equation_0, values = (var_1066_cast_fp16, var_976_cast_fp16))[name = tensor<string, []>("op_1264_cast_fp16")];
+            tensor<fp16, []> var_1265_to_fp16 = const()[name = tensor<string, []>("op_1265_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1266_cast_fp16 = mul(x = var_1264_cast_fp16, y = var_1265_to_fp16)[name = tensor<string, []>("op_1266_cast_fp16")];
+            tensor<string, []> var_1268_equation_0 = const()[name = tensor<string, []>("op_1268_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1268_cast_fp16 = einsum(equation = var_1268_equation_0, values = (var_1078_cast_fp16, var_980_cast_fp16))[name = tensor<string, []>("op_1268_cast_fp16")];
+            tensor<fp16, []> var_1269_to_fp16 = const()[name = tensor<string, []>("op_1269_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1270_cast_fp16 = mul(x = var_1268_cast_fp16, y = var_1269_to_fp16)[name = tensor<string, []>("op_1270_cast_fp16")];
+            tensor<string, []> var_1272_equation_0 = const()[name = tensor<string, []>("op_1272_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1272_cast_fp16 = einsum(equation = var_1272_equation_0, values = (var_1078_cast_fp16, var_984_cast_fp16))[name = tensor<string, []>("op_1272_cast_fp16")];
+            tensor<fp16, []> var_1273_to_fp16 = const()[name = tensor<string, []>("op_1273_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1274_cast_fp16 = mul(x = var_1272_cast_fp16, y = var_1273_to_fp16)[name = tensor<string, []>("op_1274_cast_fp16")];
+            tensor<string, []> var_1276_equation_0 = const()[name = tensor<string, []>("op_1276_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1276_cast_fp16 = einsum(equation = var_1276_equation_0, values = (var_1078_cast_fp16, var_988_cast_fp16))[name = tensor<string, []>("op_1276_cast_fp16")];
+            tensor<fp16, []> var_1277_to_fp16 = const()[name = tensor<string, []>("op_1277_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1278_cast_fp16 = mul(x = var_1276_cast_fp16, y = var_1277_to_fp16)[name = tensor<string, []>("op_1278_cast_fp16")];
+            tensor<string, []> var_1280_equation_0 = const()[name = tensor<string, []>("op_1280_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1280_cast_fp16 = einsum(equation = var_1280_equation_0, values = (var_1090_cast_fp16, var_992_cast_fp16))[name = tensor<string, []>("op_1280_cast_fp16")];
+            tensor<fp16, []> var_1281_to_fp16 = const()[name = tensor<string, []>("op_1281_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1282_cast_fp16 = mul(x = var_1280_cast_fp16, y = var_1281_to_fp16)[name = tensor<string, []>("op_1282_cast_fp16")];
+            tensor<string, []> var_1284_equation_0 = const()[name = tensor<string, []>("op_1284_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1284_cast_fp16 = einsum(equation = var_1284_equation_0, values = (var_1090_cast_fp16, var_996_cast_fp16))[name = tensor<string, []>("op_1284_cast_fp16")];
+            tensor<fp16, []> var_1285_to_fp16 = const()[name = tensor<string, []>("op_1285_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1286_cast_fp16 = mul(x = var_1284_cast_fp16, y = var_1285_to_fp16)[name = tensor<string, []>("op_1286_cast_fp16")];
+            tensor<string, []> var_1288_equation_0 = const()[name = tensor<string, []>("op_1288_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1288_cast_fp16 = einsum(equation = var_1288_equation_0, values = (var_1090_cast_fp16, var_1000_cast_fp16))[name = tensor<string, []>("op_1288_cast_fp16")];
+            tensor<fp16, []> var_1289_to_fp16 = const()[name = tensor<string, []>("op_1289_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1290_cast_fp16 = mul(x = var_1288_cast_fp16, y = var_1289_to_fp16)[name = tensor<string, []>("op_1290_cast_fp16")];
+            tensor<string, []> var_1292_equation_0 = const()[name = tensor<string, []>("op_1292_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1292_cast_fp16 = einsum(equation = var_1292_equation_0, values = (var_1102_cast_fp16, var_1004_cast_fp16))[name = tensor<string, []>("op_1292_cast_fp16")];
+            tensor<fp16, []> var_1293_to_fp16 = const()[name = tensor<string, []>("op_1293_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1294_cast_fp16 = mul(x = var_1292_cast_fp16, y = var_1293_to_fp16)[name = tensor<string, []>("op_1294_cast_fp16")];
+            tensor<string, []> var_1296_equation_0 = const()[name = tensor<string, []>("op_1296_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1296_cast_fp16 = einsum(equation = var_1296_equation_0, values = (var_1102_cast_fp16, var_1008_cast_fp16))[name = tensor<string, []>("op_1296_cast_fp16")];
+            tensor<fp16, []> var_1297_to_fp16 = const()[name = tensor<string, []>("op_1297_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1298_cast_fp16 = mul(x = var_1296_cast_fp16, y = var_1297_to_fp16)[name = tensor<string, []>("op_1298_cast_fp16")];
+            tensor<string, []> var_1300_equation_0 = const()[name = tensor<string, []>("op_1300_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1300_cast_fp16 = einsum(equation = var_1300_equation_0, values = (var_1102_cast_fp16, var_1012_cast_fp16))[name = tensor<string, []>("op_1300_cast_fp16")];
+            tensor<fp16, []> var_1301_to_fp16 = const()[name = tensor<string, []>("op_1301_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1302_cast_fp16 = mul(x = var_1300_cast_fp16, y = var_1301_to_fp16)[name = tensor<string, []>("op_1302_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_49_cast_fp16 = add(x = var_1210_cast_fp16, y = mask)[name = tensor<string, []>("aw_49_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_51_cast_fp16 = add(x = var_1214_cast_fp16, y = mask)[name = tensor<string, []>("aw_51_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_53_cast_fp16 = add(x = var_1218_cast_fp16, y = mask)[name = tensor<string, []>("aw_53_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_55_cast_fp16 = add(x = var_1222_cast_fp16, y = mask)[name = tensor<string, []>("aw_55_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_57_cast_fp16 = add(x = var_1226_cast_fp16, y = mask)[name = tensor<string, []>("aw_57_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_59_cast_fp16 = add(x = var_1230_cast_fp16, y = mask)[name = tensor<string, []>("aw_59_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_61_cast_fp16 = add(x = var_1234_cast_fp16, y = mask)[name = tensor<string, []>("aw_61_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_63_cast_fp16 = add(x = var_1238_cast_fp16, y = mask)[name = tensor<string, []>("aw_63_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_65_cast_fp16 = add(x = var_1242_cast_fp16, y = mask)[name = tensor<string, []>("aw_65_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_67_cast_fp16 = add(x = var_1246_cast_fp16, y = mask)[name = tensor<string, []>("aw_67_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_69_cast_fp16 = add(x = var_1250_cast_fp16, y = mask)[name = tensor<string, []>("aw_69_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_71_cast_fp16 = add(x = var_1254_cast_fp16, y = mask)[name = tensor<string, []>("aw_71_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_73_cast_fp16 = add(x = var_1258_cast_fp16, y = mask)[name = tensor<string, []>("aw_73_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_75_cast_fp16 = add(x = var_1262_cast_fp16, y = mask)[name = tensor<string, []>("aw_75_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_77_cast_fp16 = add(x = var_1266_cast_fp16, y = mask)[name = tensor<string, []>("aw_77_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_79_cast_fp16 = add(x = var_1270_cast_fp16, y = mask)[name = tensor<string, []>("aw_79_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_81_cast_fp16 = add(x = var_1274_cast_fp16, y = mask)[name = tensor<string, []>("aw_81_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_83_cast_fp16 = add(x = var_1278_cast_fp16, y = mask)[name = tensor<string, []>("aw_83_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_85_cast_fp16 = add(x = var_1282_cast_fp16, y = mask)[name = tensor<string, []>("aw_85_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_87_cast_fp16 = add(x = var_1286_cast_fp16, y = mask)[name = tensor<string, []>("aw_87_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_89_cast_fp16 = add(x = var_1290_cast_fp16, y = mask)[name = tensor<string, []>("aw_89_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_91_cast_fp16 = add(x = var_1294_cast_fp16, y = mask)[name = tensor<string, []>("aw_91_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_93_cast_fp16 = add(x = var_1298_cast_fp16, y = mask)[name = tensor<string, []>("aw_93_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_cast_fp16 = add(x = var_1302_cast_fp16, y = mask)[name = tensor<string, []>("aw_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1327_cast_fp16 = softmax(axis = var_779, x = aw_49_cast_fp16)[name = tensor<string, []>("op_1327_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1328_cast_fp16 = softmax(axis = var_779, x = aw_51_cast_fp16)[name = tensor<string, []>("op_1328_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1329_cast_fp16 = softmax(axis = var_779, x = aw_53_cast_fp16)[name = tensor<string, []>("op_1329_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1330_cast_fp16 = softmax(axis = var_779, x = aw_55_cast_fp16)[name = tensor<string, []>("op_1330_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1331_cast_fp16 = softmax(axis = var_779, x = aw_57_cast_fp16)[name = tensor<string, []>("op_1331_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1332_cast_fp16 = softmax(axis = var_779, x = aw_59_cast_fp16)[name = tensor<string, []>("op_1332_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1333_cast_fp16 = softmax(axis = var_779, x = aw_61_cast_fp16)[name = tensor<string, []>("op_1333_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1334_cast_fp16 = softmax(axis = var_779, x = aw_63_cast_fp16)[name = tensor<string, []>("op_1334_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1335_cast_fp16 = softmax(axis = var_779, x = aw_65_cast_fp16)[name = tensor<string, []>("op_1335_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1336_cast_fp16 = softmax(axis = var_779, x = aw_67_cast_fp16)[name = tensor<string, []>("op_1336_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1337_cast_fp16 = softmax(axis = var_779, x = aw_69_cast_fp16)[name = tensor<string, []>("op_1337_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1338_cast_fp16 = softmax(axis = var_779, x = aw_71_cast_fp16)[name = tensor<string, []>("op_1338_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1339_cast_fp16 = softmax(axis = var_779, x = aw_73_cast_fp16)[name = tensor<string, []>("op_1339_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1340_cast_fp16 = softmax(axis = var_779, x = aw_75_cast_fp16)[name = tensor<string, []>("op_1340_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1341_cast_fp16 = softmax(axis = var_779, x = aw_77_cast_fp16)[name = tensor<string, []>("op_1341_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1342_cast_fp16 = softmax(axis = var_779, x = aw_79_cast_fp16)[name = tensor<string, []>("op_1342_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1343_cast_fp16 = softmax(axis = var_779, x = aw_81_cast_fp16)[name = tensor<string, []>("op_1343_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1344_cast_fp16 = softmax(axis = var_779, x = aw_83_cast_fp16)[name = tensor<string, []>("op_1344_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1345_cast_fp16 = softmax(axis = var_779, x = aw_85_cast_fp16)[name = tensor<string, []>("op_1345_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1346_cast_fp16 = softmax(axis = var_779, x = aw_87_cast_fp16)[name = tensor<string, []>("op_1346_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1347_cast_fp16 = softmax(axis = var_779, x = aw_89_cast_fp16)[name = tensor<string, []>("op_1347_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1348_cast_fp16 = softmax(axis = var_779, x = aw_91_cast_fp16)[name = tensor<string, []>("op_1348_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1349_cast_fp16 = softmax(axis = var_779, x = aw_93_cast_fp16)[name = tensor<string, []>("op_1349_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1350_cast_fp16 = softmax(axis = var_779, x = aw_cast_fp16)[name = tensor<string, []>("op_1350_cast_fp16")];
+            tensor<string, []> var_1352_equation_0 = const()[name = tensor<string, []>("op_1352_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1352_cast_fp16 = einsum(equation = var_1352_equation_0, values = (var_1112_cast_fp16, var_1327_cast_fp16))[name = tensor<string, []>("op_1352_cast_fp16")];
+            tensor<string, []> var_1354_equation_0 = const()[name = tensor<string, []>("op_1354_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1354_cast_fp16 = einsum(equation = var_1354_equation_0, values = (var_1112_cast_fp16, var_1328_cast_fp16))[name = tensor<string, []>("op_1354_cast_fp16")];
+            tensor<string, []> var_1356_equation_0 = const()[name = tensor<string, []>("op_1356_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1356_cast_fp16 = einsum(equation = var_1356_equation_0, values = (var_1112_cast_fp16, var_1329_cast_fp16))[name = tensor<string, []>("op_1356_cast_fp16")];
+            tensor<string, []> var_1358_equation_0 = const()[name = tensor<string, []>("op_1358_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1358_cast_fp16 = einsum(equation = var_1358_equation_0, values = (var_1124_cast_fp16, var_1330_cast_fp16))[name = tensor<string, []>("op_1358_cast_fp16")];
+            tensor<string, []> var_1360_equation_0 = const()[name = tensor<string, []>("op_1360_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1360_cast_fp16 = einsum(equation = var_1360_equation_0, values = (var_1124_cast_fp16, var_1331_cast_fp16))[name = tensor<string, []>("op_1360_cast_fp16")];
+            tensor<string, []> var_1362_equation_0 = const()[name = tensor<string, []>("op_1362_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1362_cast_fp16 = einsum(equation = var_1362_equation_0, values = (var_1124_cast_fp16, var_1332_cast_fp16))[name = tensor<string, []>("op_1362_cast_fp16")];
+            tensor<string, []> var_1364_equation_0 = const()[name = tensor<string, []>("op_1364_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1364_cast_fp16 = einsum(equation = var_1364_equation_0, values = (var_1136_cast_fp16, var_1333_cast_fp16))[name = tensor<string, []>("op_1364_cast_fp16")];
+            tensor<string, []> var_1366_equation_0 = const()[name = tensor<string, []>("op_1366_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1366_cast_fp16 = einsum(equation = var_1366_equation_0, values = (var_1136_cast_fp16, var_1334_cast_fp16))[name = tensor<string, []>("op_1366_cast_fp16")];
+            tensor<string, []> var_1368_equation_0 = const()[name = tensor<string, []>("op_1368_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1368_cast_fp16 = einsum(equation = var_1368_equation_0, values = (var_1136_cast_fp16, var_1335_cast_fp16))[name = tensor<string, []>("op_1368_cast_fp16")];
+            tensor<string, []> var_1370_equation_0 = const()[name = tensor<string, []>("op_1370_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1370_cast_fp16 = einsum(equation = var_1370_equation_0, values = (var_1148_cast_fp16, var_1336_cast_fp16))[name = tensor<string, []>("op_1370_cast_fp16")];
+            tensor<string, []> var_1372_equation_0 = const()[name = tensor<string, []>("op_1372_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1372_cast_fp16 = einsum(equation = var_1372_equation_0, values = (var_1148_cast_fp16, var_1337_cast_fp16))[name = tensor<string, []>("op_1372_cast_fp16")];
+            tensor<string, []> var_1374_equation_0 = const()[name = tensor<string, []>("op_1374_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1374_cast_fp16 = einsum(equation = var_1374_equation_0, values = (var_1148_cast_fp16, var_1338_cast_fp16))[name = tensor<string, []>("op_1374_cast_fp16")];
+            tensor<string, []> var_1376_equation_0 = const()[name = tensor<string, []>("op_1376_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1376_cast_fp16 = einsum(equation = var_1376_equation_0, values = (var_1160_cast_fp16, var_1339_cast_fp16))[name = tensor<string, []>("op_1376_cast_fp16")];
+            tensor<string, []> var_1378_equation_0 = const()[name = tensor<string, []>("op_1378_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1378_cast_fp16 = einsum(equation = var_1378_equation_0, values = (var_1160_cast_fp16, var_1340_cast_fp16))[name = tensor<string, []>("op_1378_cast_fp16")];
+            tensor<string, []> var_1380_equation_0 = const()[name = tensor<string, []>("op_1380_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1380_cast_fp16 = einsum(equation = var_1380_equation_0, values = (var_1160_cast_fp16, var_1341_cast_fp16))[name = tensor<string, []>("op_1380_cast_fp16")];
+            tensor<string, []> var_1382_equation_0 = const()[name = tensor<string, []>("op_1382_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1382_cast_fp16 = einsum(equation = var_1382_equation_0, values = (var_1172_cast_fp16, var_1342_cast_fp16))[name = tensor<string, []>("op_1382_cast_fp16")];
+            tensor<string, []> var_1384_equation_0 = const()[name = tensor<string, []>("op_1384_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1384_cast_fp16 = einsum(equation = var_1384_equation_0, values = (var_1172_cast_fp16, var_1343_cast_fp16))[name = tensor<string, []>("op_1384_cast_fp16")];
+            tensor<string, []> var_1386_equation_0 = const()[name = tensor<string, []>("op_1386_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1386_cast_fp16 = einsum(equation = var_1386_equation_0, values = (var_1172_cast_fp16, var_1344_cast_fp16))[name = tensor<string, []>("op_1386_cast_fp16")];
+            tensor<string, []> var_1388_equation_0 = const()[name = tensor<string, []>("op_1388_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1388_cast_fp16 = einsum(equation = var_1388_equation_0, values = (var_1184_cast_fp16, var_1345_cast_fp16))[name = tensor<string, []>("op_1388_cast_fp16")];
+            tensor<string, []> var_1390_equation_0 = const()[name = tensor<string, []>("op_1390_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1390_cast_fp16 = einsum(equation = var_1390_equation_0, values = (var_1184_cast_fp16, var_1346_cast_fp16))[name = tensor<string, []>("op_1390_cast_fp16")];
+            tensor<string, []> var_1392_equation_0 = const()[name = tensor<string, []>("op_1392_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1392_cast_fp16 = einsum(equation = var_1392_equation_0, values = (var_1184_cast_fp16, var_1347_cast_fp16))[name = tensor<string, []>("op_1392_cast_fp16")];
+            tensor<string, []> var_1394_equation_0 = const()[name = tensor<string, []>("op_1394_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1394_cast_fp16 = einsum(equation = var_1394_equation_0, values = (var_1196_cast_fp16, var_1348_cast_fp16))[name = tensor<string, []>("op_1394_cast_fp16")];
+            tensor<string, []> var_1396_equation_0 = const()[name = tensor<string, []>("op_1396_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1396_cast_fp16 = einsum(equation = var_1396_equation_0, values = (var_1196_cast_fp16, var_1349_cast_fp16))[name = tensor<string, []>("op_1396_cast_fp16")];
+            tensor<string, []> var_1398_equation_0 = const()[name = tensor<string, []>("op_1398_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1398_cast_fp16 = einsum(equation = var_1398_equation_0, values = (var_1196_cast_fp16, var_1350_cast_fp16))[name = tensor<string, []>("op_1398_cast_fp16")];
+            tensor<bool, []> x_27_interleave_0 = const()[name = tensor<string, []>("x_27_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_27_cast_fp16 = concat(axis = var_779, interleave = x_27_interleave_0, values = (var_1352_cast_fp16, var_1354_cast_fp16, var_1356_cast_fp16, var_1358_cast_fp16, var_1360_cast_fp16, var_1362_cast_fp16, var_1364_cast_fp16, var_1366_cast_fp16, var_1368_cast_fp16, var_1370_cast_fp16, var_1372_cast_fp16, var_1374_cast_fp16, var_1376_cast_fp16, var_1378_cast_fp16, var_1380_cast_fp16, var_1382_cast_fp16, var_1384_cast_fp16, var_1386_cast_fp16, var_1388_cast_fp16, var_1390_cast_fp16, var_1392_cast_fp16, var_1394_cast_fp16, var_1396_cast_fp16, var_1398_cast_fp16))[name = tensor<string, []>("x_27_cast_fp16")];
+            tensor<int32, [4]> var_1403 = const()[name = tensor<string, []>("op_1403"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_13_cast_fp16 = reshape(shape = var_1403, x = x_27_cast_fp16)[name = tensor<string, []>("input_13_cast_fp16")];
+            tensor<int32, [2]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1408 = const()[name = tensor<string, []>("op_1408"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_pad_type_0 = const()[name = tensor<string, []>("attention_output_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_pad_0 = const()[name = tensor<string, []>("attention_output_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(232803776)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_cast_fp16 = conv(dilations = var_1408, groups = var_779, pad = attention_output_pad_0, pad_type = attention_output_pad_type_0, strides = var_1406, weight = blocks_1_attn_proj_weight_to_fp16, x = input_13_cast_fp16)[name = tensor<string, []>("attention_output_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_29_cast_fp16 = add(x = attention_output_cast_fp16, y = x_17_cast_fp16)[name = tensor<string, []>("x_29_cast_fp16")];
+            tensor<bool, []> x_eps_interleave_0 = const()[name = tensor<string, []>("x_eps_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_to_fp16 = const()[name = tensor<string, []>("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678208)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_cast_fp16 = concat(axis = var_779, interleave = x_eps_interleave_0, values = (x_29_cast_fp16, eps_chan_to_fp16))[name = tensor<string, []>("x_eps_cast_fp16")];
+            tensor<int32, [1]> norm_x_axes_0 = const()[name = tensor<string, []>("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_782, x = x_eps_cast_fp16)[name = tensor<string, []>("norm_x_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_19_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_cast_fp16)[name = tensor<string, []>("x_normed_19_cast_fp16")];
+            tensor<fp16, []> var_1434_to_fp16 = const()[name = tensor<string, []>("op_1434_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_1434_to_fp16)[name = tensor<string, []>("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678400)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_15_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = tensor<string, []>("input_15_cast_fp16")];
+            tensor<int32, [2]> var_1445 = const()[name = tensor<string, []>("op_1445"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1447 = const()[name = tensor<string, []>("op_1447"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_17_pad_type_0 = const()[name = tensor<string, []>("input_17_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_17_pad_0 = const()[name = tensor<string, []>("input_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251684608)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_17_cast_fp16 = conv(dilations = var_1447, groups = var_779, pad = input_17_pad_0, pad_type = input_17_pad_type_0, strides = var_1445, weight = blocks_1_mlp_fc_1_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("input_17_cast_fp16")];
+            tensor<int32, [2]> var_1451 = const()[name = tensor<string, []>("op_1451"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1453 = const()[name = tensor<string, []>("op_1453"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_pad_0 = const()[name = tensor<string, []>("x_fc_2_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(302016320)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_cast_fp16 = conv(dilations = var_1453, groups = var_779, pad = x_fc_2_pad_0, pad_type = x_fc_2_pad_type_0, strides = var_1451, weight = blocks_1_mlp_fc_2_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_1456_cast_fp16 = silu(x = input_17_cast_fp16)[name = tensor<string, []>("op_1456_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_cast_fp16 = mul(x = var_1456_cast_fp16, y = x_fc_2_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
+            tensor<int32, [2]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1463_pad_type_0 = const()[name = tensor<string, []>("op_1463_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1463_pad_0 = const()[name = tensor<string, []>("op_1463_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_1_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(352348032)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_1463_cast_fp16 = conv(dilations = var_1461, groups = var_779, pad = var_1463_pad_0, pad_type = var_1463_pad_type_0, strides = var_1459, weight = blocks_1_mlp_proj_weight_to_fp16, x = input_cast_fp16)[name = tensor<string, []>("op_1463_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> new_x = add(x = var_1463_cast_fp16, y = x_29_cast_fp16)[name = tensor<string, []>("op_1464_cast_fp16")];
+        } -> (new_x, new_k_cache_0, new_v_cache_0, new_k_cache_1, new_v_cache_1);
+}
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk2.mlmodelc/weights/weight.bin b/Llama-3.2-3B-Instruct_chunk2.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9f8251fc3213f2ba5fe5d32a78480bd28428d83d
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk2.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d49db7568f50fc8f361e2b4cca2888853752a4784bf7a44b06842bebf37ed2b
+size 402679744
diff --git a/Llama-3.2-3B-Instruct_chunk3.mlmodelc/analytics/coremldata.bin b/Llama-3.2-3B-Instruct_chunk3.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a63af39cde8e590e41fffd270ab8aede737490d
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk3.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf21e446e7587de3fd840eae95f3e79729298df568725552f7ef5fd8f954e58c
+size 243
diff --git a/Llama-3.2-3B-Instruct_chunk3.mlmodelc/coremldata.bin b/Llama-3.2-3B-Instruct_chunk3.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ef844658693d8a7fc2951abf2761f8f5f9bc62c3
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk3.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8129d684aa1ea8b76708a186fe44f7ffc4aa08b4854907105fe41c0825e71875
+size 653
diff --git a/Llama-3.2-3B-Instruct_chunk3.mlmodelc/metadata.json b/Llama-3.2-3B-Instruct_chunk3.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..2d7f3b5135fb936c65404b61a976a65ba64ccdf7
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk3.mlmodelc/metadata.json
@@ -0,0 +1,178 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "new_x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 7,
+    "mlProgramOperationTypeHistogram" : {
+      "Concat" : 14,
+      "Ios16.mul" : 70,
+      "SliceByIndex" : 88,
+      "Transpose" : 2,
+      "Ios16.einsum" : 96,
+      "Ios16.conv" : 14,
+      "Ios16.add" : 56,
+      "Ios16.realDiv" : 4,
+      "Ios16.softmax" : 48,
+      "Ios16.reduceL2Norm" : 4,
+      "Ios16.reshape" : 14,
+      "Ios16.silu" : 2
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "13.0",
+      "tvOS" : "16.0",
+      "visionOS" : "1.0",
+      "watchOS" : "9.0",
+      "iOS" : "16.0",
+      "macCatalyst" : "16.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.source" : "torch==2.1.0",
+      "com.github.apple.coremltools.version" : "8.0b1"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "cos",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "sin",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 512 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 512, 1, 64]",
+        "name" : "mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "Llama_3_2_3B_Instruct_2024_11_09_16_14_37_chunk3",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk3.mlmodelc/model.mil b/Llama-3.2-3B-Instruct_chunk3.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..78594b4291dc45ae43652f9a31200581b19ad3c6
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk3.mlmodelc/model.mil
@@ -0,0 +1,956 @@
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3304.5.2"}, {"coremlc-version", "3304.6.2"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
+{
+    func main<ios16>(tensor<fp16, [128, 64]> cos, tensor<fp16, [1, 448, 1, 1024]> k_cache_0, tensor<fp16, [1, 448, 1, 1024]> k_cache_1, tensor<fp16, [1, 512, 1, 64]> mask, tensor<fp16, [128, 64]> sin, tensor<fp16, [1, 1024, 1, 448]> v_cache_0, tensor<fp16, [1, 1024, 1, 448]> v_cache_1, tensor<fp16, [1, 3072, 8, 8]> x) [CoreML_InputDefaultValues = dict<tensor<string, []>, tensor<fp32, []>>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}})] {
+            tensor<int32, []> var_13 = const()[name = tensor<string, []>("op_13"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_19 = const()[name = tensor<string, []>("op_19"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_55 = const()[name = tensor<string, []>("op_55"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_1_interleave_0 = const()[name = tensor<string, []>("x_eps_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_1_to_fp16 = const()[name = tensor<string, []>("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_1_cast_fp16 = concat(axis = var_52, interleave = x_eps_1_interleave_0, values = (x, eps_chan_1_to_fp16))[name = tensor<string, []>("x_eps_1_cast_fp16")];
+            tensor<int32, [1]> norm_x_1_axes_0 = const()[name = tensor<string, []>("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_55, x = x_eps_1_cast_fp16)[name = tensor<string, []>("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = tensor<string, []>("x_normed_1_cast_fp16")];
+            tensor<fp16, []> var_79_to_fp16 = const()[name = tensor<string, []>("op_79_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_79_to_fp16)[name = tensor<string, []>("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(256)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = tensor<string, []>("x_5_cast_fp16")];
+            tensor<int32, [4]> var_100 = const()[name = tensor<string, []>("op_100"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_1_cast_fp16 = reshape(shape = var_100, x = x_5_cast_fp16)[name = tensor<string, []>("input_1_cast_fp16")];
+            tensor<int32, [2]> var_103 = const()[name = tensor<string, []>("op_103"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_105 = const()[name = tensor<string, []>("op_105"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_1_pad_type_0 = const()[name = tensor<string, []>("q_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_1_pad_0 = const()[name = tensor<string, []>("q_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6464)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_1_cast_fp16 = conv(dilations = var_105, groups = var_52, pad = q_1_pad_0, pad_type = q_1_pad_type_0, strides = var_103, weight = blocks_0_attn_q_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("q_1_cast_fp16")];
+            tensor<int32, [2]> var_109 = const()[name = tensor<string, []>("op_109"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_111 = const()[name = tensor<string, []>("op_111"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_1_pad_type_0 = const()[name = tensor<string, []>("k_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_1_pad_0 = const()[name = tensor<string, []>("k_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18880896)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_1_cast_fp16 = conv(dilations = var_111, groups = var_52, pad = k_1_pad_0, pad_type = k_1_pad_type_0, strides = var_109, weight = blocks_0_attn_k_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("k_1_cast_fp16")];
+            tensor<int32, [2]> var_115 = const()[name = tensor<string, []>("op_115"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_117 = const()[name = tensor<string, []>("op_117"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_1_pad_type_0 = const()[name = tensor<string, []>("v_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_1_pad_0 = const()[name = tensor<string, []>("v_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25172416)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_1_cast_fp16 = conv(dilations = var_117, groups = var_52, pad = v_1_pad_0, pad_type = v_1_pad_type_0, strides = var_115, weight = blocks_0_attn_v_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("v_1_cast_fp16")];
+            tensor<int32, [4]> var_120 = const()[name = tensor<string, []>("op_120"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_3_cast_fp16 = reshape(shape = var_120, x = q_1_cast_fp16)[name = tensor<string, []>("q_3_cast_fp16")];
+            tensor<int32, [4]> var_122 = const()[name = tensor<string, []>("op_122"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_3_cast_fp16 = reshape(shape = var_122, x = k_1_cast_fp16)[name = tensor<string, []>("k_3_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = tensor<string, []>("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = tensor<string, []>("op_136_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = tensor<string, []>("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = tensor<string, []>("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = tensor<string, []>("op_142_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = tensor<string, []>("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_142_cast_fp16")];
+            tensor<fp16, []> const_10_promoted_to_fp16 = const()[name = tensor<string, []>("const_10_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_10_promoted_to_fp16)[name = tensor<string, []>("op_144_cast_fp16")];
+            tensor<bool, []> rotated_1_interleave_0 = const()[name = tensor<string, []>("rotated_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_1_cast_fp16 = concat(axis = var_17, interleave = rotated_1_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = tensor<string, []>("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_147_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = tensor<string, []>("op_147_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_148_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = tensor<string, []>("op_148_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_1_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = tensor<string, []>("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_161_begin_0 = const()[name = tensor<string, []>("op_161_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_161_end_0 = const()[name = tensor<string, []>("op_161_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_161_end_mask_0 = const()[name = tensor<string, []>("op_161_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_161_cast_fp16 = slice_by_index(begin = var_161_begin_0, end = var_161_end_0, end_mask = var_161_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_161_cast_fp16")];
+            tensor<int32, [4]> var_167_begin_0 = const()[name = tensor<string, []>("op_167_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_167_end_0 = const()[name = tensor<string, []>("op_167_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_167_end_mask_0 = const()[name = tensor<string, []>("op_167_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_167_cast_fp16 = slice_by_index(begin = var_167_begin_0, end = var_167_end_0, end_mask = var_167_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_167_cast_fp16")];
+            tensor<fp16, []> const_12_promoted_to_fp16 = const()[name = tensor<string, []>("const_12_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_169_cast_fp16 = mul(x = var_167_cast_fp16, y = const_12_promoted_to_fp16)[name = tensor<string, []>("op_169_cast_fp16")];
+            tensor<bool, []> rotated_3_interleave_0 = const()[name = tensor<string, []>("rotated_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_3_cast_fp16 = concat(axis = var_17, interleave = rotated_3_interleave_0, values = (var_169_cast_fp16, var_161_cast_fp16))[name = tensor<string, []>("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_172_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = tensor<string, []>("op_172_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_173_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = tensor<string, []>("op_173_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_3_cast_fp16 = add(x = var_172_cast_fp16, y = var_173_cast_fp16)[name = tensor<string, []>("roped_3_cast_fp16")];
+            tensor<int32, [4]> var_176 = const()[name = tensor<string, []>("op_176"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_7_cast_fp16 = reshape(shape = var_176, x = roped_3_cast_fp16)[name = tensor<string, []>("k_7_cast_fp16")];
+            tensor<int32, [4]> var_178 = const()[name = tensor<string, []>("op_178"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_0 = reshape(shape = var_178, x = v_1_cast_fp16)[name = tensor<string, []>("new_v_cache_0_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_11_interleave_0 = const()[name = tensor<string, []>("k_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_0 = transpose(perm = k_9_perm_0, x = k_7_cast_fp16)[name = tensor<string, []>("transpose_1")];
+            tensor<fp16, [1, 512, 1, 1024]> k_11_cast_fp16 = concat(axis = var_19, interleave = k_11_interleave_0, values = (k_cache_0, new_k_cache_0))[name = tensor<string, []>("k_11_cast_fp16")];
+            tensor<bool, []> v_7_interleave_0 = const()[name = tensor<string, []>("v_7_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_7_cast_fp16 = concat(axis = var_13, interleave = v_7_interleave_0, values = (v_cache_0, new_v_cache_0))[name = tensor<string, []>("v_7_cast_fp16")];
+            tensor<int32, [4]> var_186 = const()[name = tensor<string, []>("op_186"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_7_cast_fp16 = reshape(shape = var_186, x = roped_1_cast_fp16)[name = tensor<string, []>("q_7_cast_fp16")];
+            tensor<int32, [4]> var_191_begin_0 = const()[name = tensor<string, []>("op_191_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_191_end_0 = const()[name = tensor<string, []>("op_191_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_191_end_mask_0 = const()[name = tensor<string, []>("op_191_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_191_cast_fp16 = slice_by_index(begin = var_191_begin_0, end = var_191_end_0, end_mask = var_191_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_191_cast_fp16")];
+            tensor<int32, [4]> var_195_begin_0 = const()[name = tensor<string, []>("op_195_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_195_end_0 = const()[name = tensor<string, []>("op_195_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_195_end_mask_0 = const()[name = tensor<string, []>("op_195_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_195_cast_fp16 = slice_by_index(begin = var_195_begin_0, end = var_195_end_0, end_mask = var_195_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_195_cast_fp16")];
+            tensor<int32, [4]> var_199_begin_0 = const()[name = tensor<string, []>("op_199_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_199_end_0 = const()[name = tensor<string, []>("op_199_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_199_end_mask_0 = const()[name = tensor<string, []>("op_199_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_199_cast_fp16 = slice_by_index(begin = var_199_begin_0, end = var_199_end_0, end_mask = var_199_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_199_cast_fp16")];
+            tensor<int32, [4]> var_203_begin_0 = const()[name = tensor<string, []>("op_203_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_203_end_0 = const()[name = tensor<string, []>("op_203_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_203_end_mask_0 = const()[name = tensor<string, []>("op_203_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_203_cast_fp16 = slice_by_index(begin = var_203_begin_0, end = var_203_end_0, end_mask = var_203_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_203_cast_fp16")];
+            tensor<int32, [4]> var_207_begin_0 = const()[name = tensor<string, []>("op_207_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_207_end_0 = const()[name = tensor<string, []>("op_207_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_207_end_mask_0 = const()[name = tensor<string, []>("op_207_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_207_cast_fp16 = slice_by_index(begin = var_207_begin_0, end = var_207_end_0, end_mask = var_207_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_207_cast_fp16")];
+            tensor<int32, [4]> var_211_begin_0 = const()[name = tensor<string, []>("op_211_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_211_end_0 = const()[name = tensor<string, []>("op_211_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_211_end_mask_0 = const()[name = tensor<string, []>("op_211_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_211_cast_fp16 = slice_by_index(begin = var_211_begin_0, end = var_211_end_0, end_mask = var_211_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_211_cast_fp16")];
+            tensor<int32, [4]> var_215_begin_0 = const()[name = tensor<string, []>("op_215_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_215_end_0 = const()[name = tensor<string, []>("op_215_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_215_end_mask_0 = const()[name = tensor<string, []>("op_215_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_215_cast_fp16 = slice_by_index(begin = var_215_begin_0, end = var_215_end_0, end_mask = var_215_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_215_cast_fp16")];
+            tensor<int32, [4]> var_219_begin_0 = const()[name = tensor<string, []>("op_219_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_219_end_0 = const()[name = tensor<string, []>("op_219_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_219_end_mask_0 = const()[name = tensor<string, []>("op_219_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_219_cast_fp16 = slice_by_index(begin = var_219_begin_0, end = var_219_end_0, end_mask = var_219_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_219_cast_fp16")];
+            tensor<int32, [4]> var_223_begin_0 = const()[name = tensor<string, []>("op_223_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_223_end_0 = const()[name = tensor<string, []>("op_223_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_223_end_mask_0 = const()[name = tensor<string, []>("op_223_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_223_cast_fp16 = slice_by_index(begin = var_223_begin_0, end = var_223_end_0, end_mask = var_223_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_223_cast_fp16")];
+            tensor<int32, [4]> var_227_begin_0 = const()[name = tensor<string, []>("op_227_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_227_end_0 = const()[name = tensor<string, []>("op_227_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_227_end_mask_0 = const()[name = tensor<string, []>("op_227_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_227_cast_fp16 = slice_by_index(begin = var_227_begin_0, end = var_227_end_0, end_mask = var_227_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_227_cast_fp16")];
+            tensor<int32, [4]> var_231_begin_0 = const()[name = tensor<string, []>("op_231_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_231_end_0 = const()[name = tensor<string, []>("op_231_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_231_end_mask_0 = const()[name = tensor<string, []>("op_231_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_231_cast_fp16 = slice_by_index(begin = var_231_begin_0, end = var_231_end_0, end_mask = var_231_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_231_cast_fp16")];
+            tensor<int32, [4]> var_235_begin_0 = const()[name = tensor<string, []>("op_235_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_235_end_0 = const()[name = tensor<string, []>("op_235_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_235_end_mask_0 = const()[name = tensor<string, []>("op_235_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_235_cast_fp16 = slice_by_index(begin = var_235_begin_0, end = var_235_end_0, end_mask = var_235_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_235_cast_fp16")];
+            tensor<int32, [4]> var_239_begin_0 = const()[name = tensor<string, []>("op_239_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_239_end_0 = const()[name = tensor<string, []>("op_239_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_239_end_mask_0 = const()[name = tensor<string, []>("op_239_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_239_cast_fp16 = slice_by_index(begin = var_239_begin_0, end = var_239_end_0, end_mask = var_239_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_239_cast_fp16")];
+            tensor<int32, [4]> var_243_begin_0 = const()[name = tensor<string, []>("op_243_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_243_end_0 = const()[name = tensor<string, []>("op_243_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_243_end_mask_0 = const()[name = tensor<string, []>("op_243_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_243_cast_fp16 = slice_by_index(begin = var_243_begin_0, end = var_243_end_0, end_mask = var_243_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_243_cast_fp16")];
+            tensor<int32, [4]> var_247_begin_0 = const()[name = tensor<string, []>("op_247_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_247_end_0 = const()[name = tensor<string, []>("op_247_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_247_end_mask_0 = const()[name = tensor<string, []>("op_247_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_247_cast_fp16 = slice_by_index(begin = var_247_begin_0, end = var_247_end_0, end_mask = var_247_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_247_cast_fp16")];
+            tensor<int32, [4]> var_251_begin_0 = const()[name = tensor<string, []>("op_251_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_251_end_0 = const()[name = tensor<string, []>("op_251_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_251_end_mask_0 = const()[name = tensor<string, []>("op_251_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_251_cast_fp16 = slice_by_index(begin = var_251_begin_0, end = var_251_end_0, end_mask = var_251_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_251_cast_fp16")];
+            tensor<int32, [4]> var_255_begin_0 = const()[name = tensor<string, []>("op_255_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_255_end_0 = const()[name = tensor<string, []>("op_255_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_255_end_mask_0 = const()[name = tensor<string, []>("op_255_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_255_cast_fp16 = slice_by_index(begin = var_255_begin_0, end = var_255_end_0, end_mask = var_255_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_255_cast_fp16")];
+            tensor<int32, [4]> var_259_begin_0 = const()[name = tensor<string, []>("op_259_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_259_end_0 = const()[name = tensor<string, []>("op_259_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_259_end_mask_0 = const()[name = tensor<string, []>("op_259_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_259_cast_fp16 = slice_by_index(begin = var_259_begin_0, end = var_259_end_0, end_mask = var_259_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_259_cast_fp16")];
+            tensor<int32, [4]> var_263_begin_0 = const()[name = tensor<string, []>("op_263_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_263_end_0 = const()[name = tensor<string, []>("op_263_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_263_end_mask_0 = const()[name = tensor<string, []>("op_263_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_263_cast_fp16 = slice_by_index(begin = var_263_begin_0, end = var_263_end_0, end_mask = var_263_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_263_cast_fp16")];
+            tensor<int32, [4]> var_267_begin_0 = const()[name = tensor<string, []>("op_267_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_267_end_0 = const()[name = tensor<string, []>("op_267_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_267_end_mask_0 = const()[name = tensor<string, []>("op_267_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_267_cast_fp16 = slice_by_index(begin = var_267_begin_0, end = var_267_end_0, end_mask = var_267_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_267_cast_fp16")];
+            tensor<int32, [4]> var_271_begin_0 = const()[name = tensor<string, []>("op_271_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_271_end_0 = const()[name = tensor<string, []>("op_271_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_271_end_mask_0 = const()[name = tensor<string, []>("op_271_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_271_cast_fp16 = slice_by_index(begin = var_271_begin_0, end = var_271_end_0, end_mask = var_271_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_271_cast_fp16")];
+            tensor<int32, [4]> var_275_begin_0 = const()[name = tensor<string, []>("op_275_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_275_end_0 = const()[name = tensor<string, []>("op_275_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_275_end_mask_0 = const()[name = tensor<string, []>("op_275_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_275_cast_fp16 = slice_by_index(begin = var_275_begin_0, end = var_275_end_0, end_mask = var_275_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_275_cast_fp16")];
+            tensor<int32, [4]> var_279_begin_0 = const()[name = tensor<string, []>("op_279_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_279_end_0 = const()[name = tensor<string, []>("op_279_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_279_end_mask_0 = const()[name = tensor<string, []>("op_279_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_279_cast_fp16 = slice_by_index(begin = var_279_begin_0, end = var_279_end_0, end_mask = var_279_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_279_cast_fp16")];
+            tensor<int32, [4]> var_283_begin_0 = const()[name = tensor<string, []>("op_283_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_283_end_0 = const()[name = tensor<string, []>("op_283_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_283_end_mask_0 = const()[name = tensor<string, []>("op_283_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_283_cast_fp16 = slice_by_index(begin = var_283_begin_0, end = var_283_end_0, end_mask = var_283_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_283_cast_fp16")];
+            tensor<int32, [4]> var_289_begin_0 = const()[name = tensor<string, []>("op_289_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_289_end_0 = const()[name = tensor<string, []>("op_289_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_289_end_mask_0 = const()[name = tensor<string, []>("op_289_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_289_cast_fp16 = slice_by_index(begin = var_289_begin_0, end = var_289_end_0, end_mask = var_289_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_289_cast_fp16")];
+            tensor<int32, [4]> var_301_begin_0 = const()[name = tensor<string, []>("op_301_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_301_end_0 = const()[name = tensor<string, []>("op_301_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_301_end_mask_0 = const()[name = tensor<string, []>("op_301_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_301_cast_fp16 = slice_by_index(begin = var_301_begin_0, end = var_301_end_0, end_mask = var_301_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_301_cast_fp16")];
+            tensor<int32, [4]> var_313_begin_0 = const()[name = tensor<string, []>("op_313_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_313_end_0 = const()[name = tensor<string, []>("op_313_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_313_end_mask_0 = const()[name = tensor<string, []>("op_313_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_313_cast_fp16 = slice_by_index(begin = var_313_begin_0, end = var_313_end_0, end_mask = var_313_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_313_cast_fp16")];
+            tensor<int32, [4]> var_325_begin_0 = const()[name = tensor<string, []>("op_325_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_325_end_0 = const()[name = tensor<string, []>("op_325_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_325_end_mask_0 = const()[name = tensor<string, []>("op_325_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_325_cast_fp16 = slice_by_index(begin = var_325_begin_0, end = var_325_end_0, end_mask = var_325_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_325_cast_fp16")];
+            tensor<int32, [4]> var_337_begin_0 = const()[name = tensor<string, []>("op_337_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_337_end_0 = const()[name = tensor<string, []>("op_337_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_337_end_mask_0 = const()[name = tensor<string, []>("op_337_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_337_cast_fp16 = slice_by_index(begin = var_337_begin_0, end = var_337_end_0, end_mask = var_337_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_337_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = tensor<string, []>("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = tensor<string, []>("op_349_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = tensor<string, []>("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_349_cast_fp16")];
+            tensor<int32, [4]> var_361_begin_0 = const()[name = tensor<string, []>("op_361_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_361_end_0 = const()[name = tensor<string, []>("op_361_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_361_end_mask_0 = const()[name = tensor<string, []>("op_361_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_361_cast_fp16 = slice_by_index(begin = var_361_begin_0, end = var_361_end_0, end_mask = var_361_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_361_cast_fp16")];
+            tensor<int32, [4]> var_373_begin_0 = const()[name = tensor<string, []>("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_373_end_0 = const()[name = tensor<string, []>("op_373_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_373_end_mask_0 = const()[name = tensor<string, []>("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_373_cast_fp16")];
+            tensor<int32, [4]> var_383_begin_0 = const()[name = tensor<string, []>("op_383_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_383_end_0 = const()[name = tensor<string, []>("op_383_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_383_end_mask_0 = const()[name = tensor<string, []>("op_383_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_383_cast_fp16 = slice_by_index(begin = var_383_begin_0, end = var_383_end_0, end_mask = var_383_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_383_cast_fp16")];
+            tensor<int32, [4]> var_395_begin_0 = const()[name = tensor<string, []>("op_395_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_395_end_0 = const()[name = tensor<string, []>("op_395_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_395_end_mask_0 = const()[name = tensor<string, []>("op_395_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_395_cast_fp16 = slice_by_index(begin = var_395_begin_0, end = var_395_end_0, end_mask = var_395_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_395_cast_fp16")];
+            tensor<int32, [4]> var_407_begin_0 = const()[name = tensor<string, []>("op_407_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_407_end_0 = const()[name = tensor<string, []>("op_407_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_407_end_mask_0 = const()[name = tensor<string, []>("op_407_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_407_cast_fp16 = slice_by_index(begin = var_407_begin_0, end = var_407_end_0, end_mask = var_407_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_407_cast_fp16")];
+            tensor<int32, [4]> var_419_begin_0 = const()[name = tensor<string, []>("op_419_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_419_end_0 = const()[name = tensor<string, []>("op_419_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_419_end_mask_0 = const()[name = tensor<string, []>("op_419_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_419_cast_fp16 = slice_by_index(begin = var_419_begin_0, end = var_419_end_0, end_mask = var_419_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_419_cast_fp16")];
+            tensor<int32, [4]> var_431_begin_0 = const()[name = tensor<string, []>("op_431_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_431_end_0 = const()[name = tensor<string, []>("op_431_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_431_end_mask_0 = const()[name = tensor<string, []>("op_431_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_431_cast_fp16 = slice_by_index(begin = var_431_begin_0, end = var_431_end_0, end_mask = var_431_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_431_cast_fp16")];
+            tensor<int32, [4]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_443_cast_fp16 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_443_cast_fp16")];
+            tensor<int32, [4]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_455_cast_fp16 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_455_cast_fp16")];
+            tensor<int32, [4]> var_467_begin_0 = const()[name = tensor<string, []>("op_467_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_467_end_0 = const()[name = tensor<string, []>("op_467_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_467_end_mask_0 = const()[name = tensor<string, []>("op_467_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_467_cast_fp16 = slice_by_index(begin = var_467_begin_0, end = var_467_end_0, end_mask = var_467_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_467_cast_fp16")];
+            tensor<string, []> var_479_equation_0 = const()[name = tensor<string, []>("op_479_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_479_cast_fp16 = einsum(equation = var_479_equation_0, values = (var_289_cast_fp16, var_191_cast_fp16))[name = tensor<string, []>("op_479_cast_fp16")];
+            tensor<fp16, []> var_480_to_fp16 = const()[name = tensor<string, []>("op_480_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_481_cast_fp16 = mul(x = var_479_cast_fp16, y = var_480_to_fp16)[name = tensor<string, []>("op_481_cast_fp16")];
+            tensor<string, []> var_483_equation_0 = const()[name = tensor<string, []>("op_483_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_483_cast_fp16 = einsum(equation = var_483_equation_0, values = (var_289_cast_fp16, var_195_cast_fp16))[name = tensor<string, []>("op_483_cast_fp16")];
+            tensor<fp16, []> var_484_to_fp16 = const()[name = tensor<string, []>("op_484_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_485_cast_fp16 = mul(x = var_483_cast_fp16, y = var_484_to_fp16)[name = tensor<string, []>("op_485_cast_fp16")];
+            tensor<string, []> var_487_equation_0 = const()[name = tensor<string, []>("op_487_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_487_cast_fp16 = einsum(equation = var_487_equation_0, values = (var_289_cast_fp16, var_199_cast_fp16))[name = tensor<string, []>("op_487_cast_fp16")];
+            tensor<fp16, []> var_488_to_fp16 = const()[name = tensor<string, []>("op_488_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_489_cast_fp16 = mul(x = var_487_cast_fp16, y = var_488_to_fp16)[name = tensor<string, []>("op_489_cast_fp16")];
+            tensor<string, []> var_491_equation_0 = const()[name = tensor<string, []>("op_491_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_491_cast_fp16 = einsum(equation = var_491_equation_0, values = (var_301_cast_fp16, var_203_cast_fp16))[name = tensor<string, []>("op_491_cast_fp16")];
+            tensor<fp16, []> var_492_to_fp16 = const()[name = tensor<string, []>("op_492_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_493_cast_fp16 = mul(x = var_491_cast_fp16, y = var_492_to_fp16)[name = tensor<string, []>("op_493_cast_fp16")];
+            tensor<string, []> var_495_equation_0 = const()[name = tensor<string, []>("op_495_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_495_cast_fp16 = einsum(equation = var_495_equation_0, values = (var_301_cast_fp16, var_207_cast_fp16))[name = tensor<string, []>("op_495_cast_fp16")];
+            tensor<fp16, []> var_496_to_fp16 = const()[name = tensor<string, []>("op_496_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_497_cast_fp16 = mul(x = var_495_cast_fp16, y = var_496_to_fp16)[name = tensor<string, []>("op_497_cast_fp16")];
+            tensor<string, []> var_499_equation_0 = const()[name = tensor<string, []>("op_499_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_499_cast_fp16 = einsum(equation = var_499_equation_0, values = (var_301_cast_fp16, var_211_cast_fp16))[name = tensor<string, []>("op_499_cast_fp16")];
+            tensor<fp16, []> var_500_to_fp16 = const()[name = tensor<string, []>("op_500_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_501_cast_fp16 = mul(x = var_499_cast_fp16, y = var_500_to_fp16)[name = tensor<string, []>("op_501_cast_fp16")];
+            tensor<string, []> var_503_equation_0 = const()[name = tensor<string, []>("op_503_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_503_cast_fp16 = einsum(equation = var_503_equation_0, values = (var_313_cast_fp16, var_215_cast_fp16))[name = tensor<string, []>("op_503_cast_fp16")];
+            tensor<fp16, []> var_504_to_fp16 = const()[name = tensor<string, []>("op_504_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_505_cast_fp16 = mul(x = var_503_cast_fp16, y = var_504_to_fp16)[name = tensor<string, []>("op_505_cast_fp16")];
+            tensor<string, []> var_507_equation_0 = const()[name = tensor<string, []>("op_507_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_507_cast_fp16 = einsum(equation = var_507_equation_0, values = (var_313_cast_fp16, var_219_cast_fp16))[name = tensor<string, []>("op_507_cast_fp16")];
+            tensor<fp16, []> var_508_to_fp16 = const()[name = tensor<string, []>("op_508_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_509_cast_fp16 = mul(x = var_507_cast_fp16, y = var_508_to_fp16)[name = tensor<string, []>("op_509_cast_fp16")];
+            tensor<string, []> var_511_equation_0 = const()[name = tensor<string, []>("op_511_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_511_cast_fp16 = einsum(equation = var_511_equation_0, values = (var_313_cast_fp16, var_223_cast_fp16))[name = tensor<string, []>("op_511_cast_fp16")];
+            tensor<fp16, []> var_512_to_fp16 = const()[name = tensor<string, []>("op_512_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_513_cast_fp16 = mul(x = var_511_cast_fp16, y = var_512_to_fp16)[name = tensor<string, []>("op_513_cast_fp16")];
+            tensor<string, []> var_515_equation_0 = const()[name = tensor<string, []>("op_515_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_515_cast_fp16 = einsum(equation = var_515_equation_0, values = (var_325_cast_fp16, var_227_cast_fp16))[name = tensor<string, []>("op_515_cast_fp16")];
+            tensor<fp16, []> var_516_to_fp16 = const()[name = tensor<string, []>("op_516_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_517_cast_fp16 = mul(x = var_515_cast_fp16, y = var_516_to_fp16)[name = tensor<string, []>("op_517_cast_fp16")];
+            tensor<string, []> var_519_equation_0 = const()[name = tensor<string, []>("op_519_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_519_cast_fp16 = einsum(equation = var_519_equation_0, values = (var_325_cast_fp16, var_231_cast_fp16))[name = tensor<string, []>("op_519_cast_fp16")];
+            tensor<fp16, []> var_520_to_fp16 = const()[name = tensor<string, []>("op_520_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_521_cast_fp16 = mul(x = var_519_cast_fp16, y = var_520_to_fp16)[name = tensor<string, []>("op_521_cast_fp16")];
+            tensor<string, []> var_523_equation_0 = const()[name = tensor<string, []>("op_523_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_523_cast_fp16 = einsum(equation = var_523_equation_0, values = (var_325_cast_fp16, var_235_cast_fp16))[name = tensor<string, []>("op_523_cast_fp16")];
+            tensor<fp16, []> var_524_to_fp16 = const()[name = tensor<string, []>("op_524_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_525_cast_fp16 = mul(x = var_523_cast_fp16, y = var_524_to_fp16)[name = tensor<string, []>("op_525_cast_fp16")];
+            tensor<string, []> var_527_equation_0 = const()[name = tensor<string, []>("op_527_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_527_cast_fp16 = einsum(equation = var_527_equation_0, values = (var_337_cast_fp16, var_239_cast_fp16))[name = tensor<string, []>("op_527_cast_fp16")];
+            tensor<fp16, []> var_528_to_fp16 = const()[name = tensor<string, []>("op_528_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_529_cast_fp16 = mul(x = var_527_cast_fp16, y = var_528_to_fp16)[name = tensor<string, []>("op_529_cast_fp16")];
+            tensor<string, []> var_531_equation_0 = const()[name = tensor<string, []>("op_531_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_531_cast_fp16 = einsum(equation = var_531_equation_0, values = (var_337_cast_fp16, var_243_cast_fp16))[name = tensor<string, []>("op_531_cast_fp16")];
+            tensor<fp16, []> var_532_to_fp16 = const()[name = tensor<string, []>("op_532_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_533_cast_fp16 = mul(x = var_531_cast_fp16, y = var_532_to_fp16)[name = tensor<string, []>("op_533_cast_fp16")];
+            tensor<string, []> var_535_equation_0 = const()[name = tensor<string, []>("op_535_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_535_cast_fp16 = einsum(equation = var_535_equation_0, values = (var_337_cast_fp16, var_247_cast_fp16))[name = tensor<string, []>("op_535_cast_fp16")];
+            tensor<fp16, []> var_536_to_fp16 = const()[name = tensor<string, []>("op_536_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_537_cast_fp16 = mul(x = var_535_cast_fp16, y = var_536_to_fp16)[name = tensor<string, []>("op_537_cast_fp16")];
+            tensor<string, []> var_539_equation_0 = const()[name = tensor<string, []>("op_539_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_539_cast_fp16 = einsum(equation = var_539_equation_0, values = (var_349_cast_fp16, var_251_cast_fp16))[name = tensor<string, []>("op_539_cast_fp16")];
+            tensor<fp16, []> var_540_to_fp16 = const()[name = tensor<string, []>("op_540_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_541_cast_fp16 = mul(x = var_539_cast_fp16, y = var_540_to_fp16)[name = tensor<string, []>("op_541_cast_fp16")];
+            tensor<string, []> var_543_equation_0 = const()[name = tensor<string, []>("op_543_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_543_cast_fp16 = einsum(equation = var_543_equation_0, values = (var_349_cast_fp16, var_255_cast_fp16))[name = tensor<string, []>("op_543_cast_fp16")];
+            tensor<fp16, []> var_544_to_fp16 = const()[name = tensor<string, []>("op_544_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_545_cast_fp16 = mul(x = var_543_cast_fp16, y = var_544_to_fp16)[name = tensor<string, []>("op_545_cast_fp16")];
+            tensor<string, []> var_547_equation_0 = const()[name = tensor<string, []>("op_547_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_547_cast_fp16 = einsum(equation = var_547_equation_0, values = (var_349_cast_fp16, var_259_cast_fp16))[name = tensor<string, []>("op_547_cast_fp16")];
+            tensor<fp16, []> var_548_to_fp16 = const()[name = tensor<string, []>("op_548_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_549_cast_fp16 = mul(x = var_547_cast_fp16, y = var_548_to_fp16)[name = tensor<string, []>("op_549_cast_fp16")];
+            tensor<string, []> var_551_equation_0 = const()[name = tensor<string, []>("op_551_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_551_cast_fp16 = einsum(equation = var_551_equation_0, values = (var_361_cast_fp16, var_263_cast_fp16))[name = tensor<string, []>("op_551_cast_fp16")];
+            tensor<fp16, []> var_552_to_fp16 = const()[name = tensor<string, []>("op_552_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_553_cast_fp16 = mul(x = var_551_cast_fp16, y = var_552_to_fp16)[name = tensor<string, []>("op_553_cast_fp16")];
+            tensor<string, []> var_555_equation_0 = const()[name = tensor<string, []>("op_555_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_555_cast_fp16 = einsum(equation = var_555_equation_0, values = (var_361_cast_fp16, var_267_cast_fp16))[name = tensor<string, []>("op_555_cast_fp16")];
+            tensor<fp16, []> var_556_to_fp16 = const()[name = tensor<string, []>("op_556_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_557_cast_fp16 = mul(x = var_555_cast_fp16, y = var_556_to_fp16)[name = tensor<string, []>("op_557_cast_fp16")];
+            tensor<string, []> var_559_equation_0 = const()[name = tensor<string, []>("op_559_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_559_cast_fp16 = einsum(equation = var_559_equation_0, values = (var_361_cast_fp16, var_271_cast_fp16))[name = tensor<string, []>("op_559_cast_fp16")];
+            tensor<fp16, []> var_560_to_fp16 = const()[name = tensor<string, []>("op_560_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_561_cast_fp16 = mul(x = var_559_cast_fp16, y = var_560_to_fp16)[name = tensor<string, []>("op_561_cast_fp16")];
+            tensor<string, []> var_563_equation_0 = const()[name = tensor<string, []>("op_563_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_563_cast_fp16 = einsum(equation = var_563_equation_0, values = (var_373_cast_fp16, var_275_cast_fp16))[name = tensor<string, []>("op_563_cast_fp16")];
+            tensor<fp16, []> var_564_to_fp16 = const()[name = tensor<string, []>("op_564_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_565_cast_fp16 = mul(x = var_563_cast_fp16, y = var_564_to_fp16)[name = tensor<string, []>("op_565_cast_fp16")];
+            tensor<string, []> var_567_equation_0 = const()[name = tensor<string, []>("op_567_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_567_cast_fp16 = einsum(equation = var_567_equation_0, values = (var_373_cast_fp16, var_279_cast_fp16))[name = tensor<string, []>("op_567_cast_fp16")];
+            tensor<fp16, []> var_568_to_fp16 = const()[name = tensor<string, []>("op_568_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_569_cast_fp16 = mul(x = var_567_cast_fp16, y = var_568_to_fp16)[name = tensor<string, []>("op_569_cast_fp16")];
+            tensor<string, []> var_571_equation_0 = const()[name = tensor<string, []>("op_571_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_571_cast_fp16 = einsum(equation = var_571_equation_0, values = (var_373_cast_fp16, var_283_cast_fp16))[name = tensor<string, []>("op_571_cast_fp16")];
+            tensor<fp16, []> var_572_to_fp16 = const()[name = tensor<string, []>("op_572_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_573_cast_fp16 = mul(x = var_571_cast_fp16, y = var_572_to_fp16)[name = tensor<string, []>("op_573_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_1_cast_fp16 = add(x = var_481_cast_fp16, y = mask)[name = tensor<string, []>("aw_1_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_3_cast_fp16 = add(x = var_485_cast_fp16, y = mask)[name = tensor<string, []>("aw_3_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_5_cast_fp16 = add(x = var_489_cast_fp16, y = mask)[name = tensor<string, []>("aw_5_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_7_cast_fp16 = add(x = var_493_cast_fp16, y = mask)[name = tensor<string, []>("aw_7_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_9_cast_fp16 = add(x = var_497_cast_fp16, y = mask)[name = tensor<string, []>("aw_9_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_11_cast_fp16 = add(x = var_501_cast_fp16, y = mask)[name = tensor<string, []>("aw_11_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_13_cast_fp16 = add(x = var_505_cast_fp16, y = mask)[name = tensor<string, []>("aw_13_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_15_cast_fp16 = add(x = var_509_cast_fp16, y = mask)[name = tensor<string, []>("aw_15_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_17_cast_fp16 = add(x = var_513_cast_fp16, y = mask)[name = tensor<string, []>("aw_17_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_19_cast_fp16 = add(x = var_517_cast_fp16, y = mask)[name = tensor<string, []>("aw_19_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_21_cast_fp16 = add(x = var_521_cast_fp16, y = mask)[name = tensor<string, []>("aw_21_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_23_cast_fp16 = add(x = var_525_cast_fp16, y = mask)[name = tensor<string, []>("aw_23_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_25_cast_fp16 = add(x = var_529_cast_fp16, y = mask)[name = tensor<string, []>("aw_25_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_27_cast_fp16 = add(x = var_533_cast_fp16, y = mask)[name = tensor<string, []>("aw_27_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_29_cast_fp16 = add(x = var_537_cast_fp16, y = mask)[name = tensor<string, []>("aw_29_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_31_cast_fp16 = add(x = var_541_cast_fp16, y = mask)[name = tensor<string, []>("aw_31_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_33_cast_fp16 = add(x = var_545_cast_fp16, y = mask)[name = tensor<string, []>("aw_33_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_35_cast_fp16 = add(x = var_549_cast_fp16, y = mask)[name = tensor<string, []>("aw_35_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_37_cast_fp16 = add(x = var_553_cast_fp16, y = mask)[name = tensor<string, []>("aw_37_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_39_cast_fp16 = add(x = var_557_cast_fp16, y = mask)[name = tensor<string, []>("aw_39_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_41_cast_fp16 = add(x = var_561_cast_fp16, y = mask)[name = tensor<string, []>("aw_41_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_43_cast_fp16 = add(x = var_565_cast_fp16, y = mask)[name = tensor<string, []>("aw_43_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_45_cast_fp16 = add(x = var_569_cast_fp16, y = mask)[name = tensor<string, []>("aw_45_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_47_cast_fp16 = add(x = var_573_cast_fp16, y = mask)[name = tensor<string, []>("aw_47_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_598_cast_fp16 = softmax(axis = var_52, x = aw_1_cast_fp16)[name = tensor<string, []>("op_598_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_599_cast_fp16 = softmax(axis = var_52, x = aw_3_cast_fp16)[name = tensor<string, []>("op_599_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_600_cast_fp16 = softmax(axis = var_52, x = aw_5_cast_fp16)[name = tensor<string, []>("op_600_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_601_cast_fp16 = softmax(axis = var_52, x = aw_7_cast_fp16)[name = tensor<string, []>("op_601_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_602_cast_fp16 = softmax(axis = var_52, x = aw_9_cast_fp16)[name = tensor<string, []>("op_602_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_603_cast_fp16 = softmax(axis = var_52, x = aw_11_cast_fp16)[name = tensor<string, []>("op_603_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_604_cast_fp16 = softmax(axis = var_52, x = aw_13_cast_fp16)[name = tensor<string, []>("op_604_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_605_cast_fp16 = softmax(axis = var_52, x = aw_15_cast_fp16)[name = tensor<string, []>("op_605_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_606_cast_fp16 = softmax(axis = var_52, x = aw_17_cast_fp16)[name = tensor<string, []>("op_606_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_607_cast_fp16 = softmax(axis = var_52, x = aw_19_cast_fp16)[name = tensor<string, []>("op_607_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_608_cast_fp16 = softmax(axis = var_52, x = aw_21_cast_fp16)[name = tensor<string, []>("op_608_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_609_cast_fp16 = softmax(axis = var_52, x = aw_23_cast_fp16)[name = tensor<string, []>("op_609_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_610_cast_fp16 = softmax(axis = var_52, x = aw_25_cast_fp16)[name = tensor<string, []>("op_610_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_611_cast_fp16 = softmax(axis = var_52, x = aw_27_cast_fp16)[name = tensor<string, []>("op_611_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_612_cast_fp16 = softmax(axis = var_52, x = aw_29_cast_fp16)[name = tensor<string, []>("op_612_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_613_cast_fp16 = softmax(axis = var_52, x = aw_31_cast_fp16)[name = tensor<string, []>("op_613_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_614_cast_fp16 = softmax(axis = var_52, x = aw_33_cast_fp16)[name = tensor<string, []>("op_614_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_615_cast_fp16 = softmax(axis = var_52, x = aw_35_cast_fp16)[name = tensor<string, []>("op_615_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_616_cast_fp16 = softmax(axis = var_52, x = aw_37_cast_fp16)[name = tensor<string, []>("op_616_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_617_cast_fp16 = softmax(axis = var_52, x = aw_39_cast_fp16)[name = tensor<string, []>("op_617_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_618_cast_fp16 = softmax(axis = var_52, x = aw_41_cast_fp16)[name = tensor<string, []>("op_618_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_619_cast_fp16 = softmax(axis = var_52, x = aw_43_cast_fp16)[name = tensor<string, []>("op_619_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_620_cast_fp16 = softmax(axis = var_52, x = aw_45_cast_fp16)[name = tensor<string, []>("op_620_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_621_cast_fp16 = softmax(axis = var_52, x = aw_47_cast_fp16)[name = tensor<string, []>("op_621_cast_fp16")];
+            tensor<string, []> var_623_equation_0 = const()[name = tensor<string, []>("op_623_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_623_cast_fp16 = einsum(equation = var_623_equation_0, values = (var_383_cast_fp16, var_598_cast_fp16))[name = tensor<string, []>("op_623_cast_fp16")];
+            tensor<string, []> var_625_equation_0 = const()[name = tensor<string, []>("op_625_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_625_cast_fp16 = einsum(equation = var_625_equation_0, values = (var_383_cast_fp16, var_599_cast_fp16))[name = tensor<string, []>("op_625_cast_fp16")];
+            tensor<string, []> var_627_equation_0 = const()[name = tensor<string, []>("op_627_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_627_cast_fp16 = einsum(equation = var_627_equation_0, values = (var_383_cast_fp16, var_600_cast_fp16))[name = tensor<string, []>("op_627_cast_fp16")];
+            tensor<string, []> var_629_equation_0 = const()[name = tensor<string, []>("op_629_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_629_cast_fp16 = einsum(equation = var_629_equation_0, values = (var_395_cast_fp16, var_601_cast_fp16))[name = tensor<string, []>("op_629_cast_fp16")];
+            tensor<string, []> var_631_equation_0 = const()[name = tensor<string, []>("op_631_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_631_cast_fp16 = einsum(equation = var_631_equation_0, values = (var_395_cast_fp16, var_602_cast_fp16))[name = tensor<string, []>("op_631_cast_fp16")];
+            tensor<string, []> var_633_equation_0 = const()[name = tensor<string, []>("op_633_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_633_cast_fp16 = einsum(equation = var_633_equation_0, values = (var_395_cast_fp16, var_603_cast_fp16))[name = tensor<string, []>("op_633_cast_fp16")];
+            tensor<string, []> var_635_equation_0 = const()[name = tensor<string, []>("op_635_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_635_cast_fp16 = einsum(equation = var_635_equation_0, values = (var_407_cast_fp16, var_604_cast_fp16))[name = tensor<string, []>("op_635_cast_fp16")];
+            tensor<string, []> var_637_equation_0 = const()[name = tensor<string, []>("op_637_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_637_cast_fp16 = einsum(equation = var_637_equation_0, values = (var_407_cast_fp16, var_605_cast_fp16))[name = tensor<string, []>("op_637_cast_fp16")];
+            tensor<string, []> var_639_equation_0 = const()[name = tensor<string, []>("op_639_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_639_cast_fp16 = einsum(equation = var_639_equation_0, values = (var_407_cast_fp16, var_606_cast_fp16))[name = tensor<string, []>("op_639_cast_fp16")];
+            tensor<string, []> var_641_equation_0 = const()[name = tensor<string, []>("op_641_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_641_cast_fp16 = einsum(equation = var_641_equation_0, values = (var_419_cast_fp16, var_607_cast_fp16))[name = tensor<string, []>("op_641_cast_fp16")];
+            tensor<string, []> var_643_equation_0 = const()[name = tensor<string, []>("op_643_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_643_cast_fp16 = einsum(equation = var_643_equation_0, values = (var_419_cast_fp16, var_608_cast_fp16))[name = tensor<string, []>("op_643_cast_fp16")];
+            tensor<string, []> var_645_equation_0 = const()[name = tensor<string, []>("op_645_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_645_cast_fp16 = einsum(equation = var_645_equation_0, values = (var_419_cast_fp16, var_609_cast_fp16))[name = tensor<string, []>("op_645_cast_fp16")];
+            tensor<string, []> var_647_equation_0 = const()[name = tensor<string, []>("op_647_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_647_cast_fp16 = einsum(equation = var_647_equation_0, values = (var_431_cast_fp16, var_610_cast_fp16))[name = tensor<string, []>("op_647_cast_fp16")];
+            tensor<string, []> var_649_equation_0 = const()[name = tensor<string, []>("op_649_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_649_cast_fp16 = einsum(equation = var_649_equation_0, values = (var_431_cast_fp16, var_611_cast_fp16))[name = tensor<string, []>("op_649_cast_fp16")];
+            tensor<string, []> var_651_equation_0 = const()[name = tensor<string, []>("op_651_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_651_cast_fp16 = einsum(equation = var_651_equation_0, values = (var_431_cast_fp16, var_612_cast_fp16))[name = tensor<string, []>("op_651_cast_fp16")];
+            tensor<string, []> var_653_equation_0 = const()[name = tensor<string, []>("op_653_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_653_cast_fp16 = einsum(equation = var_653_equation_0, values = (var_443_cast_fp16, var_613_cast_fp16))[name = tensor<string, []>("op_653_cast_fp16")];
+            tensor<string, []> var_655_equation_0 = const()[name = tensor<string, []>("op_655_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_655_cast_fp16 = einsum(equation = var_655_equation_0, values = (var_443_cast_fp16, var_614_cast_fp16))[name = tensor<string, []>("op_655_cast_fp16")];
+            tensor<string, []> var_657_equation_0 = const()[name = tensor<string, []>("op_657_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_657_cast_fp16 = einsum(equation = var_657_equation_0, values = (var_443_cast_fp16, var_615_cast_fp16))[name = tensor<string, []>("op_657_cast_fp16")];
+            tensor<string, []> var_659_equation_0 = const()[name = tensor<string, []>("op_659_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_659_cast_fp16 = einsum(equation = var_659_equation_0, values = (var_455_cast_fp16, var_616_cast_fp16))[name = tensor<string, []>("op_659_cast_fp16")];
+            tensor<string, []> var_661_equation_0 = const()[name = tensor<string, []>("op_661_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_661_cast_fp16 = einsum(equation = var_661_equation_0, values = (var_455_cast_fp16, var_617_cast_fp16))[name = tensor<string, []>("op_661_cast_fp16")];
+            tensor<string, []> var_663_equation_0 = const()[name = tensor<string, []>("op_663_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_663_cast_fp16 = einsum(equation = var_663_equation_0, values = (var_455_cast_fp16, var_618_cast_fp16))[name = tensor<string, []>("op_663_cast_fp16")];
+            tensor<string, []> var_665_equation_0 = const()[name = tensor<string, []>("op_665_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_665_cast_fp16 = einsum(equation = var_665_equation_0, values = (var_467_cast_fp16, var_619_cast_fp16))[name = tensor<string, []>("op_665_cast_fp16")];
+            tensor<string, []> var_667_equation_0 = const()[name = tensor<string, []>("op_667_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_667_cast_fp16 = einsum(equation = var_667_equation_0, values = (var_467_cast_fp16, var_620_cast_fp16))[name = tensor<string, []>("op_667_cast_fp16")];
+            tensor<string, []> var_669_equation_0 = const()[name = tensor<string, []>("op_669_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_669_cast_fp16 = einsum(equation = var_669_equation_0, values = (var_467_cast_fp16, var_621_cast_fp16))[name = tensor<string, []>("op_669_cast_fp16")];
+            tensor<bool, []> x_11_interleave_0 = const()[name = tensor<string, []>("x_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_11_cast_fp16 = concat(axis = var_52, interleave = x_11_interleave_0, values = (var_623_cast_fp16, var_625_cast_fp16, var_627_cast_fp16, var_629_cast_fp16, var_631_cast_fp16, var_633_cast_fp16, var_635_cast_fp16, var_637_cast_fp16, var_639_cast_fp16, var_641_cast_fp16, var_643_cast_fp16, var_645_cast_fp16, var_647_cast_fp16, var_649_cast_fp16, var_651_cast_fp16, var_653_cast_fp16, var_655_cast_fp16, var_657_cast_fp16, var_659_cast_fp16, var_661_cast_fp16, var_663_cast_fp16, var_665_cast_fp16, var_667_cast_fp16, var_669_cast_fp16))[name = tensor<string, []>("x_11_cast_fp16")];
+            tensor<int32, [4]> var_674 = const()[name = tensor<string, []>("op_674"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_3_cast_fp16 = reshape(shape = var_674, x = x_11_cast_fp16)[name = tensor<string, []>("input_3_cast_fp16")];
+            tensor<int32, [2]> var_677 = const()[name = tensor<string, []>("op_677"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_679 = const()[name = tensor<string, []>("op_679"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_1_pad_type_0 = const()[name = tensor<string, []>("attention_output_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_1_pad_0 = const()[name = tensor<string, []>("attention_output_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31463936)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_1_cast_fp16 = conv(dilations = var_679, groups = var_52, pad = attention_output_1_pad_0, pad_type = attention_output_1_pad_type_0, strides = var_677, weight = blocks_0_attn_proj_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_13_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = tensor<string, []>("x_13_cast_fp16")];
+            tensor<bool, []> x_eps_3_interleave_0 = const()[name = tensor<string, []>("x_eps_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_3_to_fp16 = const()[name = tensor<string, []>("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338368)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_3_cast_fp16 = concat(axis = var_52, interleave = x_eps_3_interleave_0, values = (x_13_cast_fp16, eps_chan_3_to_fp16))[name = tensor<string, []>("x_eps_3_cast_fp16")];
+            tensor<int32, [1]> norm_x_3_axes_0 = const()[name = tensor<string, []>("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_55, x = x_eps_3_cast_fp16)[name = tensor<string, []>("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_7_cast_fp16 = real_div(x = x_13_cast_fp16, y = norm_x_3_cast_fp16)[name = tensor<string, []>("x_normed_7_cast_fp16")];
+            tensor<fp16, []> var_705_to_fp16 = const()[name = tensor<string, []>("op_705_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_705_to_fp16)[name = tensor<string, []>("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338560)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_5_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
+            tensor<int32, [2]> var_716 = const()[name = tensor<string, []>("op_716"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_718 = const()[name = tensor<string, []>("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_7_pad_type_0 = const()[name = tensor<string, []>("input_7_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_7_pad_0 = const()[name = tensor<string, []>("input_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50344768)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_7_cast_fp16 = conv(dilations = var_718, groups = var_52, pad = input_7_pad_0, pad_type = input_7_pad_type_0, strides = var_716, weight = blocks_0_mlp_fc_1_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
+            tensor<int32, [2]> var_722 = const()[name = tensor<string, []>("op_722"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_724 = const()[name = tensor<string, []>("op_724"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_1_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_1_pad_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(100676480)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_1_cast_fp16 = conv(dilations = var_724, groups = var_52, pad = x_fc_2_1_pad_0, pad_type = x_fc_2_1_pad_type_0, strides = var_722, weight = blocks_0_mlp_fc_2_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_727_cast_fp16 = silu(x = input_7_cast_fp16)[name = tensor<string, []>("op_727_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_9_cast_fp16 = mul(x = var_727_cast_fp16, y = x_fc_2_1_cast_fp16)[name = tensor<string, []>("input_9_cast_fp16")];
+            tensor<int32, [2]> var_730 = const()[name = tensor<string, []>("op_730"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_732 = const()[name = tensor<string, []>("op_732"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_734_pad_type_0 = const()[name = tensor<string, []>("op_734_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_734_pad_0 = const()[name = tensor<string, []>("op_734_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_0_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151008192)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_734_cast_fp16 = conv(dilations = var_732, groups = var_52, pad = var_734_pad_0, pad_type = var_734_pad_type_0, strides = var_730, weight = blocks_0_mlp_proj_weight_to_fp16, x = input_9_cast_fp16)[name = tensor<string, []>("op_734_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_17_cast_fp16 = add(x = var_734_cast_fp16, y = x_13_cast_fp16)[name = tensor<string, []>("x_17_cast_fp16")];
+            tensor<int32, []> var_740 = const()[name = tensor<string, []>("op_740"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_744 = const()[name = tensor<string, []>("op_744"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_746 = const()[name = tensor<string, []>("op_746"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_779 = const()[name = tensor<string, []>("op_779"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_782 = const()[name = tensor<string, []>("op_782"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_5_interleave_0 = const()[name = tensor<string, []>("x_eps_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_5_to_fp16 = const()[name = tensor<string, []>("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201339904)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_5_cast_fp16 = concat(axis = var_779, interleave = x_eps_5_interleave_0, values = (x_17_cast_fp16, eps_chan_5_to_fp16))[name = tensor<string, []>("x_eps_5_cast_fp16")];
+            tensor<int32, [1]> norm_x_5_axes_0 = const()[name = tensor<string, []>("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_782, x = x_eps_5_cast_fp16)[name = tensor<string, []>("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_13_cast_fp16 = real_div(x = x_17_cast_fp16, y = norm_x_5_cast_fp16)[name = tensor<string, []>("x_normed_13_cast_fp16")];
+            tensor<fp16, []> var_805_to_fp16 = const()[name = tensor<string, []>("op_805_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_805_to_fp16)[name = tensor<string, []>("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201340096)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_21_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = tensor<string, []>("x_21_cast_fp16")];
+            tensor<int32, [4]> var_829 = const()[name = tensor<string, []>("op_829"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_11_cast_fp16 = reshape(shape = var_829, x = x_21_cast_fp16)[name = tensor<string, []>("input_11_cast_fp16")];
+            tensor<int32, [2]> var_832 = const()[name = tensor<string, []>("op_832"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_834 = const()[name = tensor<string, []>("op_834"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_9_pad_type_0 = const()[name = tensor<string, []>("q_9_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_9_pad_0 = const()[name = tensor<string, []>("q_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201346304)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_9_cast_fp16 = conv(dilations = var_834, groups = var_779, pad = q_9_pad_0, pad_type = q_9_pad_type_0, strides = var_832, weight = blocks_1_attn_q_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("q_9_cast_fp16")];
+            tensor<int32, [2]> var_838 = const()[name = tensor<string, []>("op_838"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_840 = const()[name = tensor<string, []>("op_840"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_13_pad_type_0 = const()[name = tensor<string, []>("k_13_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_13_pad_0 = const()[name = tensor<string, []>("k_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(220220736)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_13_cast_fp16 = conv(dilations = var_840, groups = var_779, pad = k_13_pad_0, pad_type = k_13_pad_type_0, strides = var_838, weight = blocks_1_attn_k_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("k_13_cast_fp16")];
+            tensor<int32, [2]> var_844 = const()[name = tensor<string, []>("op_844"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_846 = const()[name = tensor<string, []>("op_846"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_11_pad_type_0 = const()[name = tensor<string, []>("v_11_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_11_pad_0 = const()[name = tensor<string, []>("v_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226512256)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_11_cast_fp16 = conv(dilations = var_846, groups = var_779, pad = v_11_pad_0, pad_type = v_11_pad_type_0, strides = var_844, weight = blocks_1_attn_v_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("v_11_cast_fp16")];
+            tensor<int32, [4]> var_849 = const()[name = tensor<string, []>("op_849"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_11_cast_fp16 = reshape(shape = var_849, x = q_9_cast_fp16)[name = tensor<string, []>("q_11_cast_fp16")];
+            tensor<int32, [4]> var_851 = const()[name = tensor<string, []>("op_851"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_15_cast_fp16 = reshape(shape = var_851, x = k_13_cast_fp16)[name = tensor<string, []>("k_15_cast_fp16")];
+            tensor<int32, [4]> var_865_begin_0 = const()[name = tensor<string, []>("op_865_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_865_end_0 = const()[name = tensor<string, []>("op_865_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_865_end_mask_0 = const()[name = tensor<string, []>("op_865_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_865_cast_fp16 = slice_by_index(begin = var_865_begin_0, end = var_865_end_0, end_mask = var_865_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_865_cast_fp16")];
+            tensor<int32, [4]> var_871_begin_0 = const()[name = tensor<string, []>("op_871_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_871_end_0 = const()[name = tensor<string, []>("op_871_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_871_end_mask_0 = const()[name = tensor<string, []>("op_871_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_871_cast_fp16 = slice_by_index(begin = var_871_begin_0, end = var_871_end_0, end_mask = var_871_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_871_cast_fp16")];
+            tensor<fp16, []> const_30_promoted_to_fp16 = const()[name = tensor<string, []>("const_30_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_873_cast_fp16 = mul(x = var_871_cast_fp16, y = const_30_promoted_to_fp16)[name = tensor<string, []>("op_873_cast_fp16")];
+            tensor<bool, []> rotated_5_interleave_0 = const()[name = tensor<string, []>("rotated_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_5_cast_fp16 = concat(axis = var_744, interleave = rotated_5_interleave_0, values = (var_873_cast_fp16, var_865_cast_fp16))[name = tensor<string, []>("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_876_cast_fp16 = mul(x = q_11_cast_fp16, y = cos)[name = tensor<string, []>("op_876_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_877_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = tensor<string, []>("op_877_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_5_cast_fp16 = add(x = var_876_cast_fp16, y = var_877_cast_fp16)[name = tensor<string, []>("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_890_begin_0 = const()[name = tensor<string, []>("op_890_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_890_end_0 = const()[name = tensor<string, []>("op_890_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_890_end_mask_0 = const()[name = tensor<string, []>("op_890_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_890_cast_fp16 = slice_by_index(begin = var_890_begin_0, end = var_890_end_0, end_mask = var_890_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_890_cast_fp16")];
+            tensor<int32, [4]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_896_cast_fp16 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_896_cast_fp16")];
+            tensor<fp16, []> const_32_promoted_to_fp16 = const()[name = tensor<string, []>("const_32_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_898_cast_fp16 = mul(x = var_896_cast_fp16, y = const_32_promoted_to_fp16)[name = tensor<string, []>("op_898_cast_fp16")];
+            tensor<bool, []> rotated_interleave_0 = const()[name = tensor<string, []>("rotated_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_cast_fp16 = concat(axis = var_744, interleave = rotated_interleave_0, values = (var_898_cast_fp16, var_890_cast_fp16))[name = tensor<string, []>("rotated_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_901_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = tensor<string, []>("op_901_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_902_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = tensor<string, []>("op_902_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_cast_fp16 = add(x = var_901_cast_fp16, y = var_902_cast_fp16)[name = tensor<string, []>("roped_cast_fp16")];
+            tensor<int32, [4]> var_905 = const()[name = tensor<string, []>("op_905"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_19_cast_fp16 = reshape(shape = var_905, x = roped_cast_fp16)[name = tensor<string, []>("k_19_cast_fp16")];
+            tensor<int32, [4]> var_907 = const()[name = tensor<string, []>("op_907"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_1 = reshape(shape = var_907, x = v_11_cast_fp16)[name = tensor<string, []>("new_v_cache_1_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_interleave_0 = const()[name = tensor<string, []>("k_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_1 = transpose(perm = k_21_perm_0, x = k_19_cast_fp16)[name = tensor<string, []>("transpose_0")];
+            tensor<fp16, [1, 512, 1, 1024]> k_cast_fp16 = concat(axis = var_746, interleave = k_interleave_0, values = (k_cache_1, new_k_cache_1))[name = tensor<string, []>("k_cast_fp16")];
+            tensor<bool, []> v_17_interleave_0 = const()[name = tensor<string, []>("v_17_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_17_cast_fp16 = concat(axis = var_740, interleave = v_17_interleave_0, values = (v_cache_1, new_v_cache_1))[name = tensor<string, []>("v_17_cast_fp16")];
+            tensor<int32, [4]> var_915 = const()[name = tensor<string, []>("op_915"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_cast_fp16 = reshape(shape = var_915, x = roped_5_cast_fp16)[name = tensor<string, []>("q_cast_fp16")];
+            tensor<int32, [4]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_920_cast_fp16 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_920_cast_fp16")];
+            tensor<int32, [4]> var_924_begin_0 = const()[name = tensor<string, []>("op_924_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_924_end_0 = const()[name = tensor<string, []>("op_924_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_924_end_mask_0 = const()[name = tensor<string, []>("op_924_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_924_cast_fp16 = slice_by_index(begin = var_924_begin_0, end = var_924_end_0, end_mask = var_924_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_924_cast_fp16")];
+            tensor<int32, [4]> var_928_begin_0 = const()[name = tensor<string, []>("op_928_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_928_end_0 = const()[name = tensor<string, []>("op_928_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_928_end_mask_0 = const()[name = tensor<string, []>("op_928_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_928_cast_fp16 = slice_by_index(begin = var_928_begin_0, end = var_928_end_0, end_mask = var_928_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_928_cast_fp16")];
+            tensor<int32, [4]> var_932_begin_0 = const()[name = tensor<string, []>("op_932_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_932_end_0 = const()[name = tensor<string, []>("op_932_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_932_end_mask_0 = const()[name = tensor<string, []>("op_932_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_932_cast_fp16 = slice_by_index(begin = var_932_begin_0, end = var_932_end_0, end_mask = var_932_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_932_cast_fp16")];
+            tensor<int32, [4]> var_936_begin_0 = const()[name = tensor<string, []>("op_936_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_936_end_0 = const()[name = tensor<string, []>("op_936_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_936_end_mask_0 = const()[name = tensor<string, []>("op_936_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_936_cast_fp16 = slice_by_index(begin = var_936_begin_0, end = var_936_end_0, end_mask = var_936_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_936_cast_fp16")];
+            tensor<int32, [4]> var_940_begin_0 = const()[name = tensor<string, []>("op_940_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_940_end_0 = const()[name = tensor<string, []>("op_940_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_940_end_mask_0 = const()[name = tensor<string, []>("op_940_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_940_cast_fp16 = slice_by_index(begin = var_940_begin_0, end = var_940_end_0, end_mask = var_940_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_940_cast_fp16")];
+            tensor<int32, [4]> var_944_begin_0 = const()[name = tensor<string, []>("op_944_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_944_end_0 = const()[name = tensor<string, []>("op_944_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_944_end_mask_0 = const()[name = tensor<string, []>("op_944_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_944_cast_fp16 = slice_by_index(begin = var_944_begin_0, end = var_944_end_0, end_mask = var_944_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_944_cast_fp16")];
+            tensor<int32, [4]> var_948_begin_0 = const()[name = tensor<string, []>("op_948_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_948_end_0 = const()[name = tensor<string, []>("op_948_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_948_end_mask_0 = const()[name = tensor<string, []>("op_948_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_948_cast_fp16 = slice_by_index(begin = var_948_begin_0, end = var_948_end_0, end_mask = var_948_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_948_cast_fp16")];
+            tensor<int32, [4]> var_952_begin_0 = const()[name = tensor<string, []>("op_952_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_952_end_0 = const()[name = tensor<string, []>("op_952_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_952_end_mask_0 = const()[name = tensor<string, []>("op_952_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_952_cast_fp16 = slice_by_index(begin = var_952_begin_0, end = var_952_end_0, end_mask = var_952_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_952_cast_fp16")];
+            tensor<int32, [4]> var_956_begin_0 = const()[name = tensor<string, []>("op_956_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_956_end_0 = const()[name = tensor<string, []>("op_956_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_956_end_mask_0 = const()[name = tensor<string, []>("op_956_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_956_cast_fp16 = slice_by_index(begin = var_956_begin_0, end = var_956_end_0, end_mask = var_956_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_956_cast_fp16")];
+            tensor<int32, [4]> var_960_begin_0 = const()[name = tensor<string, []>("op_960_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_960_end_0 = const()[name = tensor<string, []>("op_960_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_960_end_mask_0 = const()[name = tensor<string, []>("op_960_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_960_cast_fp16 = slice_by_index(begin = var_960_begin_0, end = var_960_end_0, end_mask = var_960_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_960_cast_fp16")];
+            tensor<int32, [4]> var_964_begin_0 = const()[name = tensor<string, []>("op_964_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_964_end_0 = const()[name = tensor<string, []>("op_964_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_964_end_mask_0 = const()[name = tensor<string, []>("op_964_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_964_cast_fp16 = slice_by_index(begin = var_964_begin_0, end = var_964_end_0, end_mask = var_964_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_964_cast_fp16")];
+            tensor<int32, [4]> var_968_begin_0 = const()[name = tensor<string, []>("op_968_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_968_end_0 = const()[name = tensor<string, []>("op_968_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_968_end_mask_0 = const()[name = tensor<string, []>("op_968_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_968_cast_fp16 = slice_by_index(begin = var_968_begin_0, end = var_968_end_0, end_mask = var_968_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_968_cast_fp16")];
+            tensor<int32, [4]> var_972_begin_0 = const()[name = tensor<string, []>("op_972_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_972_end_0 = const()[name = tensor<string, []>("op_972_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_972_end_mask_0 = const()[name = tensor<string, []>("op_972_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_972_cast_fp16 = slice_by_index(begin = var_972_begin_0, end = var_972_end_0, end_mask = var_972_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_972_cast_fp16")];
+            tensor<int32, [4]> var_976_begin_0 = const()[name = tensor<string, []>("op_976_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_976_end_0 = const()[name = tensor<string, []>("op_976_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_976_end_mask_0 = const()[name = tensor<string, []>("op_976_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_976_cast_fp16 = slice_by_index(begin = var_976_begin_0, end = var_976_end_0, end_mask = var_976_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_976_cast_fp16")];
+            tensor<int32, [4]> var_980_begin_0 = const()[name = tensor<string, []>("op_980_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_980_end_0 = const()[name = tensor<string, []>("op_980_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_980_end_mask_0 = const()[name = tensor<string, []>("op_980_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_980_cast_fp16 = slice_by_index(begin = var_980_begin_0, end = var_980_end_0, end_mask = var_980_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_980_cast_fp16")];
+            tensor<int32, [4]> var_984_begin_0 = const()[name = tensor<string, []>("op_984_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_984_end_0 = const()[name = tensor<string, []>("op_984_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_984_end_mask_0 = const()[name = tensor<string, []>("op_984_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_984_cast_fp16 = slice_by_index(begin = var_984_begin_0, end = var_984_end_0, end_mask = var_984_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_984_cast_fp16")];
+            tensor<int32, [4]> var_988_begin_0 = const()[name = tensor<string, []>("op_988_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_988_end_0 = const()[name = tensor<string, []>("op_988_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_988_end_mask_0 = const()[name = tensor<string, []>("op_988_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_988_cast_fp16 = slice_by_index(begin = var_988_begin_0, end = var_988_end_0, end_mask = var_988_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_988_cast_fp16")];
+            tensor<int32, [4]> var_992_begin_0 = const()[name = tensor<string, []>("op_992_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_992_end_0 = const()[name = tensor<string, []>("op_992_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_992_end_mask_0 = const()[name = tensor<string, []>("op_992_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_992_cast_fp16 = slice_by_index(begin = var_992_begin_0, end = var_992_end_0, end_mask = var_992_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_992_cast_fp16")];
+            tensor<int32, [4]> var_996_begin_0 = const()[name = tensor<string, []>("op_996_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_996_end_0 = const()[name = tensor<string, []>("op_996_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_996_end_mask_0 = const()[name = tensor<string, []>("op_996_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_996_cast_fp16 = slice_by_index(begin = var_996_begin_0, end = var_996_end_0, end_mask = var_996_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_996_cast_fp16")];
+            tensor<int32, [4]> var_1000_begin_0 = const()[name = tensor<string, []>("op_1000_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_1000_end_0 = const()[name = tensor<string, []>("op_1000_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_1000_end_mask_0 = const()[name = tensor<string, []>("op_1000_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1000_cast_fp16 = slice_by_index(begin = var_1000_begin_0, end = var_1000_end_0, end_mask = var_1000_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1000_cast_fp16")];
+            tensor<int32, [4]> var_1004_begin_0 = const()[name = tensor<string, []>("op_1004_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_1004_end_0 = const()[name = tensor<string, []>("op_1004_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_1004_end_mask_0 = const()[name = tensor<string, []>("op_1004_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1004_cast_fp16 = slice_by_index(begin = var_1004_begin_0, end = var_1004_end_0, end_mask = var_1004_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1004_cast_fp16")];
+            tensor<int32, [4]> var_1008_begin_0 = const()[name = tensor<string, []>("op_1008_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_1008_end_0 = const()[name = tensor<string, []>("op_1008_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_1008_end_mask_0 = const()[name = tensor<string, []>("op_1008_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1008_cast_fp16 = slice_by_index(begin = var_1008_begin_0, end = var_1008_end_0, end_mask = var_1008_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1008_cast_fp16")];
+            tensor<int32, [4]> var_1012_begin_0 = const()[name = tensor<string, []>("op_1012_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_1012_end_0 = const()[name = tensor<string, []>("op_1012_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_1012_end_mask_0 = const()[name = tensor<string, []>("op_1012_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1012_cast_fp16 = slice_by_index(begin = var_1012_begin_0, end = var_1012_end_0, end_mask = var_1012_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1012_cast_fp16")];
+            tensor<int32, [4]> var_1018_begin_0 = const()[name = tensor<string, []>("op_1018_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1018_end_0 = const()[name = tensor<string, []>("op_1018_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_1018_end_mask_0 = const()[name = tensor<string, []>("op_1018_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1018_cast_fp16 = slice_by_index(begin = var_1018_begin_0, end = var_1018_end_0, end_mask = var_1018_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1018_cast_fp16")];
+            tensor<int32, [4]> var_1030_begin_0 = const()[name = tensor<string, []>("op_1030_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_1030_end_0 = const()[name = tensor<string, []>("op_1030_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_1030_end_mask_0 = const()[name = tensor<string, []>("op_1030_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1030_cast_fp16 = slice_by_index(begin = var_1030_begin_0, end = var_1030_end_0, end_mask = var_1030_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1030_cast_fp16")];
+            tensor<int32, [4]> var_1042_begin_0 = const()[name = tensor<string, []>("op_1042_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_1042_end_0 = const()[name = tensor<string, []>("op_1042_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_1042_end_mask_0 = const()[name = tensor<string, []>("op_1042_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1042_cast_fp16 = slice_by_index(begin = var_1042_begin_0, end = var_1042_end_0, end_mask = var_1042_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1042_cast_fp16")];
+            tensor<int32, [4]> var_1054_begin_0 = const()[name = tensor<string, []>("op_1054_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_1054_end_0 = const()[name = tensor<string, []>("op_1054_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1054_end_mask_0 = const()[name = tensor<string, []>("op_1054_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1054_cast_fp16 = slice_by_index(begin = var_1054_begin_0, end = var_1054_end_0, end_mask = var_1054_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1054_cast_fp16")];
+            tensor<int32, [4]> var_1066_begin_0 = const()[name = tensor<string, []>("op_1066_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_1066_end_0 = const()[name = tensor<string, []>("op_1066_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_1066_end_mask_0 = const()[name = tensor<string, []>("op_1066_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1066_cast_fp16 = slice_by_index(begin = var_1066_begin_0, end = var_1066_end_0, end_mask = var_1066_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1066_cast_fp16")];
+            tensor<int32, [4]> var_1078_begin_0 = const()[name = tensor<string, []>("op_1078_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_1078_end_0 = const()[name = tensor<string, []>("op_1078_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_1078_end_mask_0 = const()[name = tensor<string, []>("op_1078_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1078_cast_fp16 = slice_by_index(begin = var_1078_begin_0, end = var_1078_end_0, end_mask = var_1078_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1078_cast_fp16")];
+            tensor<int32, [4]> var_1090_begin_0 = const()[name = tensor<string, []>("op_1090_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_1090_end_0 = const()[name = tensor<string, []>("op_1090_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_1090_end_mask_0 = const()[name = tensor<string, []>("op_1090_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1090_cast_fp16 = slice_by_index(begin = var_1090_begin_0, end = var_1090_end_0, end_mask = var_1090_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1090_cast_fp16")];
+            tensor<int32, [4]> var_1102_begin_0 = const()[name = tensor<string, []>("op_1102_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_1102_end_0 = const()[name = tensor<string, []>("op_1102_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_1102_end_mask_0 = const()[name = tensor<string, []>("op_1102_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1102_cast_fp16 = slice_by_index(begin = var_1102_begin_0, end = var_1102_end_0, end_mask = var_1102_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1102_cast_fp16")];
+            tensor<int32, [4]> var_1112_begin_0 = const()[name = tensor<string, []>("op_1112_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1112_end_0 = const()[name = tensor<string, []>("op_1112_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_1112_end_mask_0 = const()[name = tensor<string, []>("op_1112_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1112_cast_fp16 = slice_by_index(begin = var_1112_begin_0, end = var_1112_end_0, end_mask = var_1112_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1112_cast_fp16")];
+            tensor<int32, [4]> var_1124_begin_0 = const()[name = tensor<string, []>("op_1124_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_1124_end_0 = const()[name = tensor<string, []>("op_1124_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_1124_end_mask_0 = const()[name = tensor<string, []>("op_1124_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1124_cast_fp16 = slice_by_index(begin = var_1124_begin_0, end = var_1124_end_0, end_mask = var_1124_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1124_cast_fp16")];
+            tensor<int32, [4]> var_1136_begin_0 = const()[name = tensor<string, []>("op_1136_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_1136_end_0 = const()[name = tensor<string, []>("op_1136_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_1136_end_mask_0 = const()[name = tensor<string, []>("op_1136_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1136_cast_fp16 = slice_by_index(begin = var_1136_begin_0, end = var_1136_end_0, end_mask = var_1136_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1136_cast_fp16")];
+            tensor<int32, [4]> var_1148_begin_0 = const()[name = tensor<string, []>("op_1148_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_1148_end_0 = const()[name = tensor<string, []>("op_1148_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1148_end_mask_0 = const()[name = tensor<string, []>("op_1148_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1148_cast_fp16 = slice_by_index(begin = var_1148_begin_0, end = var_1148_end_0, end_mask = var_1148_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1148_cast_fp16")];
+            tensor<int32, [4]> var_1160_begin_0 = const()[name = tensor<string, []>("op_1160_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_1160_end_0 = const()[name = tensor<string, []>("op_1160_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_1160_end_mask_0 = const()[name = tensor<string, []>("op_1160_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1160_cast_fp16 = slice_by_index(begin = var_1160_begin_0, end = var_1160_end_0, end_mask = var_1160_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1160_cast_fp16")];
+            tensor<int32, [4]> var_1172_begin_0 = const()[name = tensor<string, []>("op_1172_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_1172_end_0 = const()[name = tensor<string, []>("op_1172_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_1172_end_mask_0 = const()[name = tensor<string, []>("op_1172_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1172_cast_fp16 = slice_by_index(begin = var_1172_begin_0, end = var_1172_end_0, end_mask = var_1172_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1172_cast_fp16")];
+            tensor<int32, [4]> var_1184_begin_0 = const()[name = tensor<string, []>("op_1184_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_1184_end_0 = const()[name = tensor<string, []>("op_1184_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_1184_end_mask_0 = const()[name = tensor<string, []>("op_1184_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1184_cast_fp16 = slice_by_index(begin = var_1184_begin_0, end = var_1184_end_0, end_mask = var_1184_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1184_cast_fp16")];
+            tensor<int32, [4]> var_1196_begin_0 = const()[name = tensor<string, []>("op_1196_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_1196_end_0 = const()[name = tensor<string, []>("op_1196_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_1196_end_mask_0 = const()[name = tensor<string, []>("op_1196_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1196_cast_fp16 = slice_by_index(begin = var_1196_begin_0, end = var_1196_end_0, end_mask = var_1196_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1196_cast_fp16")];
+            tensor<string, []> var_1208_equation_0 = const()[name = tensor<string, []>("op_1208_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1208_cast_fp16 = einsum(equation = var_1208_equation_0, values = (var_1018_cast_fp16, var_920_cast_fp16))[name = tensor<string, []>("op_1208_cast_fp16")];
+            tensor<fp16, []> var_1209_to_fp16 = const()[name = tensor<string, []>("op_1209_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1210_cast_fp16 = mul(x = var_1208_cast_fp16, y = var_1209_to_fp16)[name = tensor<string, []>("op_1210_cast_fp16")];
+            tensor<string, []> var_1212_equation_0 = const()[name = tensor<string, []>("op_1212_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1212_cast_fp16 = einsum(equation = var_1212_equation_0, values = (var_1018_cast_fp16, var_924_cast_fp16))[name = tensor<string, []>("op_1212_cast_fp16")];
+            tensor<fp16, []> var_1213_to_fp16 = const()[name = tensor<string, []>("op_1213_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1214_cast_fp16 = mul(x = var_1212_cast_fp16, y = var_1213_to_fp16)[name = tensor<string, []>("op_1214_cast_fp16")];
+            tensor<string, []> var_1216_equation_0 = const()[name = tensor<string, []>("op_1216_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1216_cast_fp16 = einsum(equation = var_1216_equation_0, values = (var_1018_cast_fp16, var_928_cast_fp16))[name = tensor<string, []>("op_1216_cast_fp16")];
+            tensor<fp16, []> var_1217_to_fp16 = const()[name = tensor<string, []>("op_1217_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1218_cast_fp16 = mul(x = var_1216_cast_fp16, y = var_1217_to_fp16)[name = tensor<string, []>("op_1218_cast_fp16")];
+            tensor<string, []> var_1220_equation_0 = const()[name = tensor<string, []>("op_1220_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1220_cast_fp16 = einsum(equation = var_1220_equation_0, values = (var_1030_cast_fp16, var_932_cast_fp16))[name = tensor<string, []>("op_1220_cast_fp16")];
+            tensor<fp16, []> var_1221_to_fp16 = const()[name = tensor<string, []>("op_1221_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1222_cast_fp16 = mul(x = var_1220_cast_fp16, y = var_1221_to_fp16)[name = tensor<string, []>("op_1222_cast_fp16")];
+            tensor<string, []> var_1224_equation_0 = const()[name = tensor<string, []>("op_1224_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1224_cast_fp16 = einsum(equation = var_1224_equation_0, values = (var_1030_cast_fp16, var_936_cast_fp16))[name = tensor<string, []>("op_1224_cast_fp16")];
+            tensor<fp16, []> var_1225_to_fp16 = const()[name = tensor<string, []>("op_1225_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1226_cast_fp16 = mul(x = var_1224_cast_fp16, y = var_1225_to_fp16)[name = tensor<string, []>("op_1226_cast_fp16")];
+            tensor<string, []> var_1228_equation_0 = const()[name = tensor<string, []>("op_1228_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1228_cast_fp16 = einsum(equation = var_1228_equation_0, values = (var_1030_cast_fp16, var_940_cast_fp16))[name = tensor<string, []>("op_1228_cast_fp16")];
+            tensor<fp16, []> var_1229_to_fp16 = const()[name = tensor<string, []>("op_1229_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1230_cast_fp16 = mul(x = var_1228_cast_fp16, y = var_1229_to_fp16)[name = tensor<string, []>("op_1230_cast_fp16")];
+            tensor<string, []> var_1232_equation_0 = const()[name = tensor<string, []>("op_1232_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1232_cast_fp16 = einsum(equation = var_1232_equation_0, values = (var_1042_cast_fp16, var_944_cast_fp16))[name = tensor<string, []>("op_1232_cast_fp16")];
+            tensor<fp16, []> var_1233_to_fp16 = const()[name = tensor<string, []>("op_1233_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1234_cast_fp16 = mul(x = var_1232_cast_fp16, y = var_1233_to_fp16)[name = tensor<string, []>("op_1234_cast_fp16")];
+            tensor<string, []> var_1236_equation_0 = const()[name = tensor<string, []>("op_1236_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1236_cast_fp16 = einsum(equation = var_1236_equation_0, values = (var_1042_cast_fp16, var_948_cast_fp16))[name = tensor<string, []>("op_1236_cast_fp16")];
+            tensor<fp16, []> var_1237_to_fp16 = const()[name = tensor<string, []>("op_1237_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1238_cast_fp16 = mul(x = var_1236_cast_fp16, y = var_1237_to_fp16)[name = tensor<string, []>("op_1238_cast_fp16")];
+            tensor<string, []> var_1240_equation_0 = const()[name = tensor<string, []>("op_1240_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1240_cast_fp16 = einsum(equation = var_1240_equation_0, values = (var_1042_cast_fp16, var_952_cast_fp16))[name = tensor<string, []>("op_1240_cast_fp16")];
+            tensor<fp16, []> var_1241_to_fp16 = const()[name = tensor<string, []>("op_1241_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1242_cast_fp16 = mul(x = var_1240_cast_fp16, y = var_1241_to_fp16)[name = tensor<string, []>("op_1242_cast_fp16")];
+            tensor<string, []> var_1244_equation_0 = const()[name = tensor<string, []>("op_1244_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1244_cast_fp16 = einsum(equation = var_1244_equation_0, values = (var_1054_cast_fp16, var_956_cast_fp16))[name = tensor<string, []>("op_1244_cast_fp16")];
+            tensor<fp16, []> var_1245_to_fp16 = const()[name = tensor<string, []>("op_1245_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1246_cast_fp16 = mul(x = var_1244_cast_fp16, y = var_1245_to_fp16)[name = tensor<string, []>("op_1246_cast_fp16")];
+            tensor<string, []> var_1248_equation_0 = const()[name = tensor<string, []>("op_1248_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1248_cast_fp16 = einsum(equation = var_1248_equation_0, values = (var_1054_cast_fp16, var_960_cast_fp16))[name = tensor<string, []>("op_1248_cast_fp16")];
+            tensor<fp16, []> var_1249_to_fp16 = const()[name = tensor<string, []>("op_1249_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1250_cast_fp16 = mul(x = var_1248_cast_fp16, y = var_1249_to_fp16)[name = tensor<string, []>("op_1250_cast_fp16")];
+            tensor<string, []> var_1252_equation_0 = const()[name = tensor<string, []>("op_1252_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1252_cast_fp16 = einsum(equation = var_1252_equation_0, values = (var_1054_cast_fp16, var_964_cast_fp16))[name = tensor<string, []>("op_1252_cast_fp16")];
+            tensor<fp16, []> var_1253_to_fp16 = const()[name = tensor<string, []>("op_1253_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1254_cast_fp16 = mul(x = var_1252_cast_fp16, y = var_1253_to_fp16)[name = tensor<string, []>("op_1254_cast_fp16")];
+            tensor<string, []> var_1256_equation_0 = const()[name = tensor<string, []>("op_1256_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1256_cast_fp16 = einsum(equation = var_1256_equation_0, values = (var_1066_cast_fp16, var_968_cast_fp16))[name = tensor<string, []>("op_1256_cast_fp16")];
+            tensor<fp16, []> var_1257_to_fp16 = const()[name = tensor<string, []>("op_1257_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1258_cast_fp16 = mul(x = var_1256_cast_fp16, y = var_1257_to_fp16)[name = tensor<string, []>("op_1258_cast_fp16")];
+            tensor<string, []> var_1260_equation_0 = const()[name = tensor<string, []>("op_1260_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1260_cast_fp16 = einsum(equation = var_1260_equation_0, values = (var_1066_cast_fp16, var_972_cast_fp16))[name = tensor<string, []>("op_1260_cast_fp16")];
+            tensor<fp16, []> var_1261_to_fp16 = const()[name = tensor<string, []>("op_1261_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1262_cast_fp16 = mul(x = var_1260_cast_fp16, y = var_1261_to_fp16)[name = tensor<string, []>("op_1262_cast_fp16")];
+            tensor<string, []> var_1264_equation_0 = const()[name = tensor<string, []>("op_1264_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1264_cast_fp16 = einsum(equation = var_1264_equation_0, values = (var_1066_cast_fp16, var_976_cast_fp16))[name = tensor<string, []>("op_1264_cast_fp16")];
+            tensor<fp16, []> var_1265_to_fp16 = const()[name = tensor<string, []>("op_1265_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1266_cast_fp16 = mul(x = var_1264_cast_fp16, y = var_1265_to_fp16)[name = tensor<string, []>("op_1266_cast_fp16")];
+            tensor<string, []> var_1268_equation_0 = const()[name = tensor<string, []>("op_1268_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1268_cast_fp16 = einsum(equation = var_1268_equation_0, values = (var_1078_cast_fp16, var_980_cast_fp16))[name = tensor<string, []>("op_1268_cast_fp16")];
+            tensor<fp16, []> var_1269_to_fp16 = const()[name = tensor<string, []>("op_1269_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1270_cast_fp16 = mul(x = var_1268_cast_fp16, y = var_1269_to_fp16)[name = tensor<string, []>("op_1270_cast_fp16")];
+            tensor<string, []> var_1272_equation_0 = const()[name = tensor<string, []>("op_1272_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1272_cast_fp16 = einsum(equation = var_1272_equation_0, values = (var_1078_cast_fp16, var_984_cast_fp16))[name = tensor<string, []>("op_1272_cast_fp16")];
+            tensor<fp16, []> var_1273_to_fp16 = const()[name = tensor<string, []>("op_1273_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1274_cast_fp16 = mul(x = var_1272_cast_fp16, y = var_1273_to_fp16)[name = tensor<string, []>("op_1274_cast_fp16")];
+            tensor<string, []> var_1276_equation_0 = const()[name = tensor<string, []>("op_1276_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1276_cast_fp16 = einsum(equation = var_1276_equation_0, values = (var_1078_cast_fp16, var_988_cast_fp16))[name = tensor<string, []>("op_1276_cast_fp16")];
+            tensor<fp16, []> var_1277_to_fp16 = const()[name = tensor<string, []>("op_1277_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1278_cast_fp16 = mul(x = var_1276_cast_fp16, y = var_1277_to_fp16)[name = tensor<string, []>("op_1278_cast_fp16")];
+            tensor<string, []> var_1280_equation_0 = const()[name = tensor<string, []>("op_1280_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1280_cast_fp16 = einsum(equation = var_1280_equation_0, values = (var_1090_cast_fp16, var_992_cast_fp16))[name = tensor<string, []>("op_1280_cast_fp16")];
+            tensor<fp16, []> var_1281_to_fp16 = const()[name = tensor<string, []>("op_1281_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1282_cast_fp16 = mul(x = var_1280_cast_fp16, y = var_1281_to_fp16)[name = tensor<string, []>("op_1282_cast_fp16")];
+            tensor<string, []> var_1284_equation_0 = const()[name = tensor<string, []>("op_1284_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1284_cast_fp16 = einsum(equation = var_1284_equation_0, values = (var_1090_cast_fp16, var_996_cast_fp16))[name = tensor<string, []>("op_1284_cast_fp16")];
+            tensor<fp16, []> var_1285_to_fp16 = const()[name = tensor<string, []>("op_1285_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1286_cast_fp16 = mul(x = var_1284_cast_fp16, y = var_1285_to_fp16)[name = tensor<string, []>("op_1286_cast_fp16")];
+            tensor<string, []> var_1288_equation_0 = const()[name = tensor<string, []>("op_1288_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1288_cast_fp16 = einsum(equation = var_1288_equation_0, values = (var_1090_cast_fp16, var_1000_cast_fp16))[name = tensor<string, []>("op_1288_cast_fp16")];
+            tensor<fp16, []> var_1289_to_fp16 = const()[name = tensor<string, []>("op_1289_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1290_cast_fp16 = mul(x = var_1288_cast_fp16, y = var_1289_to_fp16)[name = tensor<string, []>("op_1290_cast_fp16")];
+            tensor<string, []> var_1292_equation_0 = const()[name = tensor<string, []>("op_1292_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1292_cast_fp16 = einsum(equation = var_1292_equation_0, values = (var_1102_cast_fp16, var_1004_cast_fp16))[name = tensor<string, []>("op_1292_cast_fp16")];
+            tensor<fp16, []> var_1293_to_fp16 = const()[name = tensor<string, []>("op_1293_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1294_cast_fp16 = mul(x = var_1292_cast_fp16, y = var_1293_to_fp16)[name = tensor<string, []>("op_1294_cast_fp16")];
+            tensor<string, []> var_1296_equation_0 = const()[name = tensor<string, []>("op_1296_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1296_cast_fp16 = einsum(equation = var_1296_equation_0, values = (var_1102_cast_fp16, var_1008_cast_fp16))[name = tensor<string, []>("op_1296_cast_fp16")];
+            tensor<fp16, []> var_1297_to_fp16 = const()[name = tensor<string, []>("op_1297_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1298_cast_fp16 = mul(x = var_1296_cast_fp16, y = var_1297_to_fp16)[name = tensor<string, []>("op_1298_cast_fp16")];
+            tensor<string, []> var_1300_equation_0 = const()[name = tensor<string, []>("op_1300_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1300_cast_fp16 = einsum(equation = var_1300_equation_0, values = (var_1102_cast_fp16, var_1012_cast_fp16))[name = tensor<string, []>("op_1300_cast_fp16")];
+            tensor<fp16, []> var_1301_to_fp16 = const()[name = tensor<string, []>("op_1301_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1302_cast_fp16 = mul(x = var_1300_cast_fp16, y = var_1301_to_fp16)[name = tensor<string, []>("op_1302_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_49_cast_fp16 = add(x = var_1210_cast_fp16, y = mask)[name = tensor<string, []>("aw_49_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_51_cast_fp16 = add(x = var_1214_cast_fp16, y = mask)[name = tensor<string, []>("aw_51_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_53_cast_fp16 = add(x = var_1218_cast_fp16, y = mask)[name = tensor<string, []>("aw_53_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_55_cast_fp16 = add(x = var_1222_cast_fp16, y = mask)[name = tensor<string, []>("aw_55_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_57_cast_fp16 = add(x = var_1226_cast_fp16, y = mask)[name = tensor<string, []>("aw_57_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_59_cast_fp16 = add(x = var_1230_cast_fp16, y = mask)[name = tensor<string, []>("aw_59_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_61_cast_fp16 = add(x = var_1234_cast_fp16, y = mask)[name = tensor<string, []>("aw_61_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_63_cast_fp16 = add(x = var_1238_cast_fp16, y = mask)[name = tensor<string, []>("aw_63_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_65_cast_fp16 = add(x = var_1242_cast_fp16, y = mask)[name = tensor<string, []>("aw_65_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_67_cast_fp16 = add(x = var_1246_cast_fp16, y = mask)[name = tensor<string, []>("aw_67_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_69_cast_fp16 = add(x = var_1250_cast_fp16, y = mask)[name = tensor<string, []>("aw_69_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_71_cast_fp16 = add(x = var_1254_cast_fp16, y = mask)[name = tensor<string, []>("aw_71_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_73_cast_fp16 = add(x = var_1258_cast_fp16, y = mask)[name = tensor<string, []>("aw_73_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_75_cast_fp16 = add(x = var_1262_cast_fp16, y = mask)[name = tensor<string, []>("aw_75_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_77_cast_fp16 = add(x = var_1266_cast_fp16, y = mask)[name = tensor<string, []>("aw_77_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_79_cast_fp16 = add(x = var_1270_cast_fp16, y = mask)[name = tensor<string, []>("aw_79_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_81_cast_fp16 = add(x = var_1274_cast_fp16, y = mask)[name = tensor<string, []>("aw_81_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_83_cast_fp16 = add(x = var_1278_cast_fp16, y = mask)[name = tensor<string, []>("aw_83_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_85_cast_fp16 = add(x = var_1282_cast_fp16, y = mask)[name = tensor<string, []>("aw_85_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_87_cast_fp16 = add(x = var_1286_cast_fp16, y = mask)[name = tensor<string, []>("aw_87_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_89_cast_fp16 = add(x = var_1290_cast_fp16, y = mask)[name = tensor<string, []>("aw_89_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_91_cast_fp16 = add(x = var_1294_cast_fp16, y = mask)[name = tensor<string, []>("aw_91_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_93_cast_fp16 = add(x = var_1298_cast_fp16, y = mask)[name = tensor<string, []>("aw_93_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_cast_fp16 = add(x = var_1302_cast_fp16, y = mask)[name = tensor<string, []>("aw_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1327_cast_fp16 = softmax(axis = var_779, x = aw_49_cast_fp16)[name = tensor<string, []>("op_1327_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1328_cast_fp16 = softmax(axis = var_779, x = aw_51_cast_fp16)[name = tensor<string, []>("op_1328_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1329_cast_fp16 = softmax(axis = var_779, x = aw_53_cast_fp16)[name = tensor<string, []>("op_1329_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1330_cast_fp16 = softmax(axis = var_779, x = aw_55_cast_fp16)[name = tensor<string, []>("op_1330_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1331_cast_fp16 = softmax(axis = var_779, x = aw_57_cast_fp16)[name = tensor<string, []>("op_1331_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1332_cast_fp16 = softmax(axis = var_779, x = aw_59_cast_fp16)[name = tensor<string, []>("op_1332_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1333_cast_fp16 = softmax(axis = var_779, x = aw_61_cast_fp16)[name = tensor<string, []>("op_1333_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1334_cast_fp16 = softmax(axis = var_779, x = aw_63_cast_fp16)[name = tensor<string, []>("op_1334_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1335_cast_fp16 = softmax(axis = var_779, x = aw_65_cast_fp16)[name = tensor<string, []>("op_1335_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1336_cast_fp16 = softmax(axis = var_779, x = aw_67_cast_fp16)[name = tensor<string, []>("op_1336_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1337_cast_fp16 = softmax(axis = var_779, x = aw_69_cast_fp16)[name = tensor<string, []>("op_1337_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1338_cast_fp16 = softmax(axis = var_779, x = aw_71_cast_fp16)[name = tensor<string, []>("op_1338_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1339_cast_fp16 = softmax(axis = var_779, x = aw_73_cast_fp16)[name = tensor<string, []>("op_1339_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1340_cast_fp16 = softmax(axis = var_779, x = aw_75_cast_fp16)[name = tensor<string, []>("op_1340_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1341_cast_fp16 = softmax(axis = var_779, x = aw_77_cast_fp16)[name = tensor<string, []>("op_1341_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1342_cast_fp16 = softmax(axis = var_779, x = aw_79_cast_fp16)[name = tensor<string, []>("op_1342_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1343_cast_fp16 = softmax(axis = var_779, x = aw_81_cast_fp16)[name = tensor<string, []>("op_1343_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1344_cast_fp16 = softmax(axis = var_779, x = aw_83_cast_fp16)[name = tensor<string, []>("op_1344_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1345_cast_fp16 = softmax(axis = var_779, x = aw_85_cast_fp16)[name = tensor<string, []>("op_1345_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1346_cast_fp16 = softmax(axis = var_779, x = aw_87_cast_fp16)[name = tensor<string, []>("op_1346_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1347_cast_fp16 = softmax(axis = var_779, x = aw_89_cast_fp16)[name = tensor<string, []>("op_1347_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1348_cast_fp16 = softmax(axis = var_779, x = aw_91_cast_fp16)[name = tensor<string, []>("op_1348_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1349_cast_fp16 = softmax(axis = var_779, x = aw_93_cast_fp16)[name = tensor<string, []>("op_1349_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1350_cast_fp16 = softmax(axis = var_779, x = aw_cast_fp16)[name = tensor<string, []>("op_1350_cast_fp16")];
+            tensor<string, []> var_1352_equation_0 = const()[name = tensor<string, []>("op_1352_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1352_cast_fp16 = einsum(equation = var_1352_equation_0, values = (var_1112_cast_fp16, var_1327_cast_fp16))[name = tensor<string, []>("op_1352_cast_fp16")];
+            tensor<string, []> var_1354_equation_0 = const()[name = tensor<string, []>("op_1354_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1354_cast_fp16 = einsum(equation = var_1354_equation_0, values = (var_1112_cast_fp16, var_1328_cast_fp16))[name = tensor<string, []>("op_1354_cast_fp16")];
+            tensor<string, []> var_1356_equation_0 = const()[name = tensor<string, []>("op_1356_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1356_cast_fp16 = einsum(equation = var_1356_equation_0, values = (var_1112_cast_fp16, var_1329_cast_fp16))[name = tensor<string, []>("op_1356_cast_fp16")];
+            tensor<string, []> var_1358_equation_0 = const()[name = tensor<string, []>("op_1358_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1358_cast_fp16 = einsum(equation = var_1358_equation_0, values = (var_1124_cast_fp16, var_1330_cast_fp16))[name = tensor<string, []>("op_1358_cast_fp16")];
+            tensor<string, []> var_1360_equation_0 = const()[name = tensor<string, []>("op_1360_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1360_cast_fp16 = einsum(equation = var_1360_equation_0, values = (var_1124_cast_fp16, var_1331_cast_fp16))[name = tensor<string, []>("op_1360_cast_fp16")];
+            tensor<string, []> var_1362_equation_0 = const()[name = tensor<string, []>("op_1362_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1362_cast_fp16 = einsum(equation = var_1362_equation_0, values = (var_1124_cast_fp16, var_1332_cast_fp16))[name = tensor<string, []>("op_1362_cast_fp16")];
+            tensor<string, []> var_1364_equation_0 = const()[name = tensor<string, []>("op_1364_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1364_cast_fp16 = einsum(equation = var_1364_equation_0, values = (var_1136_cast_fp16, var_1333_cast_fp16))[name = tensor<string, []>("op_1364_cast_fp16")];
+            tensor<string, []> var_1366_equation_0 = const()[name = tensor<string, []>("op_1366_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1366_cast_fp16 = einsum(equation = var_1366_equation_0, values = (var_1136_cast_fp16, var_1334_cast_fp16))[name = tensor<string, []>("op_1366_cast_fp16")];
+            tensor<string, []> var_1368_equation_0 = const()[name = tensor<string, []>("op_1368_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1368_cast_fp16 = einsum(equation = var_1368_equation_0, values = (var_1136_cast_fp16, var_1335_cast_fp16))[name = tensor<string, []>("op_1368_cast_fp16")];
+            tensor<string, []> var_1370_equation_0 = const()[name = tensor<string, []>("op_1370_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1370_cast_fp16 = einsum(equation = var_1370_equation_0, values = (var_1148_cast_fp16, var_1336_cast_fp16))[name = tensor<string, []>("op_1370_cast_fp16")];
+            tensor<string, []> var_1372_equation_0 = const()[name = tensor<string, []>("op_1372_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1372_cast_fp16 = einsum(equation = var_1372_equation_0, values = (var_1148_cast_fp16, var_1337_cast_fp16))[name = tensor<string, []>("op_1372_cast_fp16")];
+            tensor<string, []> var_1374_equation_0 = const()[name = tensor<string, []>("op_1374_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1374_cast_fp16 = einsum(equation = var_1374_equation_0, values = (var_1148_cast_fp16, var_1338_cast_fp16))[name = tensor<string, []>("op_1374_cast_fp16")];
+            tensor<string, []> var_1376_equation_0 = const()[name = tensor<string, []>("op_1376_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1376_cast_fp16 = einsum(equation = var_1376_equation_0, values = (var_1160_cast_fp16, var_1339_cast_fp16))[name = tensor<string, []>("op_1376_cast_fp16")];
+            tensor<string, []> var_1378_equation_0 = const()[name = tensor<string, []>("op_1378_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1378_cast_fp16 = einsum(equation = var_1378_equation_0, values = (var_1160_cast_fp16, var_1340_cast_fp16))[name = tensor<string, []>("op_1378_cast_fp16")];
+            tensor<string, []> var_1380_equation_0 = const()[name = tensor<string, []>("op_1380_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1380_cast_fp16 = einsum(equation = var_1380_equation_0, values = (var_1160_cast_fp16, var_1341_cast_fp16))[name = tensor<string, []>("op_1380_cast_fp16")];
+            tensor<string, []> var_1382_equation_0 = const()[name = tensor<string, []>("op_1382_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1382_cast_fp16 = einsum(equation = var_1382_equation_0, values = (var_1172_cast_fp16, var_1342_cast_fp16))[name = tensor<string, []>("op_1382_cast_fp16")];
+            tensor<string, []> var_1384_equation_0 = const()[name = tensor<string, []>("op_1384_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1384_cast_fp16 = einsum(equation = var_1384_equation_0, values = (var_1172_cast_fp16, var_1343_cast_fp16))[name = tensor<string, []>("op_1384_cast_fp16")];
+            tensor<string, []> var_1386_equation_0 = const()[name = tensor<string, []>("op_1386_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1386_cast_fp16 = einsum(equation = var_1386_equation_0, values = (var_1172_cast_fp16, var_1344_cast_fp16))[name = tensor<string, []>("op_1386_cast_fp16")];
+            tensor<string, []> var_1388_equation_0 = const()[name = tensor<string, []>("op_1388_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1388_cast_fp16 = einsum(equation = var_1388_equation_0, values = (var_1184_cast_fp16, var_1345_cast_fp16))[name = tensor<string, []>("op_1388_cast_fp16")];
+            tensor<string, []> var_1390_equation_0 = const()[name = tensor<string, []>("op_1390_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1390_cast_fp16 = einsum(equation = var_1390_equation_0, values = (var_1184_cast_fp16, var_1346_cast_fp16))[name = tensor<string, []>("op_1390_cast_fp16")];
+            tensor<string, []> var_1392_equation_0 = const()[name = tensor<string, []>("op_1392_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1392_cast_fp16 = einsum(equation = var_1392_equation_0, values = (var_1184_cast_fp16, var_1347_cast_fp16))[name = tensor<string, []>("op_1392_cast_fp16")];
+            tensor<string, []> var_1394_equation_0 = const()[name = tensor<string, []>("op_1394_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1394_cast_fp16 = einsum(equation = var_1394_equation_0, values = (var_1196_cast_fp16, var_1348_cast_fp16))[name = tensor<string, []>("op_1394_cast_fp16")];
+            tensor<string, []> var_1396_equation_0 = const()[name = tensor<string, []>("op_1396_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1396_cast_fp16 = einsum(equation = var_1396_equation_0, values = (var_1196_cast_fp16, var_1349_cast_fp16))[name = tensor<string, []>("op_1396_cast_fp16")];
+            tensor<string, []> var_1398_equation_0 = const()[name = tensor<string, []>("op_1398_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1398_cast_fp16 = einsum(equation = var_1398_equation_0, values = (var_1196_cast_fp16, var_1350_cast_fp16))[name = tensor<string, []>("op_1398_cast_fp16")];
+            tensor<bool, []> x_27_interleave_0 = const()[name = tensor<string, []>("x_27_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_27_cast_fp16 = concat(axis = var_779, interleave = x_27_interleave_0, values = (var_1352_cast_fp16, var_1354_cast_fp16, var_1356_cast_fp16, var_1358_cast_fp16, var_1360_cast_fp16, var_1362_cast_fp16, var_1364_cast_fp16, var_1366_cast_fp16, var_1368_cast_fp16, var_1370_cast_fp16, var_1372_cast_fp16, var_1374_cast_fp16, var_1376_cast_fp16, var_1378_cast_fp16, var_1380_cast_fp16, var_1382_cast_fp16, var_1384_cast_fp16, var_1386_cast_fp16, var_1388_cast_fp16, var_1390_cast_fp16, var_1392_cast_fp16, var_1394_cast_fp16, var_1396_cast_fp16, var_1398_cast_fp16))[name = tensor<string, []>("x_27_cast_fp16")];
+            tensor<int32, [4]> var_1403 = const()[name = tensor<string, []>("op_1403"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_13_cast_fp16 = reshape(shape = var_1403, x = x_27_cast_fp16)[name = tensor<string, []>("input_13_cast_fp16")];
+            tensor<int32, [2]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1408 = const()[name = tensor<string, []>("op_1408"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_pad_type_0 = const()[name = tensor<string, []>("attention_output_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_pad_0 = const()[name = tensor<string, []>("attention_output_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(232803776)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_cast_fp16 = conv(dilations = var_1408, groups = var_779, pad = attention_output_pad_0, pad_type = attention_output_pad_type_0, strides = var_1406, weight = blocks_1_attn_proj_weight_to_fp16, x = input_13_cast_fp16)[name = tensor<string, []>("attention_output_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_29_cast_fp16 = add(x = attention_output_cast_fp16, y = x_17_cast_fp16)[name = tensor<string, []>("x_29_cast_fp16")];
+            tensor<bool, []> x_eps_interleave_0 = const()[name = tensor<string, []>("x_eps_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_to_fp16 = const()[name = tensor<string, []>("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678208)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_cast_fp16 = concat(axis = var_779, interleave = x_eps_interleave_0, values = (x_29_cast_fp16, eps_chan_to_fp16))[name = tensor<string, []>("x_eps_cast_fp16")];
+            tensor<int32, [1]> norm_x_axes_0 = const()[name = tensor<string, []>("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_782, x = x_eps_cast_fp16)[name = tensor<string, []>("norm_x_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_19_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_cast_fp16)[name = tensor<string, []>("x_normed_19_cast_fp16")];
+            tensor<fp16, []> var_1434_to_fp16 = const()[name = tensor<string, []>("op_1434_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_1434_to_fp16)[name = tensor<string, []>("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678400)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_15_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = tensor<string, []>("input_15_cast_fp16")];
+            tensor<int32, [2]> var_1445 = const()[name = tensor<string, []>("op_1445"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1447 = const()[name = tensor<string, []>("op_1447"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_17_pad_type_0 = const()[name = tensor<string, []>("input_17_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_17_pad_0 = const()[name = tensor<string, []>("input_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251684608)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_17_cast_fp16 = conv(dilations = var_1447, groups = var_779, pad = input_17_pad_0, pad_type = input_17_pad_type_0, strides = var_1445, weight = blocks_1_mlp_fc_1_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("input_17_cast_fp16")];
+            tensor<int32, [2]> var_1451 = const()[name = tensor<string, []>("op_1451"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1453 = const()[name = tensor<string, []>("op_1453"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_pad_0 = const()[name = tensor<string, []>("x_fc_2_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(302016320)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_cast_fp16 = conv(dilations = var_1453, groups = var_779, pad = x_fc_2_pad_0, pad_type = x_fc_2_pad_type_0, strides = var_1451, weight = blocks_1_mlp_fc_2_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_1456_cast_fp16 = silu(x = input_17_cast_fp16)[name = tensor<string, []>("op_1456_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_cast_fp16 = mul(x = var_1456_cast_fp16, y = x_fc_2_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
+            tensor<int32, [2]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1463_pad_type_0 = const()[name = tensor<string, []>("op_1463_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1463_pad_0 = const()[name = tensor<string, []>("op_1463_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_1_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(352348032)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_1463_cast_fp16 = conv(dilations = var_1461, groups = var_779, pad = var_1463_pad_0, pad_type = var_1463_pad_type_0, strides = var_1459, weight = blocks_1_mlp_proj_weight_to_fp16, x = input_cast_fp16)[name = tensor<string, []>("op_1463_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> new_x = add(x = var_1463_cast_fp16, y = x_29_cast_fp16)[name = tensor<string, []>("op_1464_cast_fp16")];
+        } -> (new_x, new_k_cache_0, new_v_cache_0, new_k_cache_1, new_v_cache_1);
+}
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk3.mlmodelc/weights/weight.bin b/Llama-3.2-3B-Instruct_chunk3.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ddc414b805bbeaeee2f33bbeffef46d892a80c51
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk3.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:934f835704cd4576365155fea7f05c7308ec2dc8b0c69d6d800fdc6e646ea0ce
+size 402679744
diff --git a/Llama-3.2-3B-Instruct_chunk4.mlmodelc/analytics/coremldata.bin b/Llama-3.2-3B-Instruct_chunk4.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a63af39cde8e590e41fffd270ab8aede737490d
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk4.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf21e446e7587de3fd840eae95f3e79729298df568725552f7ef5fd8f954e58c
+size 243
diff --git a/Llama-3.2-3B-Instruct_chunk4.mlmodelc/coremldata.bin b/Llama-3.2-3B-Instruct_chunk4.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3fed05170d981b8582c9421ec7550f748512caf2
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk4.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55b45f96f9ba201e16f197a78412041f41d2ac869df9ad95ef03af7662e7d940
+size 653
diff --git a/Llama-3.2-3B-Instruct_chunk4.mlmodelc/metadata.json b/Llama-3.2-3B-Instruct_chunk4.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..078216a203d40b09afa365c7afa972ffa9fe3493
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk4.mlmodelc/metadata.json
@@ -0,0 +1,178 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "new_x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 7,
+    "mlProgramOperationTypeHistogram" : {
+      "Concat" : 14,
+      "Ios16.mul" : 70,
+      "SliceByIndex" : 88,
+      "Transpose" : 2,
+      "Ios16.einsum" : 96,
+      "Ios16.conv" : 14,
+      "Ios16.add" : 56,
+      "Ios16.realDiv" : 4,
+      "Ios16.softmax" : 48,
+      "Ios16.reduceL2Norm" : 4,
+      "Ios16.reshape" : 14,
+      "Ios16.silu" : 2
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "13.0",
+      "tvOS" : "16.0",
+      "visionOS" : "1.0",
+      "watchOS" : "9.0",
+      "iOS" : "16.0",
+      "macCatalyst" : "16.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.source" : "torch==2.1.0",
+      "com.github.apple.coremltools.version" : "8.0b1"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "cos",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "sin",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 512 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 512, 1, 64]",
+        "name" : "mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "Llama_3_2_3B_Instruct_2024_11_09_16_14_37_chunk4",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk4.mlmodelc/model.mil b/Llama-3.2-3B-Instruct_chunk4.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..78594b4291dc45ae43652f9a31200581b19ad3c6
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk4.mlmodelc/model.mil
@@ -0,0 +1,956 @@
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3304.5.2"}, {"coremlc-version", "3304.6.2"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
+{
+    func main<ios16>(tensor<fp16, [128, 64]> cos, tensor<fp16, [1, 448, 1, 1024]> k_cache_0, tensor<fp16, [1, 448, 1, 1024]> k_cache_1, tensor<fp16, [1, 512, 1, 64]> mask, tensor<fp16, [128, 64]> sin, tensor<fp16, [1, 1024, 1, 448]> v_cache_0, tensor<fp16, [1, 1024, 1, 448]> v_cache_1, tensor<fp16, [1, 3072, 8, 8]> x) [CoreML_InputDefaultValues = dict<tensor<string, []>, tensor<fp32, []>>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}})] {
+            tensor<int32, []> var_13 = const()[name = tensor<string, []>("op_13"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_19 = const()[name = tensor<string, []>("op_19"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_55 = const()[name = tensor<string, []>("op_55"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_1_interleave_0 = const()[name = tensor<string, []>("x_eps_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_1_to_fp16 = const()[name = tensor<string, []>("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_1_cast_fp16 = concat(axis = var_52, interleave = x_eps_1_interleave_0, values = (x, eps_chan_1_to_fp16))[name = tensor<string, []>("x_eps_1_cast_fp16")];
+            tensor<int32, [1]> norm_x_1_axes_0 = const()[name = tensor<string, []>("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_55, x = x_eps_1_cast_fp16)[name = tensor<string, []>("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = tensor<string, []>("x_normed_1_cast_fp16")];
+            tensor<fp16, []> var_79_to_fp16 = const()[name = tensor<string, []>("op_79_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_79_to_fp16)[name = tensor<string, []>("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(256)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = tensor<string, []>("x_5_cast_fp16")];
+            tensor<int32, [4]> var_100 = const()[name = tensor<string, []>("op_100"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_1_cast_fp16 = reshape(shape = var_100, x = x_5_cast_fp16)[name = tensor<string, []>("input_1_cast_fp16")];
+            tensor<int32, [2]> var_103 = const()[name = tensor<string, []>("op_103"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_105 = const()[name = tensor<string, []>("op_105"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_1_pad_type_0 = const()[name = tensor<string, []>("q_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_1_pad_0 = const()[name = tensor<string, []>("q_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6464)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_1_cast_fp16 = conv(dilations = var_105, groups = var_52, pad = q_1_pad_0, pad_type = q_1_pad_type_0, strides = var_103, weight = blocks_0_attn_q_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("q_1_cast_fp16")];
+            tensor<int32, [2]> var_109 = const()[name = tensor<string, []>("op_109"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_111 = const()[name = tensor<string, []>("op_111"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_1_pad_type_0 = const()[name = tensor<string, []>("k_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_1_pad_0 = const()[name = tensor<string, []>("k_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18880896)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_1_cast_fp16 = conv(dilations = var_111, groups = var_52, pad = k_1_pad_0, pad_type = k_1_pad_type_0, strides = var_109, weight = blocks_0_attn_k_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("k_1_cast_fp16")];
+            tensor<int32, [2]> var_115 = const()[name = tensor<string, []>("op_115"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_117 = const()[name = tensor<string, []>("op_117"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_1_pad_type_0 = const()[name = tensor<string, []>("v_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_1_pad_0 = const()[name = tensor<string, []>("v_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25172416)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_1_cast_fp16 = conv(dilations = var_117, groups = var_52, pad = v_1_pad_0, pad_type = v_1_pad_type_0, strides = var_115, weight = blocks_0_attn_v_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("v_1_cast_fp16")];
+            tensor<int32, [4]> var_120 = const()[name = tensor<string, []>("op_120"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_3_cast_fp16 = reshape(shape = var_120, x = q_1_cast_fp16)[name = tensor<string, []>("q_3_cast_fp16")];
+            tensor<int32, [4]> var_122 = const()[name = tensor<string, []>("op_122"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_3_cast_fp16 = reshape(shape = var_122, x = k_1_cast_fp16)[name = tensor<string, []>("k_3_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = tensor<string, []>("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = tensor<string, []>("op_136_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = tensor<string, []>("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = tensor<string, []>("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = tensor<string, []>("op_142_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = tensor<string, []>("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_142_cast_fp16")];
+            tensor<fp16, []> const_10_promoted_to_fp16 = const()[name = tensor<string, []>("const_10_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_10_promoted_to_fp16)[name = tensor<string, []>("op_144_cast_fp16")];
+            tensor<bool, []> rotated_1_interleave_0 = const()[name = tensor<string, []>("rotated_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_1_cast_fp16 = concat(axis = var_17, interleave = rotated_1_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = tensor<string, []>("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_147_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = tensor<string, []>("op_147_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_148_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = tensor<string, []>("op_148_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_1_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = tensor<string, []>("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_161_begin_0 = const()[name = tensor<string, []>("op_161_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_161_end_0 = const()[name = tensor<string, []>("op_161_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_161_end_mask_0 = const()[name = tensor<string, []>("op_161_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_161_cast_fp16 = slice_by_index(begin = var_161_begin_0, end = var_161_end_0, end_mask = var_161_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_161_cast_fp16")];
+            tensor<int32, [4]> var_167_begin_0 = const()[name = tensor<string, []>("op_167_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_167_end_0 = const()[name = tensor<string, []>("op_167_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_167_end_mask_0 = const()[name = tensor<string, []>("op_167_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_167_cast_fp16 = slice_by_index(begin = var_167_begin_0, end = var_167_end_0, end_mask = var_167_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_167_cast_fp16")];
+            tensor<fp16, []> const_12_promoted_to_fp16 = const()[name = tensor<string, []>("const_12_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_169_cast_fp16 = mul(x = var_167_cast_fp16, y = const_12_promoted_to_fp16)[name = tensor<string, []>("op_169_cast_fp16")];
+            tensor<bool, []> rotated_3_interleave_0 = const()[name = tensor<string, []>("rotated_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_3_cast_fp16 = concat(axis = var_17, interleave = rotated_3_interleave_0, values = (var_169_cast_fp16, var_161_cast_fp16))[name = tensor<string, []>("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_172_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = tensor<string, []>("op_172_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_173_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = tensor<string, []>("op_173_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_3_cast_fp16 = add(x = var_172_cast_fp16, y = var_173_cast_fp16)[name = tensor<string, []>("roped_3_cast_fp16")];
+            tensor<int32, [4]> var_176 = const()[name = tensor<string, []>("op_176"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_7_cast_fp16 = reshape(shape = var_176, x = roped_3_cast_fp16)[name = tensor<string, []>("k_7_cast_fp16")];
+            tensor<int32, [4]> var_178 = const()[name = tensor<string, []>("op_178"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_0 = reshape(shape = var_178, x = v_1_cast_fp16)[name = tensor<string, []>("new_v_cache_0_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_11_interleave_0 = const()[name = tensor<string, []>("k_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_0 = transpose(perm = k_9_perm_0, x = k_7_cast_fp16)[name = tensor<string, []>("transpose_1")];
+            tensor<fp16, [1, 512, 1, 1024]> k_11_cast_fp16 = concat(axis = var_19, interleave = k_11_interleave_0, values = (k_cache_0, new_k_cache_0))[name = tensor<string, []>("k_11_cast_fp16")];
+            tensor<bool, []> v_7_interleave_0 = const()[name = tensor<string, []>("v_7_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_7_cast_fp16 = concat(axis = var_13, interleave = v_7_interleave_0, values = (v_cache_0, new_v_cache_0))[name = tensor<string, []>("v_7_cast_fp16")];
+            tensor<int32, [4]> var_186 = const()[name = tensor<string, []>("op_186"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_7_cast_fp16 = reshape(shape = var_186, x = roped_1_cast_fp16)[name = tensor<string, []>("q_7_cast_fp16")];
+            tensor<int32, [4]> var_191_begin_0 = const()[name = tensor<string, []>("op_191_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_191_end_0 = const()[name = tensor<string, []>("op_191_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_191_end_mask_0 = const()[name = tensor<string, []>("op_191_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_191_cast_fp16 = slice_by_index(begin = var_191_begin_0, end = var_191_end_0, end_mask = var_191_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_191_cast_fp16")];
+            tensor<int32, [4]> var_195_begin_0 = const()[name = tensor<string, []>("op_195_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_195_end_0 = const()[name = tensor<string, []>("op_195_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_195_end_mask_0 = const()[name = tensor<string, []>("op_195_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_195_cast_fp16 = slice_by_index(begin = var_195_begin_0, end = var_195_end_0, end_mask = var_195_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_195_cast_fp16")];
+            tensor<int32, [4]> var_199_begin_0 = const()[name = tensor<string, []>("op_199_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_199_end_0 = const()[name = tensor<string, []>("op_199_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_199_end_mask_0 = const()[name = tensor<string, []>("op_199_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_199_cast_fp16 = slice_by_index(begin = var_199_begin_0, end = var_199_end_0, end_mask = var_199_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_199_cast_fp16")];
+            tensor<int32, [4]> var_203_begin_0 = const()[name = tensor<string, []>("op_203_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_203_end_0 = const()[name = tensor<string, []>("op_203_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_203_end_mask_0 = const()[name = tensor<string, []>("op_203_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_203_cast_fp16 = slice_by_index(begin = var_203_begin_0, end = var_203_end_0, end_mask = var_203_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_203_cast_fp16")];
+            tensor<int32, [4]> var_207_begin_0 = const()[name = tensor<string, []>("op_207_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_207_end_0 = const()[name = tensor<string, []>("op_207_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_207_end_mask_0 = const()[name = tensor<string, []>("op_207_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_207_cast_fp16 = slice_by_index(begin = var_207_begin_0, end = var_207_end_0, end_mask = var_207_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_207_cast_fp16")];
+            tensor<int32, [4]> var_211_begin_0 = const()[name = tensor<string, []>("op_211_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_211_end_0 = const()[name = tensor<string, []>("op_211_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_211_end_mask_0 = const()[name = tensor<string, []>("op_211_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_211_cast_fp16 = slice_by_index(begin = var_211_begin_0, end = var_211_end_0, end_mask = var_211_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_211_cast_fp16")];
+            tensor<int32, [4]> var_215_begin_0 = const()[name = tensor<string, []>("op_215_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_215_end_0 = const()[name = tensor<string, []>("op_215_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_215_end_mask_0 = const()[name = tensor<string, []>("op_215_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_215_cast_fp16 = slice_by_index(begin = var_215_begin_0, end = var_215_end_0, end_mask = var_215_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_215_cast_fp16")];
+            tensor<int32, [4]> var_219_begin_0 = const()[name = tensor<string, []>("op_219_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_219_end_0 = const()[name = tensor<string, []>("op_219_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_219_end_mask_0 = const()[name = tensor<string, []>("op_219_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_219_cast_fp16 = slice_by_index(begin = var_219_begin_0, end = var_219_end_0, end_mask = var_219_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_219_cast_fp16")];
+            tensor<int32, [4]> var_223_begin_0 = const()[name = tensor<string, []>("op_223_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_223_end_0 = const()[name = tensor<string, []>("op_223_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_223_end_mask_0 = const()[name = tensor<string, []>("op_223_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_223_cast_fp16 = slice_by_index(begin = var_223_begin_0, end = var_223_end_0, end_mask = var_223_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_223_cast_fp16")];
+            tensor<int32, [4]> var_227_begin_0 = const()[name = tensor<string, []>("op_227_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_227_end_0 = const()[name = tensor<string, []>("op_227_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_227_end_mask_0 = const()[name = tensor<string, []>("op_227_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_227_cast_fp16 = slice_by_index(begin = var_227_begin_0, end = var_227_end_0, end_mask = var_227_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_227_cast_fp16")];
+            tensor<int32, [4]> var_231_begin_0 = const()[name = tensor<string, []>("op_231_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_231_end_0 = const()[name = tensor<string, []>("op_231_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_231_end_mask_0 = const()[name = tensor<string, []>("op_231_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_231_cast_fp16 = slice_by_index(begin = var_231_begin_0, end = var_231_end_0, end_mask = var_231_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_231_cast_fp16")];
+            tensor<int32, [4]> var_235_begin_0 = const()[name = tensor<string, []>("op_235_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_235_end_0 = const()[name = tensor<string, []>("op_235_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_235_end_mask_0 = const()[name = tensor<string, []>("op_235_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_235_cast_fp16 = slice_by_index(begin = var_235_begin_0, end = var_235_end_0, end_mask = var_235_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_235_cast_fp16")];
+            tensor<int32, [4]> var_239_begin_0 = const()[name = tensor<string, []>("op_239_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_239_end_0 = const()[name = tensor<string, []>("op_239_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_239_end_mask_0 = const()[name = tensor<string, []>("op_239_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_239_cast_fp16 = slice_by_index(begin = var_239_begin_0, end = var_239_end_0, end_mask = var_239_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_239_cast_fp16")];
+            tensor<int32, [4]> var_243_begin_0 = const()[name = tensor<string, []>("op_243_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_243_end_0 = const()[name = tensor<string, []>("op_243_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_243_end_mask_0 = const()[name = tensor<string, []>("op_243_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_243_cast_fp16 = slice_by_index(begin = var_243_begin_0, end = var_243_end_0, end_mask = var_243_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_243_cast_fp16")];
+            tensor<int32, [4]> var_247_begin_0 = const()[name = tensor<string, []>("op_247_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_247_end_0 = const()[name = tensor<string, []>("op_247_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_247_end_mask_0 = const()[name = tensor<string, []>("op_247_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_247_cast_fp16 = slice_by_index(begin = var_247_begin_0, end = var_247_end_0, end_mask = var_247_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_247_cast_fp16")];
+            tensor<int32, [4]> var_251_begin_0 = const()[name = tensor<string, []>("op_251_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_251_end_0 = const()[name = tensor<string, []>("op_251_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_251_end_mask_0 = const()[name = tensor<string, []>("op_251_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_251_cast_fp16 = slice_by_index(begin = var_251_begin_0, end = var_251_end_0, end_mask = var_251_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_251_cast_fp16")];
+            tensor<int32, [4]> var_255_begin_0 = const()[name = tensor<string, []>("op_255_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_255_end_0 = const()[name = tensor<string, []>("op_255_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_255_end_mask_0 = const()[name = tensor<string, []>("op_255_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_255_cast_fp16 = slice_by_index(begin = var_255_begin_0, end = var_255_end_0, end_mask = var_255_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_255_cast_fp16")];
+            tensor<int32, [4]> var_259_begin_0 = const()[name = tensor<string, []>("op_259_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_259_end_0 = const()[name = tensor<string, []>("op_259_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_259_end_mask_0 = const()[name = tensor<string, []>("op_259_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_259_cast_fp16 = slice_by_index(begin = var_259_begin_0, end = var_259_end_0, end_mask = var_259_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_259_cast_fp16")];
+            tensor<int32, [4]> var_263_begin_0 = const()[name = tensor<string, []>("op_263_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_263_end_0 = const()[name = tensor<string, []>("op_263_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_263_end_mask_0 = const()[name = tensor<string, []>("op_263_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_263_cast_fp16 = slice_by_index(begin = var_263_begin_0, end = var_263_end_0, end_mask = var_263_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_263_cast_fp16")];
+            tensor<int32, [4]> var_267_begin_0 = const()[name = tensor<string, []>("op_267_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_267_end_0 = const()[name = tensor<string, []>("op_267_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_267_end_mask_0 = const()[name = tensor<string, []>("op_267_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_267_cast_fp16 = slice_by_index(begin = var_267_begin_0, end = var_267_end_0, end_mask = var_267_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_267_cast_fp16")];
+            tensor<int32, [4]> var_271_begin_0 = const()[name = tensor<string, []>("op_271_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_271_end_0 = const()[name = tensor<string, []>("op_271_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_271_end_mask_0 = const()[name = tensor<string, []>("op_271_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_271_cast_fp16 = slice_by_index(begin = var_271_begin_0, end = var_271_end_0, end_mask = var_271_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_271_cast_fp16")];
+            tensor<int32, [4]> var_275_begin_0 = const()[name = tensor<string, []>("op_275_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_275_end_0 = const()[name = tensor<string, []>("op_275_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_275_end_mask_0 = const()[name = tensor<string, []>("op_275_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_275_cast_fp16 = slice_by_index(begin = var_275_begin_0, end = var_275_end_0, end_mask = var_275_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_275_cast_fp16")];
+            tensor<int32, [4]> var_279_begin_0 = const()[name = tensor<string, []>("op_279_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_279_end_0 = const()[name = tensor<string, []>("op_279_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_279_end_mask_0 = const()[name = tensor<string, []>("op_279_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_279_cast_fp16 = slice_by_index(begin = var_279_begin_0, end = var_279_end_0, end_mask = var_279_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_279_cast_fp16")];
+            tensor<int32, [4]> var_283_begin_0 = const()[name = tensor<string, []>("op_283_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_283_end_0 = const()[name = tensor<string, []>("op_283_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_283_end_mask_0 = const()[name = tensor<string, []>("op_283_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_283_cast_fp16 = slice_by_index(begin = var_283_begin_0, end = var_283_end_0, end_mask = var_283_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_283_cast_fp16")];
+            tensor<int32, [4]> var_289_begin_0 = const()[name = tensor<string, []>("op_289_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_289_end_0 = const()[name = tensor<string, []>("op_289_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_289_end_mask_0 = const()[name = tensor<string, []>("op_289_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_289_cast_fp16 = slice_by_index(begin = var_289_begin_0, end = var_289_end_0, end_mask = var_289_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_289_cast_fp16")];
+            tensor<int32, [4]> var_301_begin_0 = const()[name = tensor<string, []>("op_301_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_301_end_0 = const()[name = tensor<string, []>("op_301_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_301_end_mask_0 = const()[name = tensor<string, []>("op_301_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_301_cast_fp16 = slice_by_index(begin = var_301_begin_0, end = var_301_end_0, end_mask = var_301_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_301_cast_fp16")];
+            tensor<int32, [4]> var_313_begin_0 = const()[name = tensor<string, []>("op_313_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_313_end_0 = const()[name = tensor<string, []>("op_313_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_313_end_mask_0 = const()[name = tensor<string, []>("op_313_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_313_cast_fp16 = slice_by_index(begin = var_313_begin_0, end = var_313_end_0, end_mask = var_313_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_313_cast_fp16")];
+            tensor<int32, [4]> var_325_begin_0 = const()[name = tensor<string, []>("op_325_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_325_end_0 = const()[name = tensor<string, []>("op_325_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_325_end_mask_0 = const()[name = tensor<string, []>("op_325_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_325_cast_fp16 = slice_by_index(begin = var_325_begin_0, end = var_325_end_0, end_mask = var_325_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_325_cast_fp16")];
+            tensor<int32, [4]> var_337_begin_0 = const()[name = tensor<string, []>("op_337_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_337_end_0 = const()[name = tensor<string, []>("op_337_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_337_end_mask_0 = const()[name = tensor<string, []>("op_337_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_337_cast_fp16 = slice_by_index(begin = var_337_begin_0, end = var_337_end_0, end_mask = var_337_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_337_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = tensor<string, []>("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = tensor<string, []>("op_349_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = tensor<string, []>("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_349_cast_fp16")];
+            tensor<int32, [4]> var_361_begin_0 = const()[name = tensor<string, []>("op_361_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_361_end_0 = const()[name = tensor<string, []>("op_361_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_361_end_mask_0 = const()[name = tensor<string, []>("op_361_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_361_cast_fp16 = slice_by_index(begin = var_361_begin_0, end = var_361_end_0, end_mask = var_361_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_361_cast_fp16")];
+            tensor<int32, [4]> var_373_begin_0 = const()[name = tensor<string, []>("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_373_end_0 = const()[name = tensor<string, []>("op_373_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_373_end_mask_0 = const()[name = tensor<string, []>("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_373_cast_fp16")];
+            tensor<int32, [4]> var_383_begin_0 = const()[name = tensor<string, []>("op_383_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_383_end_0 = const()[name = tensor<string, []>("op_383_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_383_end_mask_0 = const()[name = tensor<string, []>("op_383_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_383_cast_fp16 = slice_by_index(begin = var_383_begin_0, end = var_383_end_0, end_mask = var_383_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_383_cast_fp16")];
+            tensor<int32, [4]> var_395_begin_0 = const()[name = tensor<string, []>("op_395_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_395_end_0 = const()[name = tensor<string, []>("op_395_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_395_end_mask_0 = const()[name = tensor<string, []>("op_395_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_395_cast_fp16 = slice_by_index(begin = var_395_begin_0, end = var_395_end_0, end_mask = var_395_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_395_cast_fp16")];
+            tensor<int32, [4]> var_407_begin_0 = const()[name = tensor<string, []>("op_407_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_407_end_0 = const()[name = tensor<string, []>("op_407_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_407_end_mask_0 = const()[name = tensor<string, []>("op_407_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_407_cast_fp16 = slice_by_index(begin = var_407_begin_0, end = var_407_end_0, end_mask = var_407_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_407_cast_fp16")];
+            tensor<int32, [4]> var_419_begin_0 = const()[name = tensor<string, []>("op_419_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_419_end_0 = const()[name = tensor<string, []>("op_419_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_419_end_mask_0 = const()[name = tensor<string, []>("op_419_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_419_cast_fp16 = slice_by_index(begin = var_419_begin_0, end = var_419_end_0, end_mask = var_419_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_419_cast_fp16")];
+            tensor<int32, [4]> var_431_begin_0 = const()[name = tensor<string, []>("op_431_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_431_end_0 = const()[name = tensor<string, []>("op_431_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_431_end_mask_0 = const()[name = tensor<string, []>("op_431_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_431_cast_fp16 = slice_by_index(begin = var_431_begin_0, end = var_431_end_0, end_mask = var_431_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_431_cast_fp16")];
+            tensor<int32, [4]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_443_cast_fp16 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_443_cast_fp16")];
+            tensor<int32, [4]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_455_cast_fp16 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_455_cast_fp16")];
+            tensor<int32, [4]> var_467_begin_0 = const()[name = tensor<string, []>("op_467_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_467_end_0 = const()[name = tensor<string, []>("op_467_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_467_end_mask_0 = const()[name = tensor<string, []>("op_467_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_467_cast_fp16 = slice_by_index(begin = var_467_begin_0, end = var_467_end_0, end_mask = var_467_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_467_cast_fp16")];
+            tensor<string, []> var_479_equation_0 = const()[name = tensor<string, []>("op_479_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_479_cast_fp16 = einsum(equation = var_479_equation_0, values = (var_289_cast_fp16, var_191_cast_fp16))[name = tensor<string, []>("op_479_cast_fp16")];
+            tensor<fp16, []> var_480_to_fp16 = const()[name = tensor<string, []>("op_480_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_481_cast_fp16 = mul(x = var_479_cast_fp16, y = var_480_to_fp16)[name = tensor<string, []>("op_481_cast_fp16")];
+            tensor<string, []> var_483_equation_0 = const()[name = tensor<string, []>("op_483_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_483_cast_fp16 = einsum(equation = var_483_equation_0, values = (var_289_cast_fp16, var_195_cast_fp16))[name = tensor<string, []>("op_483_cast_fp16")];
+            tensor<fp16, []> var_484_to_fp16 = const()[name = tensor<string, []>("op_484_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_485_cast_fp16 = mul(x = var_483_cast_fp16, y = var_484_to_fp16)[name = tensor<string, []>("op_485_cast_fp16")];
+            tensor<string, []> var_487_equation_0 = const()[name = tensor<string, []>("op_487_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_487_cast_fp16 = einsum(equation = var_487_equation_0, values = (var_289_cast_fp16, var_199_cast_fp16))[name = tensor<string, []>("op_487_cast_fp16")];
+            tensor<fp16, []> var_488_to_fp16 = const()[name = tensor<string, []>("op_488_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_489_cast_fp16 = mul(x = var_487_cast_fp16, y = var_488_to_fp16)[name = tensor<string, []>("op_489_cast_fp16")];
+            tensor<string, []> var_491_equation_0 = const()[name = tensor<string, []>("op_491_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_491_cast_fp16 = einsum(equation = var_491_equation_0, values = (var_301_cast_fp16, var_203_cast_fp16))[name = tensor<string, []>("op_491_cast_fp16")];
+            tensor<fp16, []> var_492_to_fp16 = const()[name = tensor<string, []>("op_492_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_493_cast_fp16 = mul(x = var_491_cast_fp16, y = var_492_to_fp16)[name = tensor<string, []>("op_493_cast_fp16")];
+            tensor<string, []> var_495_equation_0 = const()[name = tensor<string, []>("op_495_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_495_cast_fp16 = einsum(equation = var_495_equation_0, values = (var_301_cast_fp16, var_207_cast_fp16))[name = tensor<string, []>("op_495_cast_fp16")];
+            tensor<fp16, []> var_496_to_fp16 = const()[name = tensor<string, []>("op_496_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_497_cast_fp16 = mul(x = var_495_cast_fp16, y = var_496_to_fp16)[name = tensor<string, []>("op_497_cast_fp16")];
+            tensor<string, []> var_499_equation_0 = const()[name = tensor<string, []>("op_499_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_499_cast_fp16 = einsum(equation = var_499_equation_0, values = (var_301_cast_fp16, var_211_cast_fp16))[name = tensor<string, []>("op_499_cast_fp16")];
+            tensor<fp16, []> var_500_to_fp16 = const()[name = tensor<string, []>("op_500_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_501_cast_fp16 = mul(x = var_499_cast_fp16, y = var_500_to_fp16)[name = tensor<string, []>("op_501_cast_fp16")];
+            tensor<string, []> var_503_equation_0 = const()[name = tensor<string, []>("op_503_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_503_cast_fp16 = einsum(equation = var_503_equation_0, values = (var_313_cast_fp16, var_215_cast_fp16))[name = tensor<string, []>("op_503_cast_fp16")];
+            tensor<fp16, []> var_504_to_fp16 = const()[name = tensor<string, []>("op_504_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_505_cast_fp16 = mul(x = var_503_cast_fp16, y = var_504_to_fp16)[name = tensor<string, []>("op_505_cast_fp16")];
+            tensor<string, []> var_507_equation_0 = const()[name = tensor<string, []>("op_507_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_507_cast_fp16 = einsum(equation = var_507_equation_0, values = (var_313_cast_fp16, var_219_cast_fp16))[name = tensor<string, []>("op_507_cast_fp16")];
+            tensor<fp16, []> var_508_to_fp16 = const()[name = tensor<string, []>("op_508_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_509_cast_fp16 = mul(x = var_507_cast_fp16, y = var_508_to_fp16)[name = tensor<string, []>("op_509_cast_fp16")];
+            tensor<string, []> var_511_equation_0 = const()[name = tensor<string, []>("op_511_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_511_cast_fp16 = einsum(equation = var_511_equation_0, values = (var_313_cast_fp16, var_223_cast_fp16))[name = tensor<string, []>("op_511_cast_fp16")];
+            tensor<fp16, []> var_512_to_fp16 = const()[name = tensor<string, []>("op_512_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_513_cast_fp16 = mul(x = var_511_cast_fp16, y = var_512_to_fp16)[name = tensor<string, []>("op_513_cast_fp16")];
+            tensor<string, []> var_515_equation_0 = const()[name = tensor<string, []>("op_515_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_515_cast_fp16 = einsum(equation = var_515_equation_0, values = (var_325_cast_fp16, var_227_cast_fp16))[name = tensor<string, []>("op_515_cast_fp16")];
+            tensor<fp16, []> var_516_to_fp16 = const()[name = tensor<string, []>("op_516_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_517_cast_fp16 = mul(x = var_515_cast_fp16, y = var_516_to_fp16)[name = tensor<string, []>("op_517_cast_fp16")];
+            tensor<string, []> var_519_equation_0 = const()[name = tensor<string, []>("op_519_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_519_cast_fp16 = einsum(equation = var_519_equation_0, values = (var_325_cast_fp16, var_231_cast_fp16))[name = tensor<string, []>("op_519_cast_fp16")];
+            tensor<fp16, []> var_520_to_fp16 = const()[name = tensor<string, []>("op_520_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_521_cast_fp16 = mul(x = var_519_cast_fp16, y = var_520_to_fp16)[name = tensor<string, []>("op_521_cast_fp16")];
+            tensor<string, []> var_523_equation_0 = const()[name = tensor<string, []>("op_523_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_523_cast_fp16 = einsum(equation = var_523_equation_0, values = (var_325_cast_fp16, var_235_cast_fp16))[name = tensor<string, []>("op_523_cast_fp16")];
+            tensor<fp16, []> var_524_to_fp16 = const()[name = tensor<string, []>("op_524_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_525_cast_fp16 = mul(x = var_523_cast_fp16, y = var_524_to_fp16)[name = tensor<string, []>("op_525_cast_fp16")];
+            tensor<string, []> var_527_equation_0 = const()[name = tensor<string, []>("op_527_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_527_cast_fp16 = einsum(equation = var_527_equation_0, values = (var_337_cast_fp16, var_239_cast_fp16))[name = tensor<string, []>("op_527_cast_fp16")];
+            tensor<fp16, []> var_528_to_fp16 = const()[name = tensor<string, []>("op_528_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_529_cast_fp16 = mul(x = var_527_cast_fp16, y = var_528_to_fp16)[name = tensor<string, []>("op_529_cast_fp16")];
+            tensor<string, []> var_531_equation_0 = const()[name = tensor<string, []>("op_531_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_531_cast_fp16 = einsum(equation = var_531_equation_0, values = (var_337_cast_fp16, var_243_cast_fp16))[name = tensor<string, []>("op_531_cast_fp16")];
+            tensor<fp16, []> var_532_to_fp16 = const()[name = tensor<string, []>("op_532_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_533_cast_fp16 = mul(x = var_531_cast_fp16, y = var_532_to_fp16)[name = tensor<string, []>("op_533_cast_fp16")];
+            tensor<string, []> var_535_equation_0 = const()[name = tensor<string, []>("op_535_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_535_cast_fp16 = einsum(equation = var_535_equation_0, values = (var_337_cast_fp16, var_247_cast_fp16))[name = tensor<string, []>("op_535_cast_fp16")];
+            tensor<fp16, []> var_536_to_fp16 = const()[name = tensor<string, []>("op_536_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_537_cast_fp16 = mul(x = var_535_cast_fp16, y = var_536_to_fp16)[name = tensor<string, []>("op_537_cast_fp16")];
+            tensor<string, []> var_539_equation_0 = const()[name = tensor<string, []>("op_539_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_539_cast_fp16 = einsum(equation = var_539_equation_0, values = (var_349_cast_fp16, var_251_cast_fp16))[name = tensor<string, []>("op_539_cast_fp16")];
+            tensor<fp16, []> var_540_to_fp16 = const()[name = tensor<string, []>("op_540_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_541_cast_fp16 = mul(x = var_539_cast_fp16, y = var_540_to_fp16)[name = tensor<string, []>("op_541_cast_fp16")];
+            tensor<string, []> var_543_equation_0 = const()[name = tensor<string, []>("op_543_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_543_cast_fp16 = einsum(equation = var_543_equation_0, values = (var_349_cast_fp16, var_255_cast_fp16))[name = tensor<string, []>("op_543_cast_fp16")];
+            tensor<fp16, []> var_544_to_fp16 = const()[name = tensor<string, []>("op_544_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_545_cast_fp16 = mul(x = var_543_cast_fp16, y = var_544_to_fp16)[name = tensor<string, []>("op_545_cast_fp16")];
+            tensor<string, []> var_547_equation_0 = const()[name = tensor<string, []>("op_547_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_547_cast_fp16 = einsum(equation = var_547_equation_0, values = (var_349_cast_fp16, var_259_cast_fp16))[name = tensor<string, []>("op_547_cast_fp16")];
+            tensor<fp16, []> var_548_to_fp16 = const()[name = tensor<string, []>("op_548_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_549_cast_fp16 = mul(x = var_547_cast_fp16, y = var_548_to_fp16)[name = tensor<string, []>("op_549_cast_fp16")];
+            tensor<string, []> var_551_equation_0 = const()[name = tensor<string, []>("op_551_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_551_cast_fp16 = einsum(equation = var_551_equation_0, values = (var_361_cast_fp16, var_263_cast_fp16))[name = tensor<string, []>("op_551_cast_fp16")];
+            tensor<fp16, []> var_552_to_fp16 = const()[name = tensor<string, []>("op_552_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_553_cast_fp16 = mul(x = var_551_cast_fp16, y = var_552_to_fp16)[name = tensor<string, []>("op_553_cast_fp16")];
+            tensor<string, []> var_555_equation_0 = const()[name = tensor<string, []>("op_555_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_555_cast_fp16 = einsum(equation = var_555_equation_0, values = (var_361_cast_fp16, var_267_cast_fp16))[name = tensor<string, []>("op_555_cast_fp16")];
+            tensor<fp16, []> var_556_to_fp16 = const()[name = tensor<string, []>("op_556_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_557_cast_fp16 = mul(x = var_555_cast_fp16, y = var_556_to_fp16)[name = tensor<string, []>("op_557_cast_fp16")];
+            tensor<string, []> var_559_equation_0 = const()[name = tensor<string, []>("op_559_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_559_cast_fp16 = einsum(equation = var_559_equation_0, values = (var_361_cast_fp16, var_271_cast_fp16))[name = tensor<string, []>("op_559_cast_fp16")];
+            tensor<fp16, []> var_560_to_fp16 = const()[name = tensor<string, []>("op_560_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_561_cast_fp16 = mul(x = var_559_cast_fp16, y = var_560_to_fp16)[name = tensor<string, []>("op_561_cast_fp16")];
+            tensor<string, []> var_563_equation_0 = const()[name = tensor<string, []>("op_563_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_563_cast_fp16 = einsum(equation = var_563_equation_0, values = (var_373_cast_fp16, var_275_cast_fp16))[name = tensor<string, []>("op_563_cast_fp16")];
+            tensor<fp16, []> var_564_to_fp16 = const()[name = tensor<string, []>("op_564_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_565_cast_fp16 = mul(x = var_563_cast_fp16, y = var_564_to_fp16)[name = tensor<string, []>("op_565_cast_fp16")];
+            tensor<string, []> var_567_equation_0 = const()[name = tensor<string, []>("op_567_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_567_cast_fp16 = einsum(equation = var_567_equation_0, values = (var_373_cast_fp16, var_279_cast_fp16))[name = tensor<string, []>("op_567_cast_fp16")];
+            tensor<fp16, []> var_568_to_fp16 = const()[name = tensor<string, []>("op_568_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_569_cast_fp16 = mul(x = var_567_cast_fp16, y = var_568_to_fp16)[name = tensor<string, []>("op_569_cast_fp16")];
+            tensor<string, []> var_571_equation_0 = const()[name = tensor<string, []>("op_571_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_571_cast_fp16 = einsum(equation = var_571_equation_0, values = (var_373_cast_fp16, var_283_cast_fp16))[name = tensor<string, []>("op_571_cast_fp16")];
+            tensor<fp16, []> var_572_to_fp16 = const()[name = tensor<string, []>("op_572_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_573_cast_fp16 = mul(x = var_571_cast_fp16, y = var_572_to_fp16)[name = tensor<string, []>("op_573_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_1_cast_fp16 = add(x = var_481_cast_fp16, y = mask)[name = tensor<string, []>("aw_1_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_3_cast_fp16 = add(x = var_485_cast_fp16, y = mask)[name = tensor<string, []>("aw_3_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_5_cast_fp16 = add(x = var_489_cast_fp16, y = mask)[name = tensor<string, []>("aw_5_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_7_cast_fp16 = add(x = var_493_cast_fp16, y = mask)[name = tensor<string, []>("aw_7_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_9_cast_fp16 = add(x = var_497_cast_fp16, y = mask)[name = tensor<string, []>("aw_9_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_11_cast_fp16 = add(x = var_501_cast_fp16, y = mask)[name = tensor<string, []>("aw_11_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_13_cast_fp16 = add(x = var_505_cast_fp16, y = mask)[name = tensor<string, []>("aw_13_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_15_cast_fp16 = add(x = var_509_cast_fp16, y = mask)[name = tensor<string, []>("aw_15_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_17_cast_fp16 = add(x = var_513_cast_fp16, y = mask)[name = tensor<string, []>("aw_17_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_19_cast_fp16 = add(x = var_517_cast_fp16, y = mask)[name = tensor<string, []>("aw_19_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_21_cast_fp16 = add(x = var_521_cast_fp16, y = mask)[name = tensor<string, []>("aw_21_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_23_cast_fp16 = add(x = var_525_cast_fp16, y = mask)[name = tensor<string, []>("aw_23_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_25_cast_fp16 = add(x = var_529_cast_fp16, y = mask)[name = tensor<string, []>("aw_25_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_27_cast_fp16 = add(x = var_533_cast_fp16, y = mask)[name = tensor<string, []>("aw_27_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_29_cast_fp16 = add(x = var_537_cast_fp16, y = mask)[name = tensor<string, []>("aw_29_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_31_cast_fp16 = add(x = var_541_cast_fp16, y = mask)[name = tensor<string, []>("aw_31_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_33_cast_fp16 = add(x = var_545_cast_fp16, y = mask)[name = tensor<string, []>("aw_33_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_35_cast_fp16 = add(x = var_549_cast_fp16, y = mask)[name = tensor<string, []>("aw_35_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_37_cast_fp16 = add(x = var_553_cast_fp16, y = mask)[name = tensor<string, []>("aw_37_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_39_cast_fp16 = add(x = var_557_cast_fp16, y = mask)[name = tensor<string, []>("aw_39_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_41_cast_fp16 = add(x = var_561_cast_fp16, y = mask)[name = tensor<string, []>("aw_41_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_43_cast_fp16 = add(x = var_565_cast_fp16, y = mask)[name = tensor<string, []>("aw_43_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_45_cast_fp16 = add(x = var_569_cast_fp16, y = mask)[name = tensor<string, []>("aw_45_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_47_cast_fp16 = add(x = var_573_cast_fp16, y = mask)[name = tensor<string, []>("aw_47_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_598_cast_fp16 = softmax(axis = var_52, x = aw_1_cast_fp16)[name = tensor<string, []>("op_598_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_599_cast_fp16 = softmax(axis = var_52, x = aw_3_cast_fp16)[name = tensor<string, []>("op_599_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_600_cast_fp16 = softmax(axis = var_52, x = aw_5_cast_fp16)[name = tensor<string, []>("op_600_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_601_cast_fp16 = softmax(axis = var_52, x = aw_7_cast_fp16)[name = tensor<string, []>("op_601_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_602_cast_fp16 = softmax(axis = var_52, x = aw_9_cast_fp16)[name = tensor<string, []>("op_602_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_603_cast_fp16 = softmax(axis = var_52, x = aw_11_cast_fp16)[name = tensor<string, []>("op_603_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_604_cast_fp16 = softmax(axis = var_52, x = aw_13_cast_fp16)[name = tensor<string, []>("op_604_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_605_cast_fp16 = softmax(axis = var_52, x = aw_15_cast_fp16)[name = tensor<string, []>("op_605_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_606_cast_fp16 = softmax(axis = var_52, x = aw_17_cast_fp16)[name = tensor<string, []>("op_606_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_607_cast_fp16 = softmax(axis = var_52, x = aw_19_cast_fp16)[name = tensor<string, []>("op_607_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_608_cast_fp16 = softmax(axis = var_52, x = aw_21_cast_fp16)[name = tensor<string, []>("op_608_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_609_cast_fp16 = softmax(axis = var_52, x = aw_23_cast_fp16)[name = tensor<string, []>("op_609_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_610_cast_fp16 = softmax(axis = var_52, x = aw_25_cast_fp16)[name = tensor<string, []>("op_610_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_611_cast_fp16 = softmax(axis = var_52, x = aw_27_cast_fp16)[name = tensor<string, []>("op_611_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_612_cast_fp16 = softmax(axis = var_52, x = aw_29_cast_fp16)[name = tensor<string, []>("op_612_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_613_cast_fp16 = softmax(axis = var_52, x = aw_31_cast_fp16)[name = tensor<string, []>("op_613_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_614_cast_fp16 = softmax(axis = var_52, x = aw_33_cast_fp16)[name = tensor<string, []>("op_614_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_615_cast_fp16 = softmax(axis = var_52, x = aw_35_cast_fp16)[name = tensor<string, []>("op_615_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_616_cast_fp16 = softmax(axis = var_52, x = aw_37_cast_fp16)[name = tensor<string, []>("op_616_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_617_cast_fp16 = softmax(axis = var_52, x = aw_39_cast_fp16)[name = tensor<string, []>("op_617_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_618_cast_fp16 = softmax(axis = var_52, x = aw_41_cast_fp16)[name = tensor<string, []>("op_618_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_619_cast_fp16 = softmax(axis = var_52, x = aw_43_cast_fp16)[name = tensor<string, []>("op_619_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_620_cast_fp16 = softmax(axis = var_52, x = aw_45_cast_fp16)[name = tensor<string, []>("op_620_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_621_cast_fp16 = softmax(axis = var_52, x = aw_47_cast_fp16)[name = tensor<string, []>("op_621_cast_fp16")];
+            tensor<string, []> var_623_equation_0 = const()[name = tensor<string, []>("op_623_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_623_cast_fp16 = einsum(equation = var_623_equation_0, values = (var_383_cast_fp16, var_598_cast_fp16))[name = tensor<string, []>("op_623_cast_fp16")];
+            tensor<string, []> var_625_equation_0 = const()[name = tensor<string, []>("op_625_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_625_cast_fp16 = einsum(equation = var_625_equation_0, values = (var_383_cast_fp16, var_599_cast_fp16))[name = tensor<string, []>("op_625_cast_fp16")];
+            tensor<string, []> var_627_equation_0 = const()[name = tensor<string, []>("op_627_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_627_cast_fp16 = einsum(equation = var_627_equation_0, values = (var_383_cast_fp16, var_600_cast_fp16))[name = tensor<string, []>("op_627_cast_fp16")];
+            tensor<string, []> var_629_equation_0 = const()[name = tensor<string, []>("op_629_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_629_cast_fp16 = einsum(equation = var_629_equation_0, values = (var_395_cast_fp16, var_601_cast_fp16))[name = tensor<string, []>("op_629_cast_fp16")];
+            tensor<string, []> var_631_equation_0 = const()[name = tensor<string, []>("op_631_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_631_cast_fp16 = einsum(equation = var_631_equation_0, values = (var_395_cast_fp16, var_602_cast_fp16))[name = tensor<string, []>("op_631_cast_fp16")];
+            tensor<string, []> var_633_equation_0 = const()[name = tensor<string, []>("op_633_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_633_cast_fp16 = einsum(equation = var_633_equation_0, values = (var_395_cast_fp16, var_603_cast_fp16))[name = tensor<string, []>("op_633_cast_fp16")];
+            tensor<string, []> var_635_equation_0 = const()[name = tensor<string, []>("op_635_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_635_cast_fp16 = einsum(equation = var_635_equation_0, values = (var_407_cast_fp16, var_604_cast_fp16))[name = tensor<string, []>("op_635_cast_fp16")];
+            tensor<string, []> var_637_equation_0 = const()[name = tensor<string, []>("op_637_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_637_cast_fp16 = einsum(equation = var_637_equation_0, values = (var_407_cast_fp16, var_605_cast_fp16))[name = tensor<string, []>("op_637_cast_fp16")];
+            tensor<string, []> var_639_equation_0 = const()[name = tensor<string, []>("op_639_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_639_cast_fp16 = einsum(equation = var_639_equation_0, values = (var_407_cast_fp16, var_606_cast_fp16))[name = tensor<string, []>("op_639_cast_fp16")];
+            tensor<string, []> var_641_equation_0 = const()[name = tensor<string, []>("op_641_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_641_cast_fp16 = einsum(equation = var_641_equation_0, values = (var_419_cast_fp16, var_607_cast_fp16))[name = tensor<string, []>("op_641_cast_fp16")];
+            tensor<string, []> var_643_equation_0 = const()[name = tensor<string, []>("op_643_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_643_cast_fp16 = einsum(equation = var_643_equation_0, values = (var_419_cast_fp16, var_608_cast_fp16))[name = tensor<string, []>("op_643_cast_fp16")];
+            tensor<string, []> var_645_equation_0 = const()[name = tensor<string, []>("op_645_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_645_cast_fp16 = einsum(equation = var_645_equation_0, values = (var_419_cast_fp16, var_609_cast_fp16))[name = tensor<string, []>("op_645_cast_fp16")];
+            tensor<string, []> var_647_equation_0 = const()[name = tensor<string, []>("op_647_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_647_cast_fp16 = einsum(equation = var_647_equation_0, values = (var_431_cast_fp16, var_610_cast_fp16))[name = tensor<string, []>("op_647_cast_fp16")];
+            tensor<string, []> var_649_equation_0 = const()[name = tensor<string, []>("op_649_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_649_cast_fp16 = einsum(equation = var_649_equation_0, values = (var_431_cast_fp16, var_611_cast_fp16))[name = tensor<string, []>("op_649_cast_fp16")];
+            tensor<string, []> var_651_equation_0 = const()[name = tensor<string, []>("op_651_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_651_cast_fp16 = einsum(equation = var_651_equation_0, values = (var_431_cast_fp16, var_612_cast_fp16))[name = tensor<string, []>("op_651_cast_fp16")];
+            tensor<string, []> var_653_equation_0 = const()[name = tensor<string, []>("op_653_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_653_cast_fp16 = einsum(equation = var_653_equation_0, values = (var_443_cast_fp16, var_613_cast_fp16))[name = tensor<string, []>("op_653_cast_fp16")];
+            tensor<string, []> var_655_equation_0 = const()[name = tensor<string, []>("op_655_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_655_cast_fp16 = einsum(equation = var_655_equation_0, values = (var_443_cast_fp16, var_614_cast_fp16))[name = tensor<string, []>("op_655_cast_fp16")];
+            tensor<string, []> var_657_equation_0 = const()[name = tensor<string, []>("op_657_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_657_cast_fp16 = einsum(equation = var_657_equation_0, values = (var_443_cast_fp16, var_615_cast_fp16))[name = tensor<string, []>("op_657_cast_fp16")];
+            tensor<string, []> var_659_equation_0 = const()[name = tensor<string, []>("op_659_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_659_cast_fp16 = einsum(equation = var_659_equation_0, values = (var_455_cast_fp16, var_616_cast_fp16))[name = tensor<string, []>("op_659_cast_fp16")];
+            tensor<string, []> var_661_equation_0 = const()[name = tensor<string, []>("op_661_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_661_cast_fp16 = einsum(equation = var_661_equation_0, values = (var_455_cast_fp16, var_617_cast_fp16))[name = tensor<string, []>("op_661_cast_fp16")];
+            tensor<string, []> var_663_equation_0 = const()[name = tensor<string, []>("op_663_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_663_cast_fp16 = einsum(equation = var_663_equation_0, values = (var_455_cast_fp16, var_618_cast_fp16))[name = tensor<string, []>("op_663_cast_fp16")];
+            tensor<string, []> var_665_equation_0 = const()[name = tensor<string, []>("op_665_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_665_cast_fp16 = einsum(equation = var_665_equation_0, values = (var_467_cast_fp16, var_619_cast_fp16))[name = tensor<string, []>("op_665_cast_fp16")];
+            tensor<string, []> var_667_equation_0 = const()[name = tensor<string, []>("op_667_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_667_cast_fp16 = einsum(equation = var_667_equation_0, values = (var_467_cast_fp16, var_620_cast_fp16))[name = tensor<string, []>("op_667_cast_fp16")];
+            tensor<string, []> var_669_equation_0 = const()[name = tensor<string, []>("op_669_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_669_cast_fp16 = einsum(equation = var_669_equation_0, values = (var_467_cast_fp16, var_621_cast_fp16))[name = tensor<string, []>("op_669_cast_fp16")];
+            tensor<bool, []> x_11_interleave_0 = const()[name = tensor<string, []>("x_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_11_cast_fp16 = concat(axis = var_52, interleave = x_11_interleave_0, values = (var_623_cast_fp16, var_625_cast_fp16, var_627_cast_fp16, var_629_cast_fp16, var_631_cast_fp16, var_633_cast_fp16, var_635_cast_fp16, var_637_cast_fp16, var_639_cast_fp16, var_641_cast_fp16, var_643_cast_fp16, var_645_cast_fp16, var_647_cast_fp16, var_649_cast_fp16, var_651_cast_fp16, var_653_cast_fp16, var_655_cast_fp16, var_657_cast_fp16, var_659_cast_fp16, var_661_cast_fp16, var_663_cast_fp16, var_665_cast_fp16, var_667_cast_fp16, var_669_cast_fp16))[name = tensor<string, []>("x_11_cast_fp16")];
+            tensor<int32, [4]> var_674 = const()[name = tensor<string, []>("op_674"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_3_cast_fp16 = reshape(shape = var_674, x = x_11_cast_fp16)[name = tensor<string, []>("input_3_cast_fp16")];
+            tensor<int32, [2]> var_677 = const()[name = tensor<string, []>("op_677"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_679 = const()[name = tensor<string, []>("op_679"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_1_pad_type_0 = const()[name = tensor<string, []>("attention_output_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_1_pad_0 = const()[name = tensor<string, []>("attention_output_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31463936)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_1_cast_fp16 = conv(dilations = var_679, groups = var_52, pad = attention_output_1_pad_0, pad_type = attention_output_1_pad_type_0, strides = var_677, weight = blocks_0_attn_proj_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_13_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = tensor<string, []>("x_13_cast_fp16")];
+            tensor<bool, []> x_eps_3_interleave_0 = const()[name = tensor<string, []>("x_eps_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_3_to_fp16 = const()[name = tensor<string, []>("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338368)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_3_cast_fp16 = concat(axis = var_52, interleave = x_eps_3_interleave_0, values = (x_13_cast_fp16, eps_chan_3_to_fp16))[name = tensor<string, []>("x_eps_3_cast_fp16")];
+            tensor<int32, [1]> norm_x_3_axes_0 = const()[name = tensor<string, []>("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_55, x = x_eps_3_cast_fp16)[name = tensor<string, []>("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_7_cast_fp16 = real_div(x = x_13_cast_fp16, y = norm_x_3_cast_fp16)[name = tensor<string, []>("x_normed_7_cast_fp16")];
+            tensor<fp16, []> var_705_to_fp16 = const()[name = tensor<string, []>("op_705_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_705_to_fp16)[name = tensor<string, []>("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338560)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_5_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
+            tensor<int32, [2]> var_716 = const()[name = tensor<string, []>("op_716"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_718 = const()[name = tensor<string, []>("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_7_pad_type_0 = const()[name = tensor<string, []>("input_7_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_7_pad_0 = const()[name = tensor<string, []>("input_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50344768)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_7_cast_fp16 = conv(dilations = var_718, groups = var_52, pad = input_7_pad_0, pad_type = input_7_pad_type_0, strides = var_716, weight = blocks_0_mlp_fc_1_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
+            tensor<int32, [2]> var_722 = const()[name = tensor<string, []>("op_722"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_724 = const()[name = tensor<string, []>("op_724"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_1_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_1_pad_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(100676480)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_1_cast_fp16 = conv(dilations = var_724, groups = var_52, pad = x_fc_2_1_pad_0, pad_type = x_fc_2_1_pad_type_0, strides = var_722, weight = blocks_0_mlp_fc_2_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_727_cast_fp16 = silu(x = input_7_cast_fp16)[name = tensor<string, []>("op_727_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_9_cast_fp16 = mul(x = var_727_cast_fp16, y = x_fc_2_1_cast_fp16)[name = tensor<string, []>("input_9_cast_fp16")];
+            tensor<int32, [2]> var_730 = const()[name = tensor<string, []>("op_730"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_732 = const()[name = tensor<string, []>("op_732"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_734_pad_type_0 = const()[name = tensor<string, []>("op_734_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_734_pad_0 = const()[name = tensor<string, []>("op_734_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_0_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151008192)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_734_cast_fp16 = conv(dilations = var_732, groups = var_52, pad = var_734_pad_0, pad_type = var_734_pad_type_0, strides = var_730, weight = blocks_0_mlp_proj_weight_to_fp16, x = input_9_cast_fp16)[name = tensor<string, []>("op_734_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_17_cast_fp16 = add(x = var_734_cast_fp16, y = x_13_cast_fp16)[name = tensor<string, []>("x_17_cast_fp16")];
+            tensor<int32, []> var_740 = const()[name = tensor<string, []>("op_740"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_744 = const()[name = tensor<string, []>("op_744"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_746 = const()[name = tensor<string, []>("op_746"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_779 = const()[name = tensor<string, []>("op_779"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_782 = const()[name = tensor<string, []>("op_782"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_5_interleave_0 = const()[name = tensor<string, []>("x_eps_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_5_to_fp16 = const()[name = tensor<string, []>("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201339904)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_5_cast_fp16 = concat(axis = var_779, interleave = x_eps_5_interleave_0, values = (x_17_cast_fp16, eps_chan_5_to_fp16))[name = tensor<string, []>("x_eps_5_cast_fp16")];
+            tensor<int32, [1]> norm_x_5_axes_0 = const()[name = tensor<string, []>("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_782, x = x_eps_5_cast_fp16)[name = tensor<string, []>("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_13_cast_fp16 = real_div(x = x_17_cast_fp16, y = norm_x_5_cast_fp16)[name = tensor<string, []>("x_normed_13_cast_fp16")];
+            tensor<fp16, []> var_805_to_fp16 = const()[name = tensor<string, []>("op_805_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_805_to_fp16)[name = tensor<string, []>("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201340096)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_21_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = tensor<string, []>("x_21_cast_fp16")];
+            tensor<int32, [4]> var_829 = const()[name = tensor<string, []>("op_829"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_11_cast_fp16 = reshape(shape = var_829, x = x_21_cast_fp16)[name = tensor<string, []>("input_11_cast_fp16")];
+            tensor<int32, [2]> var_832 = const()[name = tensor<string, []>("op_832"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_834 = const()[name = tensor<string, []>("op_834"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_9_pad_type_0 = const()[name = tensor<string, []>("q_9_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_9_pad_0 = const()[name = tensor<string, []>("q_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201346304)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_9_cast_fp16 = conv(dilations = var_834, groups = var_779, pad = q_9_pad_0, pad_type = q_9_pad_type_0, strides = var_832, weight = blocks_1_attn_q_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("q_9_cast_fp16")];
+            tensor<int32, [2]> var_838 = const()[name = tensor<string, []>("op_838"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_840 = const()[name = tensor<string, []>("op_840"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_13_pad_type_0 = const()[name = tensor<string, []>("k_13_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_13_pad_0 = const()[name = tensor<string, []>("k_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(220220736)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_13_cast_fp16 = conv(dilations = var_840, groups = var_779, pad = k_13_pad_0, pad_type = k_13_pad_type_0, strides = var_838, weight = blocks_1_attn_k_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("k_13_cast_fp16")];
+            tensor<int32, [2]> var_844 = const()[name = tensor<string, []>("op_844"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_846 = const()[name = tensor<string, []>("op_846"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_11_pad_type_0 = const()[name = tensor<string, []>("v_11_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_11_pad_0 = const()[name = tensor<string, []>("v_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226512256)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_11_cast_fp16 = conv(dilations = var_846, groups = var_779, pad = v_11_pad_0, pad_type = v_11_pad_type_0, strides = var_844, weight = blocks_1_attn_v_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("v_11_cast_fp16")];
+            tensor<int32, [4]> var_849 = const()[name = tensor<string, []>("op_849"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_11_cast_fp16 = reshape(shape = var_849, x = q_9_cast_fp16)[name = tensor<string, []>("q_11_cast_fp16")];
+            tensor<int32, [4]> var_851 = const()[name = tensor<string, []>("op_851"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_15_cast_fp16 = reshape(shape = var_851, x = k_13_cast_fp16)[name = tensor<string, []>("k_15_cast_fp16")];
+            tensor<int32, [4]> var_865_begin_0 = const()[name = tensor<string, []>("op_865_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_865_end_0 = const()[name = tensor<string, []>("op_865_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_865_end_mask_0 = const()[name = tensor<string, []>("op_865_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_865_cast_fp16 = slice_by_index(begin = var_865_begin_0, end = var_865_end_0, end_mask = var_865_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_865_cast_fp16")];
+            tensor<int32, [4]> var_871_begin_0 = const()[name = tensor<string, []>("op_871_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_871_end_0 = const()[name = tensor<string, []>("op_871_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_871_end_mask_0 = const()[name = tensor<string, []>("op_871_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_871_cast_fp16 = slice_by_index(begin = var_871_begin_0, end = var_871_end_0, end_mask = var_871_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_871_cast_fp16")];
+            tensor<fp16, []> const_30_promoted_to_fp16 = const()[name = tensor<string, []>("const_30_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_873_cast_fp16 = mul(x = var_871_cast_fp16, y = const_30_promoted_to_fp16)[name = tensor<string, []>("op_873_cast_fp16")];
+            tensor<bool, []> rotated_5_interleave_0 = const()[name = tensor<string, []>("rotated_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_5_cast_fp16 = concat(axis = var_744, interleave = rotated_5_interleave_0, values = (var_873_cast_fp16, var_865_cast_fp16))[name = tensor<string, []>("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_876_cast_fp16 = mul(x = q_11_cast_fp16, y = cos)[name = tensor<string, []>("op_876_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_877_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = tensor<string, []>("op_877_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_5_cast_fp16 = add(x = var_876_cast_fp16, y = var_877_cast_fp16)[name = tensor<string, []>("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_890_begin_0 = const()[name = tensor<string, []>("op_890_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_890_end_0 = const()[name = tensor<string, []>("op_890_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_890_end_mask_0 = const()[name = tensor<string, []>("op_890_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_890_cast_fp16 = slice_by_index(begin = var_890_begin_0, end = var_890_end_0, end_mask = var_890_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_890_cast_fp16")];
+            tensor<int32, [4]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_896_cast_fp16 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_896_cast_fp16")];
+            tensor<fp16, []> const_32_promoted_to_fp16 = const()[name = tensor<string, []>("const_32_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_898_cast_fp16 = mul(x = var_896_cast_fp16, y = const_32_promoted_to_fp16)[name = tensor<string, []>("op_898_cast_fp16")];
+            tensor<bool, []> rotated_interleave_0 = const()[name = tensor<string, []>("rotated_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_cast_fp16 = concat(axis = var_744, interleave = rotated_interleave_0, values = (var_898_cast_fp16, var_890_cast_fp16))[name = tensor<string, []>("rotated_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_901_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = tensor<string, []>("op_901_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_902_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = tensor<string, []>("op_902_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_cast_fp16 = add(x = var_901_cast_fp16, y = var_902_cast_fp16)[name = tensor<string, []>("roped_cast_fp16")];
+            tensor<int32, [4]> var_905 = const()[name = tensor<string, []>("op_905"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_19_cast_fp16 = reshape(shape = var_905, x = roped_cast_fp16)[name = tensor<string, []>("k_19_cast_fp16")];
+            tensor<int32, [4]> var_907 = const()[name = tensor<string, []>("op_907"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_1 = reshape(shape = var_907, x = v_11_cast_fp16)[name = tensor<string, []>("new_v_cache_1_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_interleave_0 = const()[name = tensor<string, []>("k_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_1 = transpose(perm = k_21_perm_0, x = k_19_cast_fp16)[name = tensor<string, []>("transpose_0")];
+            tensor<fp16, [1, 512, 1, 1024]> k_cast_fp16 = concat(axis = var_746, interleave = k_interleave_0, values = (k_cache_1, new_k_cache_1))[name = tensor<string, []>("k_cast_fp16")];
+            tensor<bool, []> v_17_interleave_0 = const()[name = tensor<string, []>("v_17_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_17_cast_fp16 = concat(axis = var_740, interleave = v_17_interleave_0, values = (v_cache_1, new_v_cache_1))[name = tensor<string, []>("v_17_cast_fp16")];
+            tensor<int32, [4]> var_915 = const()[name = tensor<string, []>("op_915"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_cast_fp16 = reshape(shape = var_915, x = roped_5_cast_fp16)[name = tensor<string, []>("q_cast_fp16")];
+            tensor<int32, [4]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_920_cast_fp16 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_920_cast_fp16")];
+            tensor<int32, [4]> var_924_begin_0 = const()[name = tensor<string, []>("op_924_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_924_end_0 = const()[name = tensor<string, []>("op_924_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_924_end_mask_0 = const()[name = tensor<string, []>("op_924_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_924_cast_fp16 = slice_by_index(begin = var_924_begin_0, end = var_924_end_0, end_mask = var_924_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_924_cast_fp16")];
+            tensor<int32, [4]> var_928_begin_0 = const()[name = tensor<string, []>("op_928_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_928_end_0 = const()[name = tensor<string, []>("op_928_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_928_end_mask_0 = const()[name = tensor<string, []>("op_928_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_928_cast_fp16 = slice_by_index(begin = var_928_begin_0, end = var_928_end_0, end_mask = var_928_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_928_cast_fp16")];
+            tensor<int32, [4]> var_932_begin_0 = const()[name = tensor<string, []>("op_932_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_932_end_0 = const()[name = tensor<string, []>("op_932_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_932_end_mask_0 = const()[name = tensor<string, []>("op_932_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_932_cast_fp16 = slice_by_index(begin = var_932_begin_0, end = var_932_end_0, end_mask = var_932_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_932_cast_fp16")];
+            tensor<int32, [4]> var_936_begin_0 = const()[name = tensor<string, []>("op_936_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_936_end_0 = const()[name = tensor<string, []>("op_936_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_936_end_mask_0 = const()[name = tensor<string, []>("op_936_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_936_cast_fp16 = slice_by_index(begin = var_936_begin_0, end = var_936_end_0, end_mask = var_936_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_936_cast_fp16")];
+            tensor<int32, [4]> var_940_begin_0 = const()[name = tensor<string, []>("op_940_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_940_end_0 = const()[name = tensor<string, []>("op_940_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_940_end_mask_0 = const()[name = tensor<string, []>("op_940_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_940_cast_fp16 = slice_by_index(begin = var_940_begin_0, end = var_940_end_0, end_mask = var_940_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_940_cast_fp16")];
+            tensor<int32, [4]> var_944_begin_0 = const()[name = tensor<string, []>("op_944_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_944_end_0 = const()[name = tensor<string, []>("op_944_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_944_end_mask_0 = const()[name = tensor<string, []>("op_944_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_944_cast_fp16 = slice_by_index(begin = var_944_begin_0, end = var_944_end_0, end_mask = var_944_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_944_cast_fp16")];
+            tensor<int32, [4]> var_948_begin_0 = const()[name = tensor<string, []>("op_948_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_948_end_0 = const()[name = tensor<string, []>("op_948_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_948_end_mask_0 = const()[name = tensor<string, []>("op_948_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_948_cast_fp16 = slice_by_index(begin = var_948_begin_0, end = var_948_end_0, end_mask = var_948_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_948_cast_fp16")];
+            tensor<int32, [4]> var_952_begin_0 = const()[name = tensor<string, []>("op_952_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_952_end_0 = const()[name = tensor<string, []>("op_952_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_952_end_mask_0 = const()[name = tensor<string, []>("op_952_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_952_cast_fp16 = slice_by_index(begin = var_952_begin_0, end = var_952_end_0, end_mask = var_952_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_952_cast_fp16")];
+            tensor<int32, [4]> var_956_begin_0 = const()[name = tensor<string, []>("op_956_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_956_end_0 = const()[name = tensor<string, []>("op_956_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_956_end_mask_0 = const()[name = tensor<string, []>("op_956_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_956_cast_fp16 = slice_by_index(begin = var_956_begin_0, end = var_956_end_0, end_mask = var_956_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_956_cast_fp16")];
+            tensor<int32, [4]> var_960_begin_0 = const()[name = tensor<string, []>("op_960_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_960_end_0 = const()[name = tensor<string, []>("op_960_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_960_end_mask_0 = const()[name = tensor<string, []>("op_960_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_960_cast_fp16 = slice_by_index(begin = var_960_begin_0, end = var_960_end_0, end_mask = var_960_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_960_cast_fp16")];
+            tensor<int32, [4]> var_964_begin_0 = const()[name = tensor<string, []>("op_964_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_964_end_0 = const()[name = tensor<string, []>("op_964_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_964_end_mask_0 = const()[name = tensor<string, []>("op_964_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_964_cast_fp16 = slice_by_index(begin = var_964_begin_0, end = var_964_end_0, end_mask = var_964_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_964_cast_fp16")];
+            tensor<int32, [4]> var_968_begin_0 = const()[name = tensor<string, []>("op_968_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_968_end_0 = const()[name = tensor<string, []>("op_968_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_968_end_mask_0 = const()[name = tensor<string, []>("op_968_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_968_cast_fp16 = slice_by_index(begin = var_968_begin_0, end = var_968_end_0, end_mask = var_968_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_968_cast_fp16")];
+            tensor<int32, [4]> var_972_begin_0 = const()[name = tensor<string, []>("op_972_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_972_end_0 = const()[name = tensor<string, []>("op_972_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_972_end_mask_0 = const()[name = tensor<string, []>("op_972_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_972_cast_fp16 = slice_by_index(begin = var_972_begin_0, end = var_972_end_0, end_mask = var_972_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_972_cast_fp16")];
+            tensor<int32, [4]> var_976_begin_0 = const()[name = tensor<string, []>("op_976_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_976_end_0 = const()[name = tensor<string, []>("op_976_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_976_end_mask_0 = const()[name = tensor<string, []>("op_976_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_976_cast_fp16 = slice_by_index(begin = var_976_begin_0, end = var_976_end_0, end_mask = var_976_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_976_cast_fp16")];
+            tensor<int32, [4]> var_980_begin_0 = const()[name = tensor<string, []>("op_980_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_980_end_0 = const()[name = tensor<string, []>("op_980_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_980_end_mask_0 = const()[name = tensor<string, []>("op_980_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_980_cast_fp16 = slice_by_index(begin = var_980_begin_0, end = var_980_end_0, end_mask = var_980_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_980_cast_fp16")];
+            tensor<int32, [4]> var_984_begin_0 = const()[name = tensor<string, []>("op_984_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_984_end_0 = const()[name = tensor<string, []>("op_984_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_984_end_mask_0 = const()[name = tensor<string, []>("op_984_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_984_cast_fp16 = slice_by_index(begin = var_984_begin_0, end = var_984_end_0, end_mask = var_984_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_984_cast_fp16")];
+            tensor<int32, [4]> var_988_begin_0 = const()[name = tensor<string, []>("op_988_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_988_end_0 = const()[name = tensor<string, []>("op_988_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_988_end_mask_0 = const()[name = tensor<string, []>("op_988_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_988_cast_fp16 = slice_by_index(begin = var_988_begin_0, end = var_988_end_0, end_mask = var_988_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_988_cast_fp16")];
+            tensor<int32, [4]> var_992_begin_0 = const()[name = tensor<string, []>("op_992_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_992_end_0 = const()[name = tensor<string, []>("op_992_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_992_end_mask_0 = const()[name = tensor<string, []>("op_992_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_992_cast_fp16 = slice_by_index(begin = var_992_begin_0, end = var_992_end_0, end_mask = var_992_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_992_cast_fp16")];
+            tensor<int32, [4]> var_996_begin_0 = const()[name = tensor<string, []>("op_996_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_996_end_0 = const()[name = tensor<string, []>("op_996_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_996_end_mask_0 = const()[name = tensor<string, []>("op_996_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_996_cast_fp16 = slice_by_index(begin = var_996_begin_0, end = var_996_end_0, end_mask = var_996_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_996_cast_fp16")];
+            tensor<int32, [4]> var_1000_begin_0 = const()[name = tensor<string, []>("op_1000_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_1000_end_0 = const()[name = tensor<string, []>("op_1000_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_1000_end_mask_0 = const()[name = tensor<string, []>("op_1000_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1000_cast_fp16 = slice_by_index(begin = var_1000_begin_0, end = var_1000_end_0, end_mask = var_1000_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1000_cast_fp16")];
+            tensor<int32, [4]> var_1004_begin_0 = const()[name = tensor<string, []>("op_1004_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_1004_end_0 = const()[name = tensor<string, []>("op_1004_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_1004_end_mask_0 = const()[name = tensor<string, []>("op_1004_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1004_cast_fp16 = slice_by_index(begin = var_1004_begin_0, end = var_1004_end_0, end_mask = var_1004_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1004_cast_fp16")];
+            tensor<int32, [4]> var_1008_begin_0 = const()[name = tensor<string, []>("op_1008_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_1008_end_0 = const()[name = tensor<string, []>("op_1008_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_1008_end_mask_0 = const()[name = tensor<string, []>("op_1008_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1008_cast_fp16 = slice_by_index(begin = var_1008_begin_0, end = var_1008_end_0, end_mask = var_1008_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1008_cast_fp16")];
+            tensor<int32, [4]> var_1012_begin_0 = const()[name = tensor<string, []>("op_1012_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_1012_end_0 = const()[name = tensor<string, []>("op_1012_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_1012_end_mask_0 = const()[name = tensor<string, []>("op_1012_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1012_cast_fp16 = slice_by_index(begin = var_1012_begin_0, end = var_1012_end_0, end_mask = var_1012_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1012_cast_fp16")];
+            tensor<int32, [4]> var_1018_begin_0 = const()[name = tensor<string, []>("op_1018_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1018_end_0 = const()[name = tensor<string, []>("op_1018_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_1018_end_mask_0 = const()[name = tensor<string, []>("op_1018_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1018_cast_fp16 = slice_by_index(begin = var_1018_begin_0, end = var_1018_end_0, end_mask = var_1018_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1018_cast_fp16")];
+            tensor<int32, [4]> var_1030_begin_0 = const()[name = tensor<string, []>("op_1030_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_1030_end_0 = const()[name = tensor<string, []>("op_1030_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_1030_end_mask_0 = const()[name = tensor<string, []>("op_1030_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1030_cast_fp16 = slice_by_index(begin = var_1030_begin_0, end = var_1030_end_0, end_mask = var_1030_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1030_cast_fp16")];
+            tensor<int32, [4]> var_1042_begin_0 = const()[name = tensor<string, []>("op_1042_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_1042_end_0 = const()[name = tensor<string, []>("op_1042_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_1042_end_mask_0 = const()[name = tensor<string, []>("op_1042_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1042_cast_fp16 = slice_by_index(begin = var_1042_begin_0, end = var_1042_end_0, end_mask = var_1042_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1042_cast_fp16")];
+            tensor<int32, [4]> var_1054_begin_0 = const()[name = tensor<string, []>("op_1054_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_1054_end_0 = const()[name = tensor<string, []>("op_1054_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1054_end_mask_0 = const()[name = tensor<string, []>("op_1054_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1054_cast_fp16 = slice_by_index(begin = var_1054_begin_0, end = var_1054_end_0, end_mask = var_1054_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1054_cast_fp16")];
+            tensor<int32, [4]> var_1066_begin_0 = const()[name = tensor<string, []>("op_1066_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_1066_end_0 = const()[name = tensor<string, []>("op_1066_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_1066_end_mask_0 = const()[name = tensor<string, []>("op_1066_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1066_cast_fp16 = slice_by_index(begin = var_1066_begin_0, end = var_1066_end_0, end_mask = var_1066_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1066_cast_fp16")];
+            tensor<int32, [4]> var_1078_begin_0 = const()[name = tensor<string, []>("op_1078_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_1078_end_0 = const()[name = tensor<string, []>("op_1078_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_1078_end_mask_0 = const()[name = tensor<string, []>("op_1078_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1078_cast_fp16 = slice_by_index(begin = var_1078_begin_0, end = var_1078_end_0, end_mask = var_1078_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1078_cast_fp16")];
+            tensor<int32, [4]> var_1090_begin_0 = const()[name = tensor<string, []>("op_1090_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_1090_end_0 = const()[name = tensor<string, []>("op_1090_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_1090_end_mask_0 = const()[name = tensor<string, []>("op_1090_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1090_cast_fp16 = slice_by_index(begin = var_1090_begin_0, end = var_1090_end_0, end_mask = var_1090_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1090_cast_fp16")];
+            tensor<int32, [4]> var_1102_begin_0 = const()[name = tensor<string, []>("op_1102_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_1102_end_0 = const()[name = tensor<string, []>("op_1102_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_1102_end_mask_0 = const()[name = tensor<string, []>("op_1102_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1102_cast_fp16 = slice_by_index(begin = var_1102_begin_0, end = var_1102_end_0, end_mask = var_1102_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1102_cast_fp16")];
+            tensor<int32, [4]> var_1112_begin_0 = const()[name = tensor<string, []>("op_1112_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1112_end_0 = const()[name = tensor<string, []>("op_1112_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_1112_end_mask_0 = const()[name = tensor<string, []>("op_1112_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1112_cast_fp16 = slice_by_index(begin = var_1112_begin_0, end = var_1112_end_0, end_mask = var_1112_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1112_cast_fp16")];
+            tensor<int32, [4]> var_1124_begin_0 = const()[name = tensor<string, []>("op_1124_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_1124_end_0 = const()[name = tensor<string, []>("op_1124_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_1124_end_mask_0 = const()[name = tensor<string, []>("op_1124_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1124_cast_fp16 = slice_by_index(begin = var_1124_begin_0, end = var_1124_end_0, end_mask = var_1124_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1124_cast_fp16")];
+            tensor<int32, [4]> var_1136_begin_0 = const()[name = tensor<string, []>("op_1136_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_1136_end_0 = const()[name = tensor<string, []>("op_1136_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_1136_end_mask_0 = const()[name = tensor<string, []>("op_1136_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1136_cast_fp16 = slice_by_index(begin = var_1136_begin_0, end = var_1136_end_0, end_mask = var_1136_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1136_cast_fp16")];
+            tensor<int32, [4]> var_1148_begin_0 = const()[name = tensor<string, []>("op_1148_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_1148_end_0 = const()[name = tensor<string, []>("op_1148_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1148_end_mask_0 = const()[name = tensor<string, []>("op_1148_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1148_cast_fp16 = slice_by_index(begin = var_1148_begin_0, end = var_1148_end_0, end_mask = var_1148_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1148_cast_fp16")];
+            tensor<int32, [4]> var_1160_begin_0 = const()[name = tensor<string, []>("op_1160_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_1160_end_0 = const()[name = tensor<string, []>("op_1160_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_1160_end_mask_0 = const()[name = tensor<string, []>("op_1160_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1160_cast_fp16 = slice_by_index(begin = var_1160_begin_0, end = var_1160_end_0, end_mask = var_1160_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1160_cast_fp16")];
+            tensor<int32, [4]> var_1172_begin_0 = const()[name = tensor<string, []>("op_1172_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_1172_end_0 = const()[name = tensor<string, []>("op_1172_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_1172_end_mask_0 = const()[name = tensor<string, []>("op_1172_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1172_cast_fp16 = slice_by_index(begin = var_1172_begin_0, end = var_1172_end_0, end_mask = var_1172_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1172_cast_fp16")];
+            tensor<int32, [4]> var_1184_begin_0 = const()[name = tensor<string, []>("op_1184_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_1184_end_0 = const()[name = tensor<string, []>("op_1184_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_1184_end_mask_0 = const()[name = tensor<string, []>("op_1184_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1184_cast_fp16 = slice_by_index(begin = var_1184_begin_0, end = var_1184_end_0, end_mask = var_1184_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1184_cast_fp16")];
+            tensor<int32, [4]> var_1196_begin_0 = const()[name = tensor<string, []>("op_1196_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_1196_end_0 = const()[name = tensor<string, []>("op_1196_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_1196_end_mask_0 = const()[name = tensor<string, []>("op_1196_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1196_cast_fp16 = slice_by_index(begin = var_1196_begin_0, end = var_1196_end_0, end_mask = var_1196_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1196_cast_fp16")];
+            tensor<string, []> var_1208_equation_0 = const()[name = tensor<string, []>("op_1208_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1208_cast_fp16 = einsum(equation = var_1208_equation_0, values = (var_1018_cast_fp16, var_920_cast_fp16))[name = tensor<string, []>("op_1208_cast_fp16")];
+            tensor<fp16, []> var_1209_to_fp16 = const()[name = tensor<string, []>("op_1209_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1210_cast_fp16 = mul(x = var_1208_cast_fp16, y = var_1209_to_fp16)[name = tensor<string, []>("op_1210_cast_fp16")];
+            tensor<string, []> var_1212_equation_0 = const()[name = tensor<string, []>("op_1212_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1212_cast_fp16 = einsum(equation = var_1212_equation_0, values = (var_1018_cast_fp16, var_924_cast_fp16))[name = tensor<string, []>("op_1212_cast_fp16")];
+            tensor<fp16, []> var_1213_to_fp16 = const()[name = tensor<string, []>("op_1213_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1214_cast_fp16 = mul(x = var_1212_cast_fp16, y = var_1213_to_fp16)[name = tensor<string, []>("op_1214_cast_fp16")];
+            tensor<string, []> var_1216_equation_0 = const()[name = tensor<string, []>("op_1216_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1216_cast_fp16 = einsum(equation = var_1216_equation_0, values = (var_1018_cast_fp16, var_928_cast_fp16))[name = tensor<string, []>("op_1216_cast_fp16")];
+            tensor<fp16, []> var_1217_to_fp16 = const()[name = tensor<string, []>("op_1217_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1218_cast_fp16 = mul(x = var_1216_cast_fp16, y = var_1217_to_fp16)[name = tensor<string, []>("op_1218_cast_fp16")];
+            tensor<string, []> var_1220_equation_0 = const()[name = tensor<string, []>("op_1220_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1220_cast_fp16 = einsum(equation = var_1220_equation_0, values = (var_1030_cast_fp16, var_932_cast_fp16))[name = tensor<string, []>("op_1220_cast_fp16")];
+            tensor<fp16, []> var_1221_to_fp16 = const()[name = tensor<string, []>("op_1221_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1222_cast_fp16 = mul(x = var_1220_cast_fp16, y = var_1221_to_fp16)[name = tensor<string, []>("op_1222_cast_fp16")];
+            tensor<string, []> var_1224_equation_0 = const()[name = tensor<string, []>("op_1224_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1224_cast_fp16 = einsum(equation = var_1224_equation_0, values = (var_1030_cast_fp16, var_936_cast_fp16))[name = tensor<string, []>("op_1224_cast_fp16")];
+            tensor<fp16, []> var_1225_to_fp16 = const()[name = tensor<string, []>("op_1225_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1226_cast_fp16 = mul(x = var_1224_cast_fp16, y = var_1225_to_fp16)[name = tensor<string, []>("op_1226_cast_fp16")];
+            tensor<string, []> var_1228_equation_0 = const()[name = tensor<string, []>("op_1228_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1228_cast_fp16 = einsum(equation = var_1228_equation_0, values = (var_1030_cast_fp16, var_940_cast_fp16))[name = tensor<string, []>("op_1228_cast_fp16")];
+            tensor<fp16, []> var_1229_to_fp16 = const()[name = tensor<string, []>("op_1229_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1230_cast_fp16 = mul(x = var_1228_cast_fp16, y = var_1229_to_fp16)[name = tensor<string, []>("op_1230_cast_fp16")];
+            tensor<string, []> var_1232_equation_0 = const()[name = tensor<string, []>("op_1232_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1232_cast_fp16 = einsum(equation = var_1232_equation_0, values = (var_1042_cast_fp16, var_944_cast_fp16))[name = tensor<string, []>("op_1232_cast_fp16")];
+            tensor<fp16, []> var_1233_to_fp16 = const()[name = tensor<string, []>("op_1233_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1234_cast_fp16 = mul(x = var_1232_cast_fp16, y = var_1233_to_fp16)[name = tensor<string, []>("op_1234_cast_fp16")];
+            tensor<string, []> var_1236_equation_0 = const()[name = tensor<string, []>("op_1236_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1236_cast_fp16 = einsum(equation = var_1236_equation_0, values = (var_1042_cast_fp16, var_948_cast_fp16))[name = tensor<string, []>("op_1236_cast_fp16")];
+            tensor<fp16, []> var_1237_to_fp16 = const()[name = tensor<string, []>("op_1237_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1238_cast_fp16 = mul(x = var_1236_cast_fp16, y = var_1237_to_fp16)[name = tensor<string, []>("op_1238_cast_fp16")];
+            tensor<string, []> var_1240_equation_0 = const()[name = tensor<string, []>("op_1240_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1240_cast_fp16 = einsum(equation = var_1240_equation_0, values = (var_1042_cast_fp16, var_952_cast_fp16))[name = tensor<string, []>("op_1240_cast_fp16")];
+            tensor<fp16, []> var_1241_to_fp16 = const()[name = tensor<string, []>("op_1241_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1242_cast_fp16 = mul(x = var_1240_cast_fp16, y = var_1241_to_fp16)[name = tensor<string, []>("op_1242_cast_fp16")];
+            tensor<string, []> var_1244_equation_0 = const()[name = tensor<string, []>("op_1244_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1244_cast_fp16 = einsum(equation = var_1244_equation_0, values = (var_1054_cast_fp16, var_956_cast_fp16))[name = tensor<string, []>("op_1244_cast_fp16")];
+            tensor<fp16, []> var_1245_to_fp16 = const()[name = tensor<string, []>("op_1245_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1246_cast_fp16 = mul(x = var_1244_cast_fp16, y = var_1245_to_fp16)[name = tensor<string, []>("op_1246_cast_fp16")];
+            tensor<string, []> var_1248_equation_0 = const()[name = tensor<string, []>("op_1248_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1248_cast_fp16 = einsum(equation = var_1248_equation_0, values = (var_1054_cast_fp16, var_960_cast_fp16))[name = tensor<string, []>("op_1248_cast_fp16")];
+            tensor<fp16, []> var_1249_to_fp16 = const()[name = tensor<string, []>("op_1249_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1250_cast_fp16 = mul(x = var_1248_cast_fp16, y = var_1249_to_fp16)[name = tensor<string, []>("op_1250_cast_fp16")];
+            tensor<string, []> var_1252_equation_0 = const()[name = tensor<string, []>("op_1252_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1252_cast_fp16 = einsum(equation = var_1252_equation_0, values = (var_1054_cast_fp16, var_964_cast_fp16))[name = tensor<string, []>("op_1252_cast_fp16")];
+            tensor<fp16, []> var_1253_to_fp16 = const()[name = tensor<string, []>("op_1253_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1254_cast_fp16 = mul(x = var_1252_cast_fp16, y = var_1253_to_fp16)[name = tensor<string, []>("op_1254_cast_fp16")];
+            tensor<string, []> var_1256_equation_0 = const()[name = tensor<string, []>("op_1256_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1256_cast_fp16 = einsum(equation = var_1256_equation_0, values = (var_1066_cast_fp16, var_968_cast_fp16))[name = tensor<string, []>("op_1256_cast_fp16")];
+            tensor<fp16, []> var_1257_to_fp16 = const()[name = tensor<string, []>("op_1257_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1258_cast_fp16 = mul(x = var_1256_cast_fp16, y = var_1257_to_fp16)[name = tensor<string, []>("op_1258_cast_fp16")];
+            tensor<string, []> var_1260_equation_0 = const()[name = tensor<string, []>("op_1260_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1260_cast_fp16 = einsum(equation = var_1260_equation_0, values = (var_1066_cast_fp16, var_972_cast_fp16))[name = tensor<string, []>("op_1260_cast_fp16")];
+            tensor<fp16, []> var_1261_to_fp16 = const()[name = tensor<string, []>("op_1261_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1262_cast_fp16 = mul(x = var_1260_cast_fp16, y = var_1261_to_fp16)[name = tensor<string, []>("op_1262_cast_fp16")];
+            tensor<string, []> var_1264_equation_0 = const()[name = tensor<string, []>("op_1264_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1264_cast_fp16 = einsum(equation = var_1264_equation_0, values = (var_1066_cast_fp16, var_976_cast_fp16))[name = tensor<string, []>("op_1264_cast_fp16")];
+            tensor<fp16, []> var_1265_to_fp16 = const()[name = tensor<string, []>("op_1265_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1266_cast_fp16 = mul(x = var_1264_cast_fp16, y = var_1265_to_fp16)[name = tensor<string, []>("op_1266_cast_fp16")];
+            tensor<string, []> var_1268_equation_0 = const()[name = tensor<string, []>("op_1268_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1268_cast_fp16 = einsum(equation = var_1268_equation_0, values = (var_1078_cast_fp16, var_980_cast_fp16))[name = tensor<string, []>("op_1268_cast_fp16")];
+            tensor<fp16, []> var_1269_to_fp16 = const()[name = tensor<string, []>("op_1269_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1270_cast_fp16 = mul(x = var_1268_cast_fp16, y = var_1269_to_fp16)[name = tensor<string, []>("op_1270_cast_fp16")];
+            tensor<string, []> var_1272_equation_0 = const()[name = tensor<string, []>("op_1272_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1272_cast_fp16 = einsum(equation = var_1272_equation_0, values = (var_1078_cast_fp16, var_984_cast_fp16))[name = tensor<string, []>("op_1272_cast_fp16")];
+            tensor<fp16, []> var_1273_to_fp16 = const()[name = tensor<string, []>("op_1273_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1274_cast_fp16 = mul(x = var_1272_cast_fp16, y = var_1273_to_fp16)[name = tensor<string, []>("op_1274_cast_fp16")];
+            tensor<string, []> var_1276_equation_0 = const()[name = tensor<string, []>("op_1276_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1276_cast_fp16 = einsum(equation = var_1276_equation_0, values = (var_1078_cast_fp16, var_988_cast_fp16))[name = tensor<string, []>("op_1276_cast_fp16")];
+            tensor<fp16, []> var_1277_to_fp16 = const()[name = tensor<string, []>("op_1277_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1278_cast_fp16 = mul(x = var_1276_cast_fp16, y = var_1277_to_fp16)[name = tensor<string, []>("op_1278_cast_fp16")];
+            tensor<string, []> var_1280_equation_0 = const()[name = tensor<string, []>("op_1280_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1280_cast_fp16 = einsum(equation = var_1280_equation_0, values = (var_1090_cast_fp16, var_992_cast_fp16))[name = tensor<string, []>("op_1280_cast_fp16")];
+            tensor<fp16, []> var_1281_to_fp16 = const()[name = tensor<string, []>("op_1281_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1282_cast_fp16 = mul(x = var_1280_cast_fp16, y = var_1281_to_fp16)[name = tensor<string, []>("op_1282_cast_fp16")];
+            tensor<string, []> var_1284_equation_0 = const()[name = tensor<string, []>("op_1284_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1284_cast_fp16 = einsum(equation = var_1284_equation_0, values = (var_1090_cast_fp16, var_996_cast_fp16))[name = tensor<string, []>("op_1284_cast_fp16")];
+            tensor<fp16, []> var_1285_to_fp16 = const()[name = tensor<string, []>("op_1285_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1286_cast_fp16 = mul(x = var_1284_cast_fp16, y = var_1285_to_fp16)[name = tensor<string, []>("op_1286_cast_fp16")];
+            tensor<string, []> var_1288_equation_0 = const()[name = tensor<string, []>("op_1288_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1288_cast_fp16 = einsum(equation = var_1288_equation_0, values = (var_1090_cast_fp16, var_1000_cast_fp16))[name = tensor<string, []>("op_1288_cast_fp16")];
+            tensor<fp16, []> var_1289_to_fp16 = const()[name = tensor<string, []>("op_1289_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1290_cast_fp16 = mul(x = var_1288_cast_fp16, y = var_1289_to_fp16)[name = tensor<string, []>("op_1290_cast_fp16")];
+            tensor<string, []> var_1292_equation_0 = const()[name = tensor<string, []>("op_1292_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1292_cast_fp16 = einsum(equation = var_1292_equation_0, values = (var_1102_cast_fp16, var_1004_cast_fp16))[name = tensor<string, []>("op_1292_cast_fp16")];
+            tensor<fp16, []> var_1293_to_fp16 = const()[name = tensor<string, []>("op_1293_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1294_cast_fp16 = mul(x = var_1292_cast_fp16, y = var_1293_to_fp16)[name = tensor<string, []>("op_1294_cast_fp16")];
+            tensor<string, []> var_1296_equation_0 = const()[name = tensor<string, []>("op_1296_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1296_cast_fp16 = einsum(equation = var_1296_equation_0, values = (var_1102_cast_fp16, var_1008_cast_fp16))[name = tensor<string, []>("op_1296_cast_fp16")];
+            tensor<fp16, []> var_1297_to_fp16 = const()[name = tensor<string, []>("op_1297_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1298_cast_fp16 = mul(x = var_1296_cast_fp16, y = var_1297_to_fp16)[name = tensor<string, []>("op_1298_cast_fp16")];
+            tensor<string, []> var_1300_equation_0 = const()[name = tensor<string, []>("op_1300_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1300_cast_fp16 = einsum(equation = var_1300_equation_0, values = (var_1102_cast_fp16, var_1012_cast_fp16))[name = tensor<string, []>("op_1300_cast_fp16")];
+            tensor<fp16, []> var_1301_to_fp16 = const()[name = tensor<string, []>("op_1301_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1302_cast_fp16 = mul(x = var_1300_cast_fp16, y = var_1301_to_fp16)[name = tensor<string, []>("op_1302_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_49_cast_fp16 = add(x = var_1210_cast_fp16, y = mask)[name = tensor<string, []>("aw_49_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_51_cast_fp16 = add(x = var_1214_cast_fp16, y = mask)[name = tensor<string, []>("aw_51_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_53_cast_fp16 = add(x = var_1218_cast_fp16, y = mask)[name = tensor<string, []>("aw_53_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_55_cast_fp16 = add(x = var_1222_cast_fp16, y = mask)[name = tensor<string, []>("aw_55_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_57_cast_fp16 = add(x = var_1226_cast_fp16, y = mask)[name = tensor<string, []>("aw_57_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_59_cast_fp16 = add(x = var_1230_cast_fp16, y = mask)[name = tensor<string, []>("aw_59_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_61_cast_fp16 = add(x = var_1234_cast_fp16, y = mask)[name = tensor<string, []>("aw_61_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_63_cast_fp16 = add(x = var_1238_cast_fp16, y = mask)[name = tensor<string, []>("aw_63_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_65_cast_fp16 = add(x = var_1242_cast_fp16, y = mask)[name = tensor<string, []>("aw_65_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_67_cast_fp16 = add(x = var_1246_cast_fp16, y = mask)[name = tensor<string, []>("aw_67_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_69_cast_fp16 = add(x = var_1250_cast_fp16, y = mask)[name = tensor<string, []>("aw_69_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_71_cast_fp16 = add(x = var_1254_cast_fp16, y = mask)[name = tensor<string, []>("aw_71_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_73_cast_fp16 = add(x = var_1258_cast_fp16, y = mask)[name = tensor<string, []>("aw_73_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_75_cast_fp16 = add(x = var_1262_cast_fp16, y = mask)[name = tensor<string, []>("aw_75_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_77_cast_fp16 = add(x = var_1266_cast_fp16, y = mask)[name = tensor<string, []>("aw_77_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_79_cast_fp16 = add(x = var_1270_cast_fp16, y = mask)[name = tensor<string, []>("aw_79_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_81_cast_fp16 = add(x = var_1274_cast_fp16, y = mask)[name = tensor<string, []>("aw_81_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_83_cast_fp16 = add(x = var_1278_cast_fp16, y = mask)[name = tensor<string, []>("aw_83_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_85_cast_fp16 = add(x = var_1282_cast_fp16, y = mask)[name = tensor<string, []>("aw_85_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_87_cast_fp16 = add(x = var_1286_cast_fp16, y = mask)[name = tensor<string, []>("aw_87_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_89_cast_fp16 = add(x = var_1290_cast_fp16, y = mask)[name = tensor<string, []>("aw_89_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_91_cast_fp16 = add(x = var_1294_cast_fp16, y = mask)[name = tensor<string, []>("aw_91_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_93_cast_fp16 = add(x = var_1298_cast_fp16, y = mask)[name = tensor<string, []>("aw_93_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_cast_fp16 = add(x = var_1302_cast_fp16, y = mask)[name = tensor<string, []>("aw_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1327_cast_fp16 = softmax(axis = var_779, x = aw_49_cast_fp16)[name = tensor<string, []>("op_1327_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1328_cast_fp16 = softmax(axis = var_779, x = aw_51_cast_fp16)[name = tensor<string, []>("op_1328_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1329_cast_fp16 = softmax(axis = var_779, x = aw_53_cast_fp16)[name = tensor<string, []>("op_1329_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1330_cast_fp16 = softmax(axis = var_779, x = aw_55_cast_fp16)[name = tensor<string, []>("op_1330_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1331_cast_fp16 = softmax(axis = var_779, x = aw_57_cast_fp16)[name = tensor<string, []>("op_1331_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1332_cast_fp16 = softmax(axis = var_779, x = aw_59_cast_fp16)[name = tensor<string, []>("op_1332_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1333_cast_fp16 = softmax(axis = var_779, x = aw_61_cast_fp16)[name = tensor<string, []>("op_1333_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1334_cast_fp16 = softmax(axis = var_779, x = aw_63_cast_fp16)[name = tensor<string, []>("op_1334_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1335_cast_fp16 = softmax(axis = var_779, x = aw_65_cast_fp16)[name = tensor<string, []>("op_1335_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1336_cast_fp16 = softmax(axis = var_779, x = aw_67_cast_fp16)[name = tensor<string, []>("op_1336_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1337_cast_fp16 = softmax(axis = var_779, x = aw_69_cast_fp16)[name = tensor<string, []>("op_1337_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1338_cast_fp16 = softmax(axis = var_779, x = aw_71_cast_fp16)[name = tensor<string, []>("op_1338_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1339_cast_fp16 = softmax(axis = var_779, x = aw_73_cast_fp16)[name = tensor<string, []>("op_1339_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1340_cast_fp16 = softmax(axis = var_779, x = aw_75_cast_fp16)[name = tensor<string, []>("op_1340_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1341_cast_fp16 = softmax(axis = var_779, x = aw_77_cast_fp16)[name = tensor<string, []>("op_1341_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1342_cast_fp16 = softmax(axis = var_779, x = aw_79_cast_fp16)[name = tensor<string, []>("op_1342_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1343_cast_fp16 = softmax(axis = var_779, x = aw_81_cast_fp16)[name = tensor<string, []>("op_1343_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1344_cast_fp16 = softmax(axis = var_779, x = aw_83_cast_fp16)[name = tensor<string, []>("op_1344_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1345_cast_fp16 = softmax(axis = var_779, x = aw_85_cast_fp16)[name = tensor<string, []>("op_1345_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1346_cast_fp16 = softmax(axis = var_779, x = aw_87_cast_fp16)[name = tensor<string, []>("op_1346_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1347_cast_fp16 = softmax(axis = var_779, x = aw_89_cast_fp16)[name = tensor<string, []>("op_1347_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1348_cast_fp16 = softmax(axis = var_779, x = aw_91_cast_fp16)[name = tensor<string, []>("op_1348_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1349_cast_fp16 = softmax(axis = var_779, x = aw_93_cast_fp16)[name = tensor<string, []>("op_1349_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1350_cast_fp16 = softmax(axis = var_779, x = aw_cast_fp16)[name = tensor<string, []>("op_1350_cast_fp16")];
+            tensor<string, []> var_1352_equation_0 = const()[name = tensor<string, []>("op_1352_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1352_cast_fp16 = einsum(equation = var_1352_equation_0, values = (var_1112_cast_fp16, var_1327_cast_fp16))[name = tensor<string, []>("op_1352_cast_fp16")];
+            tensor<string, []> var_1354_equation_0 = const()[name = tensor<string, []>("op_1354_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1354_cast_fp16 = einsum(equation = var_1354_equation_0, values = (var_1112_cast_fp16, var_1328_cast_fp16))[name = tensor<string, []>("op_1354_cast_fp16")];
+            tensor<string, []> var_1356_equation_0 = const()[name = tensor<string, []>("op_1356_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1356_cast_fp16 = einsum(equation = var_1356_equation_0, values = (var_1112_cast_fp16, var_1329_cast_fp16))[name = tensor<string, []>("op_1356_cast_fp16")];
+            tensor<string, []> var_1358_equation_0 = const()[name = tensor<string, []>("op_1358_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1358_cast_fp16 = einsum(equation = var_1358_equation_0, values = (var_1124_cast_fp16, var_1330_cast_fp16))[name = tensor<string, []>("op_1358_cast_fp16")];
+            tensor<string, []> var_1360_equation_0 = const()[name = tensor<string, []>("op_1360_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1360_cast_fp16 = einsum(equation = var_1360_equation_0, values = (var_1124_cast_fp16, var_1331_cast_fp16))[name = tensor<string, []>("op_1360_cast_fp16")];
+            tensor<string, []> var_1362_equation_0 = const()[name = tensor<string, []>("op_1362_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1362_cast_fp16 = einsum(equation = var_1362_equation_0, values = (var_1124_cast_fp16, var_1332_cast_fp16))[name = tensor<string, []>("op_1362_cast_fp16")];
+            tensor<string, []> var_1364_equation_0 = const()[name = tensor<string, []>("op_1364_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1364_cast_fp16 = einsum(equation = var_1364_equation_0, values = (var_1136_cast_fp16, var_1333_cast_fp16))[name = tensor<string, []>("op_1364_cast_fp16")];
+            tensor<string, []> var_1366_equation_0 = const()[name = tensor<string, []>("op_1366_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1366_cast_fp16 = einsum(equation = var_1366_equation_0, values = (var_1136_cast_fp16, var_1334_cast_fp16))[name = tensor<string, []>("op_1366_cast_fp16")];
+            tensor<string, []> var_1368_equation_0 = const()[name = tensor<string, []>("op_1368_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1368_cast_fp16 = einsum(equation = var_1368_equation_0, values = (var_1136_cast_fp16, var_1335_cast_fp16))[name = tensor<string, []>("op_1368_cast_fp16")];
+            tensor<string, []> var_1370_equation_0 = const()[name = tensor<string, []>("op_1370_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1370_cast_fp16 = einsum(equation = var_1370_equation_0, values = (var_1148_cast_fp16, var_1336_cast_fp16))[name = tensor<string, []>("op_1370_cast_fp16")];
+            tensor<string, []> var_1372_equation_0 = const()[name = tensor<string, []>("op_1372_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1372_cast_fp16 = einsum(equation = var_1372_equation_0, values = (var_1148_cast_fp16, var_1337_cast_fp16))[name = tensor<string, []>("op_1372_cast_fp16")];
+            tensor<string, []> var_1374_equation_0 = const()[name = tensor<string, []>("op_1374_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1374_cast_fp16 = einsum(equation = var_1374_equation_0, values = (var_1148_cast_fp16, var_1338_cast_fp16))[name = tensor<string, []>("op_1374_cast_fp16")];
+            tensor<string, []> var_1376_equation_0 = const()[name = tensor<string, []>("op_1376_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1376_cast_fp16 = einsum(equation = var_1376_equation_0, values = (var_1160_cast_fp16, var_1339_cast_fp16))[name = tensor<string, []>("op_1376_cast_fp16")];
+            tensor<string, []> var_1378_equation_0 = const()[name = tensor<string, []>("op_1378_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1378_cast_fp16 = einsum(equation = var_1378_equation_0, values = (var_1160_cast_fp16, var_1340_cast_fp16))[name = tensor<string, []>("op_1378_cast_fp16")];
+            tensor<string, []> var_1380_equation_0 = const()[name = tensor<string, []>("op_1380_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1380_cast_fp16 = einsum(equation = var_1380_equation_0, values = (var_1160_cast_fp16, var_1341_cast_fp16))[name = tensor<string, []>("op_1380_cast_fp16")];
+            tensor<string, []> var_1382_equation_0 = const()[name = tensor<string, []>("op_1382_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1382_cast_fp16 = einsum(equation = var_1382_equation_0, values = (var_1172_cast_fp16, var_1342_cast_fp16))[name = tensor<string, []>("op_1382_cast_fp16")];
+            tensor<string, []> var_1384_equation_0 = const()[name = tensor<string, []>("op_1384_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1384_cast_fp16 = einsum(equation = var_1384_equation_0, values = (var_1172_cast_fp16, var_1343_cast_fp16))[name = tensor<string, []>("op_1384_cast_fp16")];
+            tensor<string, []> var_1386_equation_0 = const()[name = tensor<string, []>("op_1386_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1386_cast_fp16 = einsum(equation = var_1386_equation_0, values = (var_1172_cast_fp16, var_1344_cast_fp16))[name = tensor<string, []>("op_1386_cast_fp16")];
+            tensor<string, []> var_1388_equation_0 = const()[name = tensor<string, []>("op_1388_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1388_cast_fp16 = einsum(equation = var_1388_equation_0, values = (var_1184_cast_fp16, var_1345_cast_fp16))[name = tensor<string, []>("op_1388_cast_fp16")];
+            tensor<string, []> var_1390_equation_0 = const()[name = tensor<string, []>("op_1390_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1390_cast_fp16 = einsum(equation = var_1390_equation_0, values = (var_1184_cast_fp16, var_1346_cast_fp16))[name = tensor<string, []>("op_1390_cast_fp16")];
+            tensor<string, []> var_1392_equation_0 = const()[name = tensor<string, []>("op_1392_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1392_cast_fp16 = einsum(equation = var_1392_equation_0, values = (var_1184_cast_fp16, var_1347_cast_fp16))[name = tensor<string, []>("op_1392_cast_fp16")];
+            tensor<string, []> var_1394_equation_0 = const()[name = tensor<string, []>("op_1394_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1394_cast_fp16 = einsum(equation = var_1394_equation_0, values = (var_1196_cast_fp16, var_1348_cast_fp16))[name = tensor<string, []>("op_1394_cast_fp16")];
+            tensor<string, []> var_1396_equation_0 = const()[name = tensor<string, []>("op_1396_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1396_cast_fp16 = einsum(equation = var_1396_equation_0, values = (var_1196_cast_fp16, var_1349_cast_fp16))[name = tensor<string, []>("op_1396_cast_fp16")];
+            tensor<string, []> var_1398_equation_0 = const()[name = tensor<string, []>("op_1398_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1398_cast_fp16 = einsum(equation = var_1398_equation_0, values = (var_1196_cast_fp16, var_1350_cast_fp16))[name = tensor<string, []>("op_1398_cast_fp16")];
+            tensor<bool, []> x_27_interleave_0 = const()[name = tensor<string, []>("x_27_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_27_cast_fp16 = concat(axis = var_779, interleave = x_27_interleave_0, values = (var_1352_cast_fp16, var_1354_cast_fp16, var_1356_cast_fp16, var_1358_cast_fp16, var_1360_cast_fp16, var_1362_cast_fp16, var_1364_cast_fp16, var_1366_cast_fp16, var_1368_cast_fp16, var_1370_cast_fp16, var_1372_cast_fp16, var_1374_cast_fp16, var_1376_cast_fp16, var_1378_cast_fp16, var_1380_cast_fp16, var_1382_cast_fp16, var_1384_cast_fp16, var_1386_cast_fp16, var_1388_cast_fp16, var_1390_cast_fp16, var_1392_cast_fp16, var_1394_cast_fp16, var_1396_cast_fp16, var_1398_cast_fp16))[name = tensor<string, []>("x_27_cast_fp16")];
+            tensor<int32, [4]> var_1403 = const()[name = tensor<string, []>("op_1403"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_13_cast_fp16 = reshape(shape = var_1403, x = x_27_cast_fp16)[name = tensor<string, []>("input_13_cast_fp16")];
+            tensor<int32, [2]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1408 = const()[name = tensor<string, []>("op_1408"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_pad_type_0 = const()[name = tensor<string, []>("attention_output_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_pad_0 = const()[name = tensor<string, []>("attention_output_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(232803776)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_cast_fp16 = conv(dilations = var_1408, groups = var_779, pad = attention_output_pad_0, pad_type = attention_output_pad_type_0, strides = var_1406, weight = blocks_1_attn_proj_weight_to_fp16, x = input_13_cast_fp16)[name = tensor<string, []>("attention_output_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_29_cast_fp16 = add(x = attention_output_cast_fp16, y = x_17_cast_fp16)[name = tensor<string, []>("x_29_cast_fp16")];
+            tensor<bool, []> x_eps_interleave_0 = const()[name = tensor<string, []>("x_eps_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_to_fp16 = const()[name = tensor<string, []>("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678208)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_cast_fp16 = concat(axis = var_779, interleave = x_eps_interleave_0, values = (x_29_cast_fp16, eps_chan_to_fp16))[name = tensor<string, []>("x_eps_cast_fp16")];
+            tensor<int32, [1]> norm_x_axes_0 = const()[name = tensor<string, []>("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_782, x = x_eps_cast_fp16)[name = tensor<string, []>("norm_x_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_19_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_cast_fp16)[name = tensor<string, []>("x_normed_19_cast_fp16")];
+            tensor<fp16, []> var_1434_to_fp16 = const()[name = tensor<string, []>("op_1434_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_1434_to_fp16)[name = tensor<string, []>("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678400)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_15_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = tensor<string, []>("input_15_cast_fp16")];
+            tensor<int32, [2]> var_1445 = const()[name = tensor<string, []>("op_1445"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1447 = const()[name = tensor<string, []>("op_1447"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_17_pad_type_0 = const()[name = tensor<string, []>("input_17_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_17_pad_0 = const()[name = tensor<string, []>("input_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251684608)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_17_cast_fp16 = conv(dilations = var_1447, groups = var_779, pad = input_17_pad_0, pad_type = input_17_pad_type_0, strides = var_1445, weight = blocks_1_mlp_fc_1_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("input_17_cast_fp16")];
+            tensor<int32, [2]> var_1451 = const()[name = tensor<string, []>("op_1451"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1453 = const()[name = tensor<string, []>("op_1453"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_pad_0 = const()[name = tensor<string, []>("x_fc_2_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(302016320)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_cast_fp16 = conv(dilations = var_1453, groups = var_779, pad = x_fc_2_pad_0, pad_type = x_fc_2_pad_type_0, strides = var_1451, weight = blocks_1_mlp_fc_2_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_1456_cast_fp16 = silu(x = input_17_cast_fp16)[name = tensor<string, []>("op_1456_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_cast_fp16 = mul(x = var_1456_cast_fp16, y = x_fc_2_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
+            tensor<int32, [2]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1463_pad_type_0 = const()[name = tensor<string, []>("op_1463_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1463_pad_0 = const()[name = tensor<string, []>("op_1463_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_1_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(352348032)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_1463_cast_fp16 = conv(dilations = var_1461, groups = var_779, pad = var_1463_pad_0, pad_type = var_1463_pad_type_0, strides = var_1459, weight = blocks_1_mlp_proj_weight_to_fp16, x = input_cast_fp16)[name = tensor<string, []>("op_1463_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> new_x = add(x = var_1463_cast_fp16, y = x_29_cast_fp16)[name = tensor<string, []>("op_1464_cast_fp16")];
+        } -> (new_x, new_k_cache_0, new_v_cache_0, new_k_cache_1, new_v_cache_1);
+}
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk4.mlmodelc/weights/weight.bin b/Llama-3.2-3B-Instruct_chunk4.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a95364215864b82077bb488e2e8659af191e985a
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk4.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c5e404a08c1b8eb56e7784313a6ca21b01bc54978e598f1b184029e8f613e43
+size 402679744
diff --git a/Llama-3.2-3B-Instruct_chunk5.mlmodelc/analytics/coremldata.bin b/Llama-3.2-3B-Instruct_chunk5.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a63af39cde8e590e41fffd270ab8aede737490d
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk5.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf21e446e7587de3fd840eae95f3e79729298df568725552f7ef5fd8f954e58c
+size 243
diff --git a/Llama-3.2-3B-Instruct_chunk5.mlmodelc/coremldata.bin b/Llama-3.2-3B-Instruct_chunk5.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ef844658693d8a7fc2951abf2761f8f5f9bc62c3
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk5.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8129d684aa1ea8b76708a186fe44f7ffc4aa08b4854907105fe41c0825e71875
+size 653
diff --git a/Llama-3.2-3B-Instruct_chunk5.mlmodelc/metadata.json b/Llama-3.2-3B-Instruct_chunk5.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc528f3f5b0976cb12d6bc6af184b91a12c1efa4
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk5.mlmodelc/metadata.json
@@ -0,0 +1,178 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "new_x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 7,
+    "mlProgramOperationTypeHistogram" : {
+      "Concat" : 14,
+      "Ios16.mul" : 70,
+      "SliceByIndex" : 88,
+      "Transpose" : 2,
+      "Ios16.einsum" : 96,
+      "Ios16.conv" : 14,
+      "Ios16.add" : 56,
+      "Ios16.realDiv" : 4,
+      "Ios16.softmax" : 48,
+      "Ios16.reduceL2Norm" : 4,
+      "Ios16.reshape" : 14,
+      "Ios16.silu" : 2
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "13.0",
+      "tvOS" : "16.0",
+      "visionOS" : "1.0",
+      "watchOS" : "9.0",
+      "iOS" : "16.0",
+      "macCatalyst" : "16.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.source" : "torch==2.1.0",
+      "com.github.apple.coremltools.version" : "8.0b1"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "cos",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "sin",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 512 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 512, 1, 64]",
+        "name" : "mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "Llama_3_2_3B_Instruct_2024_11_09_16_14_37_chunk5",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk5.mlmodelc/model.mil b/Llama-3.2-3B-Instruct_chunk5.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..78594b4291dc45ae43652f9a31200581b19ad3c6
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk5.mlmodelc/model.mil
@@ -0,0 +1,956 @@
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3304.5.2"}, {"coremlc-version", "3304.6.2"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
+{
+    func main<ios16>(tensor<fp16, [128, 64]> cos, tensor<fp16, [1, 448, 1, 1024]> k_cache_0, tensor<fp16, [1, 448, 1, 1024]> k_cache_1, tensor<fp16, [1, 512, 1, 64]> mask, tensor<fp16, [128, 64]> sin, tensor<fp16, [1, 1024, 1, 448]> v_cache_0, tensor<fp16, [1, 1024, 1, 448]> v_cache_1, tensor<fp16, [1, 3072, 8, 8]> x) [CoreML_InputDefaultValues = dict<tensor<string, []>, tensor<fp32, []>>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}})] {
+            tensor<int32, []> var_13 = const()[name = tensor<string, []>("op_13"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_19 = const()[name = tensor<string, []>("op_19"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_55 = const()[name = tensor<string, []>("op_55"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_1_interleave_0 = const()[name = tensor<string, []>("x_eps_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_1_to_fp16 = const()[name = tensor<string, []>("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_1_cast_fp16 = concat(axis = var_52, interleave = x_eps_1_interleave_0, values = (x, eps_chan_1_to_fp16))[name = tensor<string, []>("x_eps_1_cast_fp16")];
+            tensor<int32, [1]> norm_x_1_axes_0 = const()[name = tensor<string, []>("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_55, x = x_eps_1_cast_fp16)[name = tensor<string, []>("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = tensor<string, []>("x_normed_1_cast_fp16")];
+            tensor<fp16, []> var_79_to_fp16 = const()[name = tensor<string, []>("op_79_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_79_to_fp16)[name = tensor<string, []>("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(256)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = tensor<string, []>("x_5_cast_fp16")];
+            tensor<int32, [4]> var_100 = const()[name = tensor<string, []>("op_100"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_1_cast_fp16 = reshape(shape = var_100, x = x_5_cast_fp16)[name = tensor<string, []>("input_1_cast_fp16")];
+            tensor<int32, [2]> var_103 = const()[name = tensor<string, []>("op_103"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_105 = const()[name = tensor<string, []>("op_105"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_1_pad_type_0 = const()[name = tensor<string, []>("q_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_1_pad_0 = const()[name = tensor<string, []>("q_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6464)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_1_cast_fp16 = conv(dilations = var_105, groups = var_52, pad = q_1_pad_0, pad_type = q_1_pad_type_0, strides = var_103, weight = blocks_0_attn_q_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("q_1_cast_fp16")];
+            tensor<int32, [2]> var_109 = const()[name = tensor<string, []>("op_109"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_111 = const()[name = tensor<string, []>("op_111"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_1_pad_type_0 = const()[name = tensor<string, []>("k_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_1_pad_0 = const()[name = tensor<string, []>("k_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18880896)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_1_cast_fp16 = conv(dilations = var_111, groups = var_52, pad = k_1_pad_0, pad_type = k_1_pad_type_0, strides = var_109, weight = blocks_0_attn_k_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("k_1_cast_fp16")];
+            tensor<int32, [2]> var_115 = const()[name = tensor<string, []>("op_115"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_117 = const()[name = tensor<string, []>("op_117"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_1_pad_type_0 = const()[name = tensor<string, []>("v_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_1_pad_0 = const()[name = tensor<string, []>("v_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25172416)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_1_cast_fp16 = conv(dilations = var_117, groups = var_52, pad = v_1_pad_0, pad_type = v_1_pad_type_0, strides = var_115, weight = blocks_0_attn_v_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("v_1_cast_fp16")];
+            tensor<int32, [4]> var_120 = const()[name = tensor<string, []>("op_120"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_3_cast_fp16 = reshape(shape = var_120, x = q_1_cast_fp16)[name = tensor<string, []>("q_3_cast_fp16")];
+            tensor<int32, [4]> var_122 = const()[name = tensor<string, []>("op_122"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_3_cast_fp16 = reshape(shape = var_122, x = k_1_cast_fp16)[name = tensor<string, []>("k_3_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = tensor<string, []>("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = tensor<string, []>("op_136_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = tensor<string, []>("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = tensor<string, []>("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = tensor<string, []>("op_142_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = tensor<string, []>("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_142_cast_fp16")];
+            tensor<fp16, []> const_10_promoted_to_fp16 = const()[name = tensor<string, []>("const_10_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_10_promoted_to_fp16)[name = tensor<string, []>("op_144_cast_fp16")];
+            tensor<bool, []> rotated_1_interleave_0 = const()[name = tensor<string, []>("rotated_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_1_cast_fp16 = concat(axis = var_17, interleave = rotated_1_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = tensor<string, []>("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_147_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = tensor<string, []>("op_147_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_148_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = tensor<string, []>("op_148_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_1_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = tensor<string, []>("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_161_begin_0 = const()[name = tensor<string, []>("op_161_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_161_end_0 = const()[name = tensor<string, []>("op_161_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_161_end_mask_0 = const()[name = tensor<string, []>("op_161_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_161_cast_fp16 = slice_by_index(begin = var_161_begin_0, end = var_161_end_0, end_mask = var_161_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_161_cast_fp16")];
+            tensor<int32, [4]> var_167_begin_0 = const()[name = tensor<string, []>("op_167_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_167_end_0 = const()[name = tensor<string, []>("op_167_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_167_end_mask_0 = const()[name = tensor<string, []>("op_167_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_167_cast_fp16 = slice_by_index(begin = var_167_begin_0, end = var_167_end_0, end_mask = var_167_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_167_cast_fp16")];
+            tensor<fp16, []> const_12_promoted_to_fp16 = const()[name = tensor<string, []>("const_12_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_169_cast_fp16 = mul(x = var_167_cast_fp16, y = const_12_promoted_to_fp16)[name = tensor<string, []>("op_169_cast_fp16")];
+            tensor<bool, []> rotated_3_interleave_0 = const()[name = tensor<string, []>("rotated_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_3_cast_fp16 = concat(axis = var_17, interleave = rotated_3_interleave_0, values = (var_169_cast_fp16, var_161_cast_fp16))[name = tensor<string, []>("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_172_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = tensor<string, []>("op_172_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_173_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = tensor<string, []>("op_173_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_3_cast_fp16 = add(x = var_172_cast_fp16, y = var_173_cast_fp16)[name = tensor<string, []>("roped_3_cast_fp16")];
+            tensor<int32, [4]> var_176 = const()[name = tensor<string, []>("op_176"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_7_cast_fp16 = reshape(shape = var_176, x = roped_3_cast_fp16)[name = tensor<string, []>("k_7_cast_fp16")];
+            tensor<int32, [4]> var_178 = const()[name = tensor<string, []>("op_178"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_0 = reshape(shape = var_178, x = v_1_cast_fp16)[name = tensor<string, []>("new_v_cache_0_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_11_interleave_0 = const()[name = tensor<string, []>("k_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_0 = transpose(perm = k_9_perm_0, x = k_7_cast_fp16)[name = tensor<string, []>("transpose_1")];
+            tensor<fp16, [1, 512, 1, 1024]> k_11_cast_fp16 = concat(axis = var_19, interleave = k_11_interleave_0, values = (k_cache_0, new_k_cache_0))[name = tensor<string, []>("k_11_cast_fp16")];
+            tensor<bool, []> v_7_interleave_0 = const()[name = tensor<string, []>("v_7_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_7_cast_fp16 = concat(axis = var_13, interleave = v_7_interleave_0, values = (v_cache_0, new_v_cache_0))[name = tensor<string, []>("v_7_cast_fp16")];
+            tensor<int32, [4]> var_186 = const()[name = tensor<string, []>("op_186"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_7_cast_fp16 = reshape(shape = var_186, x = roped_1_cast_fp16)[name = tensor<string, []>("q_7_cast_fp16")];
+            tensor<int32, [4]> var_191_begin_0 = const()[name = tensor<string, []>("op_191_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_191_end_0 = const()[name = tensor<string, []>("op_191_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_191_end_mask_0 = const()[name = tensor<string, []>("op_191_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_191_cast_fp16 = slice_by_index(begin = var_191_begin_0, end = var_191_end_0, end_mask = var_191_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_191_cast_fp16")];
+            tensor<int32, [4]> var_195_begin_0 = const()[name = tensor<string, []>("op_195_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_195_end_0 = const()[name = tensor<string, []>("op_195_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_195_end_mask_0 = const()[name = tensor<string, []>("op_195_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_195_cast_fp16 = slice_by_index(begin = var_195_begin_0, end = var_195_end_0, end_mask = var_195_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_195_cast_fp16")];
+            tensor<int32, [4]> var_199_begin_0 = const()[name = tensor<string, []>("op_199_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_199_end_0 = const()[name = tensor<string, []>("op_199_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_199_end_mask_0 = const()[name = tensor<string, []>("op_199_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_199_cast_fp16 = slice_by_index(begin = var_199_begin_0, end = var_199_end_0, end_mask = var_199_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_199_cast_fp16")];
+            tensor<int32, [4]> var_203_begin_0 = const()[name = tensor<string, []>("op_203_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_203_end_0 = const()[name = tensor<string, []>("op_203_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_203_end_mask_0 = const()[name = tensor<string, []>("op_203_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_203_cast_fp16 = slice_by_index(begin = var_203_begin_0, end = var_203_end_0, end_mask = var_203_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_203_cast_fp16")];
+            tensor<int32, [4]> var_207_begin_0 = const()[name = tensor<string, []>("op_207_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_207_end_0 = const()[name = tensor<string, []>("op_207_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_207_end_mask_0 = const()[name = tensor<string, []>("op_207_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_207_cast_fp16 = slice_by_index(begin = var_207_begin_0, end = var_207_end_0, end_mask = var_207_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_207_cast_fp16")];
+            tensor<int32, [4]> var_211_begin_0 = const()[name = tensor<string, []>("op_211_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_211_end_0 = const()[name = tensor<string, []>("op_211_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_211_end_mask_0 = const()[name = tensor<string, []>("op_211_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_211_cast_fp16 = slice_by_index(begin = var_211_begin_0, end = var_211_end_0, end_mask = var_211_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_211_cast_fp16")];
+            tensor<int32, [4]> var_215_begin_0 = const()[name = tensor<string, []>("op_215_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_215_end_0 = const()[name = tensor<string, []>("op_215_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_215_end_mask_0 = const()[name = tensor<string, []>("op_215_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_215_cast_fp16 = slice_by_index(begin = var_215_begin_0, end = var_215_end_0, end_mask = var_215_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_215_cast_fp16")];
+            tensor<int32, [4]> var_219_begin_0 = const()[name = tensor<string, []>("op_219_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_219_end_0 = const()[name = tensor<string, []>("op_219_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_219_end_mask_0 = const()[name = tensor<string, []>("op_219_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_219_cast_fp16 = slice_by_index(begin = var_219_begin_0, end = var_219_end_0, end_mask = var_219_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_219_cast_fp16")];
+            tensor<int32, [4]> var_223_begin_0 = const()[name = tensor<string, []>("op_223_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_223_end_0 = const()[name = tensor<string, []>("op_223_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_223_end_mask_0 = const()[name = tensor<string, []>("op_223_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_223_cast_fp16 = slice_by_index(begin = var_223_begin_0, end = var_223_end_0, end_mask = var_223_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_223_cast_fp16")];
+            tensor<int32, [4]> var_227_begin_0 = const()[name = tensor<string, []>("op_227_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_227_end_0 = const()[name = tensor<string, []>("op_227_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_227_end_mask_0 = const()[name = tensor<string, []>("op_227_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_227_cast_fp16 = slice_by_index(begin = var_227_begin_0, end = var_227_end_0, end_mask = var_227_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_227_cast_fp16")];
+            tensor<int32, [4]> var_231_begin_0 = const()[name = tensor<string, []>("op_231_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_231_end_0 = const()[name = tensor<string, []>("op_231_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_231_end_mask_0 = const()[name = tensor<string, []>("op_231_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_231_cast_fp16 = slice_by_index(begin = var_231_begin_0, end = var_231_end_0, end_mask = var_231_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_231_cast_fp16")];
+            tensor<int32, [4]> var_235_begin_0 = const()[name = tensor<string, []>("op_235_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_235_end_0 = const()[name = tensor<string, []>("op_235_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_235_end_mask_0 = const()[name = tensor<string, []>("op_235_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_235_cast_fp16 = slice_by_index(begin = var_235_begin_0, end = var_235_end_0, end_mask = var_235_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_235_cast_fp16")];
+            tensor<int32, [4]> var_239_begin_0 = const()[name = tensor<string, []>("op_239_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_239_end_0 = const()[name = tensor<string, []>("op_239_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_239_end_mask_0 = const()[name = tensor<string, []>("op_239_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_239_cast_fp16 = slice_by_index(begin = var_239_begin_0, end = var_239_end_0, end_mask = var_239_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_239_cast_fp16")];
+            tensor<int32, [4]> var_243_begin_0 = const()[name = tensor<string, []>("op_243_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_243_end_0 = const()[name = tensor<string, []>("op_243_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_243_end_mask_0 = const()[name = tensor<string, []>("op_243_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_243_cast_fp16 = slice_by_index(begin = var_243_begin_0, end = var_243_end_0, end_mask = var_243_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_243_cast_fp16")];
+            tensor<int32, [4]> var_247_begin_0 = const()[name = tensor<string, []>("op_247_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_247_end_0 = const()[name = tensor<string, []>("op_247_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_247_end_mask_0 = const()[name = tensor<string, []>("op_247_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_247_cast_fp16 = slice_by_index(begin = var_247_begin_0, end = var_247_end_0, end_mask = var_247_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_247_cast_fp16")];
+            tensor<int32, [4]> var_251_begin_0 = const()[name = tensor<string, []>("op_251_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_251_end_0 = const()[name = tensor<string, []>("op_251_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_251_end_mask_0 = const()[name = tensor<string, []>("op_251_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_251_cast_fp16 = slice_by_index(begin = var_251_begin_0, end = var_251_end_0, end_mask = var_251_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_251_cast_fp16")];
+            tensor<int32, [4]> var_255_begin_0 = const()[name = tensor<string, []>("op_255_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_255_end_0 = const()[name = tensor<string, []>("op_255_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_255_end_mask_0 = const()[name = tensor<string, []>("op_255_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_255_cast_fp16 = slice_by_index(begin = var_255_begin_0, end = var_255_end_0, end_mask = var_255_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_255_cast_fp16")];
+            tensor<int32, [4]> var_259_begin_0 = const()[name = tensor<string, []>("op_259_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_259_end_0 = const()[name = tensor<string, []>("op_259_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_259_end_mask_0 = const()[name = tensor<string, []>("op_259_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_259_cast_fp16 = slice_by_index(begin = var_259_begin_0, end = var_259_end_0, end_mask = var_259_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_259_cast_fp16")];
+            tensor<int32, [4]> var_263_begin_0 = const()[name = tensor<string, []>("op_263_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_263_end_0 = const()[name = tensor<string, []>("op_263_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_263_end_mask_0 = const()[name = tensor<string, []>("op_263_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_263_cast_fp16 = slice_by_index(begin = var_263_begin_0, end = var_263_end_0, end_mask = var_263_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_263_cast_fp16")];
+            tensor<int32, [4]> var_267_begin_0 = const()[name = tensor<string, []>("op_267_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_267_end_0 = const()[name = tensor<string, []>("op_267_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_267_end_mask_0 = const()[name = tensor<string, []>("op_267_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_267_cast_fp16 = slice_by_index(begin = var_267_begin_0, end = var_267_end_0, end_mask = var_267_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_267_cast_fp16")];
+            tensor<int32, [4]> var_271_begin_0 = const()[name = tensor<string, []>("op_271_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_271_end_0 = const()[name = tensor<string, []>("op_271_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_271_end_mask_0 = const()[name = tensor<string, []>("op_271_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_271_cast_fp16 = slice_by_index(begin = var_271_begin_0, end = var_271_end_0, end_mask = var_271_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_271_cast_fp16")];
+            tensor<int32, [4]> var_275_begin_0 = const()[name = tensor<string, []>("op_275_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_275_end_0 = const()[name = tensor<string, []>("op_275_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_275_end_mask_0 = const()[name = tensor<string, []>("op_275_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_275_cast_fp16 = slice_by_index(begin = var_275_begin_0, end = var_275_end_0, end_mask = var_275_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_275_cast_fp16")];
+            tensor<int32, [4]> var_279_begin_0 = const()[name = tensor<string, []>("op_279_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_279_end_0 = const()[name = tensor<string, []>("op_279_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_279_end_mask_0 = const()[name = tensor<string, []>("op_279_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_279_cast_fp16 = slice_by_index(begin = var_279_begin_0, end = var_279_end_0, end_mask = var_279_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_279_cast_fp16")];
+            tensor<int32, [4]> var_283_begin_0 = const()[name = tensor<string, []>("op_283_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_283_end_0 = const()[name = tensor<string, []>("op_283_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_283_end_mask_0 = const()[name = tensor<string, []>("op_283_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_283_cast_fp16 = slice_by_index(begin = var_283_begin_0, end = var_283_end_0, end_mask = var_283_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_283_cast_fp16")];
+            tensor<int32, [4]> var_289_begin_0 = const()[name = tensor<string, []>("op_289_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_289_end_0 = const()[name = tensor<string, []>("op_289_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_289_end_mask_0 = const()[name = tensor<string, []>("op_289_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_289_cast_fp16 = slice_by_index(begin = var_289_begin_0, end = var_289_end_0, end_mask = var_289_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_289_cast_fp16")];
+            tensor<int32, [4]> var_301_begin_0 = const()[name = tensor<string, []>("op_301_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_301_end_0 = const()[name = tensor<string, []>("op_301_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_301_end_mask_0 = const()[name = tensor<string, []>("op_301_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_301_cast_fp16 = slice_by_index(begin = var_301_begin_0, end = var_301_end_0, end_mask = var_301_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_301_cast_fp16")];
+            tensor<int32, [4]> var_313_begin_0 = const()[name = tensor<string, []>("op_313_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_313_end_0 = const()[name = tensor<string, []>("op_313_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_313_end_mask_0 = const()[name = tensor<string, []>("op_313_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_313_cast_fp16 = slice_by_index(begin = var_313_begin_0, end = var_313_end_0, end_mask = var_313_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_313_cast_fp16")];
+            tensor<int32, [4]> var_325_begin_0 = const()[name = tensor<string, []>("op_325_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_325_end_0 = const()[name = tensor<string, []>("op_325_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_325_end_mask_0 = const()[name = tensor<string, []>("op_325_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_325_cast_fp16 = slice_by_index(begin = var_325_begin_0, end = var_325_end_0, end_mask = var_325_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_325_cast_fp16")];
+            tensor<int32, [4]> var_337_begin_0 = const()[name = tensor<string, []>("op_337_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_337_end_0 = const()[name = tensor<string, []>("op_337_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_337_end_mask_0 = const()[name = tensor<string, []>("op_337_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_337_cast_fp16 = slice_by_index(begin = var_337_begin_0, end = var_337_end_0, end_mask = var_337_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_337_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = tensor<string, []>("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = tensor<string, []>("op_349_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = tensor<string, []>("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_349_cast_fp16")];
+            tensor<int32, [4]> var_361_begin_0 = const()[name = tensor<string, []>("op_361_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_361_end_0 = const()[name = tensor<string, []>("op_361_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_361_end_mask_0 = const()[name = tensor<string, []>("op_361_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_361_cast_fp16 = slice_by_index(begin = var_361_begin_0, end = var_361_end_0, end_mask = var_361_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_361_cast_fp16")];
+            tensor<int32, [4]> var_373_begin_0 = const()[name = tensor<string, []>("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_373_end_0 = const()[name = tensor<string, []>("op_373_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_373_end_mask_0 = const()[name = tensor<string, []>("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_373_cast_fp16")];
+            tensor<int32, [4]> var_383_begin_0 = const()[name = tensor<string, []>("op_383_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_383_end_0 = const()[name = tensor<string, []>("op_383_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_383_end_mask_0 = const()[name = tensor<string, []>("op_383_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_383_cast_fp16 = slice_by_index(begin = var_383_begin_0, end = var_383_end_0, end_mask = var_383_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_383_cast_fp16")];
+            tensor<int32, [4]> var_395_begin_0 = const()[name = tensor<string, []>("op_395_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_395_end_0 = const()[name = tensor<string, []>("op_395_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_395_end_mask_0 = const()[name = tensor<string, []>("op_395_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_395_cast_fp16 = slice_by_index(begin = var_395_begin_0, end = var_395_end_0, end_mask = var_395_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_395_cast_fp16")];
+            tensor<int32, [4]> var_407_begin_0 = const()[name = tensor<string, []>("op_407_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_407_end_0 = const()[name = tensor<string, []>("op_407_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_407_end_mask_0 = const()[name = tensor<string, []>("op_407_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_407_cast_fp16 = slice_by_index(begin = var_407_begin_0, end = var_407_end_0, end_mask = var_407_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_407_cast_fp16")];
+            tensor<int32, [4]> var_419_begin_0 = const()[name = tensor<string, []>("op_419_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_419_end_0 = const()[name = tensor<string, []>("op_419_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_419_end_mask_0 = const()[name = tensor<string, []>("op_419_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_419_cast_fp16 = slice_by_index(begin = var_419_begin_0, end = var_419_end_0, end_mask = var_419_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_419_cast_fp16")];
+            tensor<int32, [4]> var_431_begin_0 = const()[name = tensor<string, []>("op_431_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_431_end_0 = const()[name = tensor<string, []>("op_431_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_431_end_mask_0 = const()[name = tensor<string, []>("op_431_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_431_cast_fp16 = slice_by_index(begin = var_431_begin_0, end = var_431_end_0, end_mask = var_431_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_431_cast_fp16")];
+            tensor<int32, [4]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_443_cast_fp16 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_443_cast_fp16")];
+            tensor<int32, [4]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_455_cast_fp16 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_455_cast_fp16")];
+            tensor<int32, [4]> var_467_begin_0 = const()[name = tensor<string, []>("op_467_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_467_end_0 = const()[name = tensor<string, []>("op_467_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_467_end_mask_0 = const()[name = tensor<string, []>("op_467_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_467_cast_fp16 = slice_by_index(begin = var_467_begin_0, end = var_467_end_0, end_mask = var_467_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_467_cast_fp16")];
+            tensor<string, []> var_479_equation_0 = const()[name = tensor<string, []>("op_479_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_479_cast_fp16 = einsum(equation = var_479_equation_0, values = (var_289_cast_fp16, var_191_cast_fp16))[name = tensor<string, []>("op_479_cast_fp16")];
+            tensor<fp16, []> var_480_to_fp16 = const()[name = tensor<string, []>("op_480_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_481_cast_fp16 = mul(x = var_479_cast_fp16, y = var_480_to_fp16)[name = tensor<string, []>("op_481_cast_fp16")];
+            tensor<string, []> var_483_equation_0 = const()[name = tensor<string, []>("op_483_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_483_cast_fp16 = einsum(equation = var_483_equation_0, values = (var_289_cast_fp16, var_195_cast_fp16))[name = tensor<string, []>("op_483_cast_fp16")];
+            tensor<fp16, []> var_484_to_fp16 = const()[name = tensor<string, []>("op_484_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_485_cast_fp16 = mul(x = var_483_cast_fp16, y = var_484_to_fp16)[name = tensor<string, []>("op_485_cast_fp16")];
+            tensor<string, []> var_487_equation_0 = const()[name = tensor<string, []>("op_487_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_487_cast_fp16 = einsum(equation = var_487_equation_0, values = (var_289_cast_fp16, var_199_cast_fp16))[name = tensor<string, []>("op_487_cast_fp16")];
+            tensor<fp16, []> var_488_to_fp16 = const()[name = tensor<string, []>("op_488_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_489_cast_fp16 = mul(x = var_487_cast_fp16, y = var_488_to_fp16)[name = tensor<string, []>("op_489_cast_fp16")];
+            tensor<string, []> var_491_equation_0 = const()[name = tensor<string, []>("op_491_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_491_cast_fp16 = einsum(equation = var_491_equation_0, values = (var_301_cast_fp16, var_203_cast_fp16))[name = tensor<string, []>("op_491_cast_fp16")];
+            tensor<fp16, []> var_492_to_fp16 = const()[name = tensor<string, []>("op_492_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_493_cast_fp16 = mul(x = var_491_cast_fp16, y = var_492_to_fp16)[name = tensor<string, []>("op_493_cast_fp16")];
+            tensor<string, []> var_495_equation_0 = const()[name = tensor<string, []>("op_495_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_495_cast_fp16 = einsum(equation = var_495_equation_0, values = (var_301_cast_fp16, var_207_cast_fp16))[name = tensor<string, []>("op_495_cast_fp16")];
+            tensor<fp16, []> var_496_to_fp16 = const()[name = tensor<string, []>("op_496_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_497_cast_fp16 = mul(x = var_495_cast_fp16, y = var_496_to_fp16)[name = tensor<string, []>("op_497_cast_fp16")];
+            tensor<string, []> var_499_equation_0 = const()[name = tensor<string, []>("op_499_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_499_cast_fp16 = einsum(equation = var_499_equation_0, values = (var_301_cast_fp16, var_211_cast_fp16))[name = tensor<string, []>("op_499_cast_fp16")];
+            tensor<fp16, []> var_500_to_fp16 = const()[name = tensor<string, []>("op_500_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_501_cast_fp16 = mul(x = var_499_cast_fp16, y = var_500_to_fp16)[name = tensor<string, []>("op_501_cast_fp16")];
+            tensor<string, []> var_503_equation_0 = const()[name = tensor<string, []>("op_503_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_503_cast_fp16 = einsum(equation = var_503_equation_0, values = (var_313_cast_fp16, var_215_cast_fp16))[name = tensor<string, []>("op_503_cast_fp16")];
+            tensor<fp16, []> var_504_to_fp16 = const()[name = tensor<string, []>("op_504_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_505_cast_fp16 = mul(x = var_503_cast_fp16, y = var_504_to_fp16)[name = tensor<string, []>("op_505_cast_fp16")];
+            tensor<string, []> var_507_equation_0 = const()[name = tensor<string, []>("op_507_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_507_cast_fp16 = einsum(equation = var_507_equation_0, values = (var_313_cast_fp16, var_219_cast_fp16))[name = tensor<string, []>("op_507_cast_fp16")];
+            tensor<fp16, []> var_508_to_fp16 = const()[name = tensor<string, []>("op_508_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_509_cast_fp16 = mul(x = var_507_cast_fp16, y = var_508_to_fp16)[name = tensor<string, []>("op_509_cast_fp16")];
+            tensor<string, []> var_511_equation_0 = const()[name = tensor<string, []>("op_511_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_511_cast_fp16 = einsum(equation = var_511_equation_0, values = (var_313_cast_fp16, var_223_cast_fp16))[name = tensor<string, []>("op_511_cast_fp16")];
+            tensor<fp16, []> var_512_to_fp16 = const()[name = tensor<string, []>("op_512_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_513_cast_fp16 = mul(x = var_511_cast_fp16, y = var_512_to_fp16)[name = tensor<string, []>("op_513_cast_fp16")];
+            tensor<string, []> var_515_equation_0 = const()[name = tensor<string, []>("op_515_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_515_cast_fp16 = einsum(equation = var_515_equation_0, values = (var_325_cast_fp16, var_227_cast_fp16))[name = tensor<string, []>("op_515_cast_fp16")];
+            tensor<fp16, []> var_516_to_fp16 = const()[name = tensor<string, []>("op_516_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_517_cast_fp16 = mul(x = var_515_cast_fp16, y = var_516_to_fp16)[name = tensor<string, []>("op_517_cast_fp16")];
+            tensor<string, []> var_519_equation_0 = const()[name = tensor<string, []>("op_519_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_519_cast_fp16 = einsum(equation = var_519_equation_0, values = (var_325_cast_fp16, var_231_cast_fp16))[name = tensor<string, []>("op_519_cast_fp16")];
+            tensor<fp16, []> var_520_to_fp16 = const()[name = tensor<string, []>("op_520_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_521_cast_fp16 = mul(x = var_519_cast_fp16, y = var_520_to_fp16)[name = tensor<string, []>("op_521_cast_fp16")];
+            tensor<string, []> var_523_equation_0 = const()[name = tensor<string, []>("op_523_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_523_cast_fp16 = einsum(equation = var_523_equation_0, values = (var_325_cast_fp16, var_235_cast_fp16))[name = tensor<string, []>("op_523_cast_fp16")];
+            tensor<fp16, []> var_524_to_fp16 = const()[name = tensor<string, []>("op_524_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_525_cast_fp16 = mul(x = var_523_cast_fp16, y = var_524_to_fp16)[name = tensor<string, []>("op_525_cast_fp16")];
+            tensor<string, []> var_527_equation_0 = const()[name = tensor<string, []>("op_527_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_527_cast_fp16 = einsum(equation = var_527_equation_0, values = (var_337_cast_fp16, var_239_cast_fp16))[name = tensor<string, []>("op_527_cast_fp16")];
+            tensor<fp16, []> var_528_to_fp16 = const()[name = tensor<string, []>("op_528_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_529_cast_fp16 = mul(x = var_527_cast_fp16, y = var_528_to_fp16)[name = tensor<string, []>("op_529_cast_fp16")];
+            tensor<string, []> var_531_equation_0 = const()[name = tensor<string, []>("op_531_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_531_cast_fp16 = einsum(equation = var_531_equation_0, values = (var_337_cast_fp16, var_243_cast_fp16))[name = tensor<string, []>("op_531_cast_fp16")];
+            tensor<fp16, []> var_532_to_fp16 = const()[name = tensor<string, []>("op_532_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_533_cast_fp16 = mul(x = var_531_cast_fp16, y = var_532_to_fp16)[name = tensor<string, []>("op_533_cast_fp16")];
+            tensor<string, []> var_535_equation_0 = const()[name = tensor<string, []>("op_535_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_535_cast_fp16 = einsum(equation = var_535_equation_0, values = (var_337_cast_fp16, var_247_cast_fp16))[name = tensor<string, []>("op_535_cast_fp16")];
+            tensor<fp16, []> var_536_to_fp16 = const()[name = tensor<string, []>("op_536_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_537_cast_fp16 = mul(x = var_535_cast_fp16, y = var_536_to_fp16)[name = tensor<string, []>("op_537_cast_fp16")];
+            tensor<string, []> var_539_equation_0 = const()[name = tensor<string, []>("op_539_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_539_cast_fp16 = einsum(equation = var_539_equation_0, values = (var_349_cast_fp16, var_251_cast_fp16))[name = tensor<string, []>("op_539_cast_fp16")];
+            tensor<fp16, []> var_540_to_fp16 = const()[name = tensor<string, []>("op_540_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_541_cast_fp16 = mul(x = var_539_cast_fp16, y = var_540_to_fp16)[name = tensor<string, []>("op_541_cast_fp16")];
+            tensor<string, []> var_543_equation_0 = const()[name = tensor<string, []>("op_543_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_543_cast_fp16 = einsum(equation = var_543_equation_0, values = (var_349_cast_fp16, var_255_cast_fp16))[name = tensor<string, []>("op_543_cast_fp16")];
+            tensor<fp16, []> var_544_to_fp16 = const()[name = tensor<string, []>("op_544_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_545_cast_fp16 = mul(x = var_543_cast_fp16, y = var_544_to_fp16)[name = tensor<string, []>("op_545_cast_fp16")];
+            tensor<string, []> var_547_equation_0 = const()[name = tensor<string, []>("op_547_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_547_cast_fp16 = einsum(equation = var_547_equation_0, values = (var_349_cast_fp16, var_259_cast_fp16))[name = tensor<string, []>("op_547_cast_fp16")];
+            tensor<fp16, []> var_548_to_fp16 = const()[name = tensor<string, []>("op_548_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_549_cast_fp16 = mul(x = var_547_cast_fp16, y = var_548_to_fp16)[name = tensor<string, []>("op_549_cast_fp16")];
+            tensor<string, []> var_551_equation_0 = const()[name = tensor<string, []>("op_551_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_551_cast_fp16 = einsum(equation = var_551_equation_0, values = (var_361_cast_fp16, var_263_cast_fp16))[name = tensor<string, []>("op_551_cast_fp16")];
+            tensor<fp16, []> var_552_to_fp16 = const()[name = tensor<string, []>("op_552_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_553_cast_fp16 = mul(x = var_551_cast_fp16, y = var_552_to_fp16)[name = tensor<string, []>("op_553_cast_fp16")];
+            tensor<string, []> var_555_equation_0 = const()[name = tensor<string, []>("op_555_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_555_cast_fp16 = einsum(equation = var_555_equation_0, values = (var_361_cast_fp16, var_267_cast_fp16))[name = tensor<string, []>("op_555_cast_fp16")];
+            tensor<fp16, []> var_556_to_fp16 = const()[name = tensor<string, []>("op_556_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_557_cast_fp16 = mul(x = var_555_cast_fp16, y = var_556_to_fp16)[name = tensor<string, []>("op_557_cast_fp16")];
+            tensor<string, []> var_559_equation_0 = const()[name = tensor<string, []>("op_559_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_559_cast_fp16 = einsum(equation = var_559_equation_0, values = (var_361_cast_fp16, var_271_cast_fp16))[name = tensor<string, []>("op_559_cast_fp16")];
+            tensor<fp16, []> var_560_to_fp16 = const()[name = tensor<string, []>("op_560_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_561_cast_fp16 = mul(x = var_559_cast_fp16, y = var_560_to_fp16)[name = tensor<string, []>("op_561_cast_fp16")];
+            tensor<string, []> var_563_equation_0 = const()[name = tensor<string, []>("op_563_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_563_cast_fp16 = einsum(equation = var_563_equation_0, values = (var_373_cast_fp16, var_275_cast_fp16))[name = tensor<string, []>("op_563_cast_fp16")];
+            tensor<fp16, []> var_564_to_fp16 = const()[name = tensor<string, []>("op_564_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_565_cast_fp16 = mul(x = var_563_cast_fp16, y = var_564_to_fp16)[name = tensor<string, []>("op_565_cast_fp16")];
+            tensor<string, []> var_567_equation_0 = const()[name = tensor<string, []>("op_567_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_567_cast_fp16 = einsum(equation = var_567_equation_0, values = (var_373_cast_fp16, var_279_cast_fp16))[name = tensor<string, []>("op_567_cast_fp16")];
+            tensor<fp16, []> var_568_to_fp16 = const()[name = tensor<string, []>("op_568_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_569_cast_fp16 = mul(x = var_567_cast_fp16, y = var_568_to_fp16)[name = tensor<string, []>("op_569_cast_fp16")];
+            tensor<string, []> var_571_equation_0 = const()[name = tensor<string, []>("op_571_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_571_cast_fp16 = einsum(equation = var_571_equation_0, values = (var_373_cast_fp16, var_283_cast_fp16))[name = tensor<string, []>("op_571_cast_fp16")];
+            tensor<fp16, []> var_572_to_fp16 = const()[name = tensor<string, []>("op_572_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_573_cast_fp16 = mul(x = var_571_cast_fp16, y = var_572_to_fp16)[name = tensor<string, []>("op_573_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_1_cast_fp16 = add(x = var_481_cast_fp16, y = mask)[name = tensor<string, []>("aw_1_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_3_cast_fp16 = add(x = var_485_cast_fp16, y = mask)[name = tensor<string, []>("aw_3_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_5_cast_fp16 = add(x = var_489_cast_fp16, y = mask)[name = tensor<string, []>("aw_5_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_7_cast_fp16 = add(x = var_493_cast_fp16, y = mask)[name = tensor<string, []>("aw_7_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_9_cast_fp16 = add(x = var_497_cast_fp16, y = mask)[name = tensor<string, []>("aw_9_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_11_cast_fp16 = add(x = var_501_cast_fp16, y = mask)[name = tensor<string, []>("aw_11_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_13_cast_fp16 = add(x = var_505_cast_fp16, y = mask)[name = tensor<string, []>("aw_13_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_15_cast_fp16 = add(x = var_509_cast_fp16, y = mask)[name = tensor<string, []>("aw_15_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_17_cast_fp16 = add(x = var_513_cast_fp16, y = mask)[name = tensor<string, []>("aw_17_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_19_cast_fp16 = add(x = var_517_cast_fp16, y = mask)[name = tensor<string, []>("aw_19_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_21_cast_fp16 = add(x = var_521_cast_fp16, y = mask)[name = tensor<string, []>("aw_21_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_23_cast_fp16 = add(x = var_525_cast_fp16, y = mask)[name = tensor<string, []>("aw_23_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_25_cast_fp16 = add(x = var_529_cast_fp16, y = mask)[name = tensor<string, []>("aw_25_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_27_cast_fp16 = add(x = var_533_cast_fp16, y = mask)[name = tensor<string, []>("aw_27_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_29_cast_fp16 = add(x = var_537_cast_fp16, y = mask)[name = tensor<string, []>("aw_29_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_31_cast_fp16 = add(x = var_541_cast_fp16, y = mask)[name = tensor<string, []>("aw_31_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_33_cast_fp16 = add(x = var_545_cast_fp16, y = mask)[name = tensor<string, []>("aw_33_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_35_cast_fp16 = add(x = var_549_cast_fp16, y = mask)[name = tensor<string, []>("aw_35_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_37_cast_fp16 = add(x = var_553_cast_fp16, y = mask)[name = tensor<string, []>("aw_37_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_39_cast_fp16 = add(x = var_557_cast_fp16, y = mask)[name = tensor<string, []>("aw_39_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_41_cast_fp16 = add(x = var_561_cast_fp16, y = mask)[name = tensor<string, []>("aw_41_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_43_cast_fp16 = add(x = var_565_cast_fp16, y = mask)[name = tensor<string, []>("aw_43_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_45_cast_fp16 = add(x = var_569_cast_fp16, y = mask)[name = tensor<string, []>("aw_45_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_47_cast_fp16 = add(x = var_573_cast_fp16, y = mask)[name = tensor<string, []>("aw_47_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_598_cast_fp16 = softmax(axis = var_52, x = aw_1_cast_fp16)[name = tensor<string, []>("op_598_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_599_cast_fp16 = softmax(axis = var_52, x = aw_3_cast_fp16)[name = tensor<string, []>("op_599_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_600_cast_fp16 = softmax(axis = var_52, x = aw_5_cast_fp16)[name = tensor<string, []>("op_600_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_601_cast_fp16 = softmax(axis = var_52, x = aw_7_cast_fp16)[name = tensor<string, []>("op_601_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_602_cast_fp16 = softmax(axis = var_52, x = aw_9_cast_fp16)[name = tensor<string, []>("op_602_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_603_cast_fp16 = softmax(axis = var_52, x = aw_11_cast_fp16)[name = tensor<string, []>("op_603_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_604_cast_fp16 = softmax(axis = var_52, x = aw_13_cast_fp16)[name = tensor<string, []>("op_604_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_605_cast_fp16 = softmax(axis = var_52, x = aw_15_cast_fp16)[name = tensor<string, []>("op_605_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_606_cast_fp16 = softmax(axis = var_52, x = aw_17_cast_fp16)[name = tensor<string, []>("op_606_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_607_cast_fp16 = softmax(axis = var_52, x = aw_19_cast_fp16)[name = tensor<string, []>("op_607_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_608_cast_fp16 = softmax(axis = var_52, x = aw_21_cast_fp16)[name = tensor<string, []>("op_608_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_609_cast_fp16 = softmax(axis = var_52, x = aw_23_cast_fp16)[name = tensor<string, []>("op_609_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_610_cast_fp16 = softmax(axis = var_52, x = aw_25_cast_fp16)[name = tensor<string, []>("op_610_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_611_cast_fp16 = softmax(axis = var_52, x = aw_27_cast_fp16)[name = tensor<string, []>("op_611_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_612_cast_fp16 = softmax(axis = var_52, x = aw_29_cast_fp16)[name = tensor<string, []>("op_612_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_613_cast_fp16 = softmax(axis = var_52, x = aw_31_cast_fp16)[name = tensor<string, []>("op_613_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_614_cast_fp16 = softmax(axis = var_52, x = aw_33_cast_fp16)[name = tensor<string, []>("op_614_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_615_cast_fp16 = softmax(axis = var_52, x = aw_35_cast_fp16)[name = tensor<string, []>("op_615_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_616_cast_fp16 = softmax(axis = var_52, x = aw_37_cast_fp16)[name = tensor<string, []>("op_616_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_617_cast_fp16 = softmax(axis = var_52, x = aw_39_cast_fp16)[name = tensor<string, []>("op_617_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_618_cast_fp16 = softmax(axis = var_52, x = aw_41_cast_fp16)[name = tensor<string, []>("op_618_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_619_cast_fp16 = softmax(axis = var_52, x = aw_43_cast_fp16)[name = tensor<string, []>("op_619_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_620_cast_fp16 = softmax(axis = var_52, x = aw_45_cast_fp16)[name = tensor<string, []>("op_620_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_621_cast_fp16 = softmax(axis = var_52, x = aw_47_cast_fp16)[name = tensor<string, []>("op_621_cast_fp16")];
+            tensor<string, []> var_623_equation_0 = const()[name = tensor<string, []>("op_623_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_623_cast_fp16 = einsum(equation = var_623_equation_0, values = (var_383_cast_fp16, var_598_cast_fp16))[name = tensor<string, []>("op_623_cast_fp16")];
+            tensor<string, []> var_625_equation_0 = const()[name = tensor<string, []>("op_625_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_625_cast_fp16 = einsum(equation = var_625_equation_0, values = (var_383_cast_fp16, var_599_cast_fp16))[name = tensor<string, []>("op_625_cast_fp16")];
+            tensor<string, []> var_627_equation_0 = const()[name = tensor<string, []>("op_627_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_627_cast_fp16 = einsum(equation = var_627_equation_0, values = (var_383_cast_fp16, var_600_cast_fp16))[name = tensor<string, []>("op_627_cast_fp16")];
+            tensor<string, []> var_629_equation_0 = const()[name = tensor<string, []>("op_629_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_629_cast_fp16 = einsum(equation = var_629_equation_0, values = (var_395_cast_fp16, var_601_cast_fp16))[name = tensor<string, []>("op_629_cast_fp16")];
+            tensor<string, []> var_631_equation_0 = const()[name = tensor<string, []>("op_631_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_631_cast_fp16 = einsum(equation = var_631_equation_0, values = (var_395_cast_fp16, var_602_cast_fp16))[name = tensor<string, []>("op_631_cast_fp16")];
+            tensor<string, []> var_633_equation_0 = const()[name = tensor<string, []>("op_633_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_633_cast_fp16 = einsum(equation = var_633_equation_0, values = (var_395_cast_fp16, var_603_cast_fp16))[name = tensor<string, []>("op_633_cast_fp16")];
+            tensor<string, []> var_635_equation_0 = const()[name = tensor<string, []>("op_635_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_635_cast_fp16 = einsum(equation = var_635_equation_0, values = (var_407_cast_fp16, var_604_cast_fp16))[name = tensor<string, []>("op_635_cast_fp16")];
+            tensor<string, []> var_637_equation_0 = const()[name = tensor<string, []>("op_637_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_637_cast_fp16 = einsum(equation = var_637_equation_0, values = (var_407_cast_fp16, var_605_cast_fp16))[name = tensor<string, []>("op_637_cast_fp16")];
+            tensor<string, []> var_639_equation_0 = const()[name = tensor<string, []>("op_639_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_639_cast_fp16 = einsum(equation = var_639_equation_0, values = (var_407_cast_fp16, var_606_cast_fp16))[name = tensor<string, []>("op_639_cast_fp16")];
+            tensor<string, []> var_641_equation_0 = const()[name = tensor<string, []>("op_641_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_641_cast_fp16 = einsum(equation = var_641_equation_0, values = (var_419_cast_fp16, var_607_cast_fp16))[name = tensor<string, []>("op_641_cast_fp16")];
+            tensor<string, []> var_643_equation_0 = const()[name = tensor<string, []>("op_643_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_643_cast_fp16 = einsum(equation = var_643_equation_0, values = (var_419_cast_fp16, var_608_cast_fp16))[name = tensor<string, []>("op_643_cast_fp16")];
+            tensor<string, []> var_645_equation_0 = const()[name = tensor<string, []>("op_645_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_645_cast_fp16 = einsum(equation = var_645_equation_0, values = (var_419_cast_fp16, var_609_cast_fp16))[name = tensor<string, []>("op_645_cast_fp16")];
+            tensor<string, []> var_647_equation_0 = const()[name = tensor<string, []>("op_647_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_647_cast_fp16 = einsum(equation = var_647_equation_0, values = (var_431_cast_fp16, var_610_cast_fp16))[name = tensor<string, []>("op_647_cast_fp16")];
+            tensor<string, []> var_649_equation_0 = const()[name = tensor<string, []>("op_649_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_649_cast_fp16 = einsum(equation = var_649_equation_0, values = (var_431_cast_fp16, var_611_cast_fp16))[name = tensor<string, []>("op_649_cast_fp16")];
+            tensor<string, []> var_651_equation_0 = const()[name = tensor<string, []>("op_651_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_651_cast_fp16 = einsum(equation = var_651_equation_0, values = (var_431_cast_fp16, var_612_cast_fp16))[name = tensor<string, []>("op_651_cast_fp16")];
+            tensor<string, []> var_653_equation_0 = const()[name = tensor<string, []>("op_653_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_653_cast_fp16 = einsum(equation = var_653_equation_0, values = (var_443_cast_fp16, var_613_cast_fp16))[name = tensor<string, []>("op_653_cast_fp16")];
+            tensor<string, []> var_655_equation_0 = const()[name = tensor<string, []>("op_655_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_655_cast_fp16 = einsum(equation = var_655_equation_0, values = (var_443_cast_fp16, var_614_cast_fp16))[name = tensor<string, []>("op_655_cast_fp16")];
+            tensor<string, []> var_657_equation_0 = const()[name = tensor<string, []>("op_657_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_657_cast_fp16 = einsum(equation = var_657_equation_0, values = (var_443_cast_fp16, var_615_cast_fp16))[name = tensor<string, []>("op_657_cast_fp16")];
+            tensor<string, []> var_659_equation_0 = const()[name = tensor<string, []>("op_659_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_659_cast_fp16 = einsum(equation = var_659_equation_0, values = (var_455_cast_fp16, var_616_cast_fp16))[name = tensor<string, []>("op_659_cast_fp16")];
+            tensor<string, []> var_661_equation_0 = const()[name = tensor<string, []>("op_661_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_661_cast_fp16 = einsum(equation = var_661_equation_0, values = (var_455_cast_fp16, var_617_cast_fp16))[name = tensor<string, []>("op_661_cast_fp16")];
+            tensor<string, []> var_663_equation_0 = const()[name = tensor<string, []>("op_663_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_663_cast_fp16 = einsum(equation = var_663_equation_0, values = (var_455_cast_fp16, var_618_cast_fp16))[name = tensor<string, []>("op_663_cast_fp16")];
+            tensor<string, []> var_665_equation_0 = const()[name = tensor<string, []>("op_665_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_665_cast_fp16 = einsum(equation = var_665_equation_0, values = (var_467_cast_fp16, var_619_cast_fp16))[name = tensor<string, []>("op_665_cast_fp16")];
+            tensor<string, []> var_667_equation_0 = const()[name = tensor<string, []>("op_667_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_667_cast_fp16 = einsum(equation = var_667_equation_0, values = (var_467_cast_fp16, var_620_cast_fp16))[name = tensor<string, []>("op_667_cast_fp16")];
+            tensor<string, []> var_669_equation_0 = const()[name = tensor<string, []>("op_669_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_669_cast_fp16 = einsum(equation = var_669_equation_0, values = (var_467_cast_fp16, var_621_cast_fp16))[name = tensor<string, []>("op_669_cast_fp16")];
+            tensor<bool, []> x_11_interleave_0 = const()[name = tensor<string, []>("x_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_11_cast_fp16 = concat(axis = var_52, interleave = x_11_interleave_0, values = (var_623_cast_fp16, var_625_cast_fp16, var_627_cast_fp16, var_629_cast_fp16, var_631_cast_fp16, var_633_cast_fp16, var_635_cast_fp16, var_637_cast_fp16, var_639_cast_fp16, var_641_cast_fp16, var_643_cast_fp16, var_645_cast_fp16, var_647_cast_fp16, var_649_cast_fp16, var_651_cast_fp16, var_653_cast_fp16, var_655_cast_fp16, var_657_cast_fp16, var_659_cast_fp16, var_661_cast_fp16, var_663_cast_fp16, var_665_cast_fp16, var_667_cast_fp16, var_669_cast_fp16))[name = tensor<string, []>("x_11_cast_fp16")];
+            tensor<int32, [4]> var_674 = const()[name = tensor<string, []>("op_674"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_3_cast_fp16 = reshape(shape = var_674, x = x_11_cast_fp16)[name = tensor<string, []>("input_3_cast_fp16")];
+            tensor<int32, [2]> var_677 = const()[name = tensor<string, []>("op_677"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_679 = const()[name = tensor<string, []>("op_679"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_1_pad_type_0 = const()[name = tensor<string, []>("attention_output_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_1_pad_0 = const()[name = tensor<string, []>("attention_output_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31463936)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_1_cast_fp16 = conv(dilations = var_679, groups = var_52, pad = attention_output_1_pad_0, pad_type = attention_output_1_pad_type_0, strides = var_677, weight = blocks_0_attn_proj_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_13_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = tensor<string, []>("x_13_cast_fp16")];
+            tensor<bool, []> x_eps_3_interleave_0 = const()[name = tensor<string, []>("x_eps_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_3_to_fp16 = const()[name = tensor<string, []>("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338368)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_3_cast_fp16 = concat(axis = var_52, interleave = x_eps_3_interleave_0, values = (x_13_cast_fp16, eps_chan_3_to_fp16))[name = tensor<string, []>("x_eps_3_cast_fp16")];
+            tensor<int32, [1]> norm_x_3_axes_0 = const()[name = tensor<string, []>("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_55, x = x_eps_3_cast_fp16)[name = tensor<string, []>("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_7_cast_fp16 = real_div(x = x_13_cast_fp16, y = norm_x_3_cast_fp16)[name = tensor<string, []>("x_normed_7_cast_fp16")];
+            tensor<fp16, []> var_705_to_fp16 = const()[name = tensor<string, []>("op_705_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_705_to_fp16)[name = tensor<string, []>("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338560)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_5_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
+            tensor<int32, [2]> var_716 = const()[name = tensor<string, []>("op_716"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_718 = const()[name = tensor<string, []>("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_7_pad_type_0 = const()[name = tensor<string, []>("input_7_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_7_pad_0 = const()[name = tensor<string, []>("input_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50344768)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_7_cast_fp16 = conv(dilations = var_718, groups = var_52, pad = input_7_pad_0, pad_type = input_7_pad_type_0, strides = var_716, weight = blocks_0_mlp_fc_1_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
+            tensor<int32, [2]> var_722 = const()[name = tensor<string, []>("op_722"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_724 = const()[name = tensor<string, []>("op_724"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_1_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_1_pad_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(100676480)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_1_cast_fp16 = conv(dilations = var_724, groups = var_52, pad = x_fc_2_1_pad_0, pad_type = x_fc_2_1_pad_type_0, strides = var_722, weight = blocks_0_mlp_fc_2_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_727_cast_fp16 = silu(x = input_7_cast_fp16)[name = tensor<string, []>("op_727_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_9_cast_fp16 = mul(x = var_727_cast_fp16, y = x_fc_2_1_cast_fp16)[name = tensor<string, []>("input_9_cast_fp16")];
+            tensor<int32, [2]> var_730 = const()[name = tensor<string, []>("op_730"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_732 = const()[name = tensor<string, []>("op_732"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_734_pad_type_0 = const()[name = tensor<string, []>("op_734_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_734_pad_0 = const()[name = tensor<string, []>("op_734_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_0_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151008192)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_734_cast_fp16 = conv(dilations = var_732, groups = var_52, pad = var_734_pad_0, pad_type = var_734_pad_type_0, strides = var_730, weight = blocks_0_mlp_proj_weight_to_fp16, x = input_9_cast_fp16)[name = tensor<string, []>("op_734_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_17_cast_fp16 = add(x = var_734_cast_fp16, y = x_13_cast_fp16)[name = tensor<string, []>("x_17_cast_fp16")];
+            tensor<int32, []> var_740 = const()[name = tensor<string, []>("op_740"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_744 = const()[name = tensor<string, []>("op_744"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_746 = const()[name = tensor<string, []>("op_746"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_779 = const()[name = tensor<string, []>("op_779"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_782 = const()[name = tensor<string, []>("op_782"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_5_interleave_0 = const()[name = tensor<string, []>("x_eps_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_5_to_fp16 = const()[name = tensor<string, []>("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201339904)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_5_cast_fp16 = concat(axis = var_779, interleave = x_eps_5_interleave_0, values = (x_17_cast_fp16, eps_chan_5_to_fp16))[name = tensor<string, []>("x_eps_5_cast_fp16")];
+            tensor<int32, [1]> norm_x_5_axes_0 = const()[name = tensor<string, []>("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_782, x = x_eps_5_cast_fp16)[name = tensor<string, []>("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_13_cast_fp16 = real_div(x = x_17_cast_fp16, y = norm_x_5_cast_fp16)[name = tensor<string, []>("x_normed_13_cast_fp16")];
+            tensor<fp16, []> var_805_to_fp16 = const()[name = tensor<string, []>("op_805_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_805_to_fp16)[name = tensor<string, []>("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201340096)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_21_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = tensor<string, []>("x_21_cast_fp16")];
+            tensor<int32, [4]> var_829 = const()[name = tensor<string, []>("op_829"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_11_cast_fp16 = reshape(shape = var_829, x = x_21_cast_fp16)[name = tensor<string, []>("input_11_cast_fp16")];
+            tensor<int32, [2]> var_832 = const()[name = tensor<string, []>("op_832"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_834 = const()[name = tensor<string, []>("op_834"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_9_pad_type_0 = const()[name = tensor<string, []>("q_9_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_9_pad_0 = const()[name = tensor<string, []>("q_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201346304)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_9_cast_fp16 = conv(dilations = var_834, groups = var_779, pad = q_9_pad_0, pad_type = q_9_pad_type_0, strides = var_832, weight = blocks_1_attn_q_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("q_9_cast_fp16")];
+            tensor<int32, [2]> var_838 = const()[name = tensor<string, []>("op_838"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_840 = const()[name = tensor<string, []>("op_840"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_13_pad_type_0 = const()[name = tensor<string, []>("k_13_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_13_pad_0 = const()[name = tensor<string, []>("k_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(220220736)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_13_cast_fp16 = conv(dilations = var_840, groups = var_779, pad = k_13_pad_0, pad_type = k_13_pad_type_0, strides = var_838, weight = blocks_1_attn_k_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("k_13_cast_fp16")];
+            tensor<int32, [2]> var_844 = const()[name = tensor<string, []>("op_844"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_846 = const()[name = tensor<string, []>("op_846"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_11_pad_type_0 = const()[name = tensor<string, []>("v_11_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_11_pad_0 = const()[name = tensor<string, []>("v_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226512256)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_11_cast_fp16 = conv(dilations = var_846, groups = var_779, pad = v_11_pad_0, pad_type = v_11_pad_type_0, strides = var_844, weight = blocks_1_attn_v_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("v_11_cast_fp16")];
+            tensor<int32, [4]> var_849 = const()[name = tensor<string, []>("op_849"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_11_cast_fp16 = reshape(shape = var_849, x = q_9_cast_fp16)[name = tensor<string, []>("q_11_cast_fp16")];
+            tensor<int32, [4]> var_851 = const()[name = tensor<string, []>("op_851"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_15_cast_fp16 = reshape(shape = var_851, x = k_13_cast_fp16)[name = tensor<string, []>("k_15_cast_fp16")];
+            tensor<int32, [4]> var_865_begin_0 = const()[name = tensor<string, []>("op_865_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_865_end_0 = const()[name = tensor<string, []>("op_865_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_865_end_mask_0 = const()[name = tensor<string, []>("op_865_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_865_cast_fp16 = slice_by_index(begin = var_865_begin_0, end = var_865_end_0, end_mask = var_865_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_865_cast_fp16")];
+            tensor<int32, [4]> var_871_begin_0 = const()[name = tensor<string, []>("op_871_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_871_end_0 = const()[name = tensor<string, []>("op_871_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_871_end_mask_0 = const()[name = tensor<string, []>("op_871_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_871_cast_fp16 = slice_by_index(begin = var_871_begin_0, end = var_871_end_0, end_mask = var_871_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_871_cast_fp16")];
+            tensor<fp16, []> const_30_promoted_to_fp16 = const()[name = tensor<string, []>("const_30_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_873_cast_fp16 = mul(x = var_871_cast_fp16, y = const_30_promoted_to_fp16)[name = tensor<string, []>("op_873_cast_fp16")];
+            tensor<bool, []> rotated_5_interleave_0 = const()[name = tensor<string, []>("rotated_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_5_cast_fp16 = concat(axis = var_744, interleave = rotated_5_interleave_0, values = (var_873_cast_fp16, var_865_cast_fp16))[name = tensor<string, []>("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_876_cast_fp16 = mul(x = q_11_cast_fp16, y = cos)[name = tensor<string, []>("op_876_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_877_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = tensor<string, []>("op_877_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_5_cast_fp16 = add(x = var_876_cast_fp16, y = var_877_cast_fp16)[name = tensor<string, []>("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_890_begin_0 = const()[name = tensor<string, []>("op_890_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_890_end_0 = const()[name = tensor<string, []>("op_890_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_890_end_mask_0 = const()[name = tensor<string, []>("op_890_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_890_cast_fp16 = slice_by_index(begin = var_890_begin_0, end = var_890_end_0, end_mask = var_890_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_890_cast_fp16")];
+            tensor<int32, [4]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_896_cast_fp16 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_896_cast_fp16")];
+            tensor<fp16, []> const_32_promoted_to_fp16 = const()[name = tensor<string, []>("const_32_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_898_cast_fp16 = mul(x = var_896_cast_fp16, y = const_32_promoted_to_fp16)[name = tensor<string, []>("op_898_cast_fp16")];
+            tensor<bool, []> rotated_interleave_0 = const()[name = tensor<string, []>("rotated_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_cast_fp16 = concat(axis = var_744, interleave = rotated_interleave_0, values = (var_898_cast_fp16, var_890_cast_fp16))[name = tensor<string, []>("rotated_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_901_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = tensor<string, []>("op_901_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_902_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = tensor<string, []>("op_902_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_cast_fp16 = add(x = var_901_cast_fp16, y = var_902_cast_fp16)[name = tensor<string, []>("roped_cast_fp16")];
+            tensor<int32, [4]> var_905 = const()[name = tensor<string, []>("op_905"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_19_cast_fp16 = reshape(shape = var_905, x = roped_cast_fp16)[name = tensor<string, []>("k_19_cast_fp16")];
+            tensor<int32, [4]> var_907 = const()[name = tensor<string, []>("op_907"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_1 = reshape(shape = var_907, x = v_11_cast_fp16)[name = tensor<string, []>("new_v_cache_1_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_interleave_0 = const()[name = tensor<string, []>("k_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_1 = transpose(perm = k_21_perm_0, x = k_19_cast_fp16)[name = tensor<string, []>("transpose_0")];
+            tensor<fp16, [1, 512, 1, 1024]> k_cast_fp16 = concat(axis = var_746, interleave = k_interleave_0, values = (k_cache_1, new_k_cache_1))[name = tensor<string, []>("k_cast_fp16")];
+            tensor<bool, []> v_17_interleave_0 = const()[name = tensor<string, []>("v_17_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_17_cast_fp16 = concat(axis = var_740, interleave = v_17_interleave_0, values = (v_cache_1, new_v_cache_1))[name = tensor<string, []>("v_17_cast_fp16")];
+            tensor<int32, [4]> var_915 = const()[name = tensor<string, []>("op_915"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_cast_fp16 = reshape(shape = var_915, x = roped_5_cast_fp16)[name = tensor<string, []>("q_cast_fp16")];
+            tensor<int32, [4]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_920_cast_fp16 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_920_cast_fp16")];
+            tensor<int32, [4]> var_924_begin_0 = const()[name = tensor<string, []>("op_924_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_924_end_0 = const()[name = tensor<string, []>("op_924_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_924_end_mask_0 = const()[name = tensor<string, []>("op_924_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_924_cast_fp16 = slice_by_index(begin = var_924_begin_0, end = var_924_end_0, end_mask = var_924_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_924_cast_fp16")];
+            tensor<int32, [4]> var_928_begin_0 = const()[name = tensor<string, []>("op_928_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_928_end_0 = const()[name = tensor<string, []>("op_928_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_928_end_mask_0 = const()[name = tensor<string, []>("op_928_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_928_cast_fp16 = slice_by_index(begin = var_928_begin_0, end = var_928_end_0, end_mask = var_928_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_928_cast_fp16")];
+            tensor<int32, [4]> var_932_begin_0 = const()[name = tensor<string, []>("op_932_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_932_end_0 = const()[name = tensor<string, []>("op_932_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_932_end_mask_0 = const()[name = tensor<string, []>("op_932_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_932_cast_fp16 = slice_by_index(begin = var_932_begin_0, end = var_932_end_0, end_mask = var_932_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_932_cast_fp16")];
+            tensor<int32, [4]> var_936_begin_0 = const()[name = tensor<string, []>("op_936_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_936_end_0 = const()[name = tensor<string, []>("op_936_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_936_end_mask_0 = const()[name = tensor<string, []>("op_936_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_936_cast_fp16 = slice_by_index(begin = var_936_begin_0, end = var_936_end_0, end_mask = var_936_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_936_cast_fp16")];
+            tensor<int32, [4]> var_940_begin_0 = const()[name = tensor<string, []>("op_940_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_940_end_0 = const()[name = tensor<string, []>("op_940_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_940_end_mask_0 = const()[name = tensor<string, []>("op_940_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_940_cast_fp16 = slice_by_index(begin = var_940_begin_0, end = var_940_end_0, end_mask = var_940_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_940_cast_fp16")];
+            tensor<int32, [4]> var_944_begin_0 = const()[name = tensor<string, []>("op_944_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_944_end_0 = const()[name = tensor<string, []>("op_944_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_944_end_mask_0 = const()[name = tensor<string, []>("op_944_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_944_cast_fp16 = slice_by_index(begin = var_944_begin_0, end = var_944_end_0, end_mask = var_944_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_944_cast_fp16")];
+            tensor<int32, [4]> var_948_begin_0 = const()[name = tensor<string, []>("op_948_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_948_end_0 = const()[name = tensor<string, []>("op_948_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_948_end_mask_0 = const()[name = tensor<string, []>("op_948_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_948_cast_fp16 = slice_by_index(begin = var_948_begin_0, end = var_948_end_0, end_mask = var_948_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_948_cast_fp16")];
+            tensor<int32, [4]> var_952_begin_0 = const()[name = tensor<string, []>("op_952_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_952_end_0 = const()[name = tensor<string, []>("op_952_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_952_end_mask_0 = const()[name = tensor<string, []>("op_952_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_952_cast_fp16 = slice_by_index(begin = var_952_begin_0, end = var_952_end_0, end_mask = var_952_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_952_cast_fp16")];
+            tensor<int32, [4]> var_956_begin_0 = const()[name = tensor<string, []>("op_956_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_956_end_0 = const()[name = tensor<string, []>("op_956_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_956_end_mask_0 = const()[name = tensor<string, []>("op_956_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_956_cast_fp16 = slice_by_index(begin = var_956_begin_0, end = var_956_end_0, end_mask = var_956_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_956_cast_fp16")];
+            tensor<int32, [4]> var_960_begin_0 = const()[name = tensor<string, []>("op_960_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_960_end_0 = const()[name = tensor<string, []>("op_960_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_960_end_mask_0 = const()[name = tensor<string, []>("op_960_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_960_cast_fp16 = slice_by_index(begin = var_960_begin_0, end = var_960_end_0, end_mask = var_960_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_960_cast_fp16")];
+            tensor<int32, [4]> var_964_begin_0 = const()[name = tensor<string, []>("op_964_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_964_end_0 = const()[name = tensor<string, []>("op_964_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_964_end_mask_0 = const()[name = tensor<string, []>("op_964_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_964_cast_fp16 = slice_by_index(begin = var_964_begin_0, end = var_964_end_0, end_mask = var_964_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_964_cast_fp16")];
+            tensor<int32, [4]> var_968_begin_0 = const()[name = tensor<string, []>("op_968_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_968_end_0 = const()[name = tensor<string, []>("op_968_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_968_end_mask_0 = const()[name = tensor<string, []>("op_968_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_968_cast_fp16 = slice_by_index(begin = var_968_begin_0, end = var_968_end_0, end_mask = var_968_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_968_cast_fp16")];
+            tensor<int32, [4]> var_972_begin_0 = const()[name = tensor<string, []>("op_972_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_972_end_0 = const()[name = tensor<string, []>("op_972_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_972_end_mask_0 = const()[name = tensor<string, []>("op_972_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_972_cast_fp16 = slice_by_index(begin = var_972_begin_0, end = var_972_end_0, end_mask = var_972_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_972_cast_fp16")];
+            tensor<int32, [4]> var_976_begin_0 = const()[name = tensor<string, []>("op_976_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_976_end_0 = const()[name = tensor<string, []>("op_976_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_976_end_mask_0 = const()[name = tensor<string, []>("op_976_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_976_cast_fp16 = slice_by_index(begin = var_976_begin_0, end = var_976_end_0, end_mask = var_976_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_976_cast_fp16")];
+            tensor<int32, [4]> var_980_begin_0 = const()[name = tensor<string, []>("op_980_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_980_end_0 = const()[name = tensor<string, []>("op_980_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_980_end_mask_0 = const()[name = tensor<string, []>("op_980_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_980_cast_fp16 = slice_by_index(begin = var_980_begin_0, end = var_980_end_0, end_mask = var_980_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_980_cast_fp16")];
+            tensor<int32, [4]> var_984_begin_0 = const()[name = tensor<string, []>("op_984_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_984_end_0 = const()[name = tensor<string, []>("op_984_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_984_end_mask_0 = const()[name = tensor<string, []>("op_984_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_984_cast_fp16 = slice_by_index(begin = var_984_begin_0, end = var_984_end_0, end_mask = var_984_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_984_cast_fp16")];
+            tensor<int32, [4]> var_988_begin_0 = const()[name = tensor<string, []>("op_988_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_988_end_0 = const()[name = tensor<string, []>("op_988_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_988_end_mask_0 = const()[name = tensor<string, []>("op_988_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_988_cast_fp16 = slice_by_index(begin = var_988_begin_0, end = var_988_end_0, end_mask = var_988_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_988_cast_fp16")];
+            tensor<int32, [4]> var_992_begin_0 = const()[name = tensor<string, []>("op_992_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_992_end_0 = const()[name = tensor<string, []>("op_992_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_992_end_mask_0 = const()[name = tensor<string, []>("op_992_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_992_cast_fp16 = slice_by_index(begin = var_992_begin_0, end = var_992_end_0, end_mask = var_992_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_992_cast_fp16")];
+            tensor<int32, [4]> var_996_begin_0 = const()[name = tensor<string, []>("op_996_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_996_end_0 = const()[name = tensor<string, []>("op_996_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_996_end_mask_0 = const()[name = tensor<string, []>("op_996_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_996_cast_fp16 = slice_by_index(begin = var_996_begin_0, end = var_996_end_0, end_mask = var_996_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_996_cast_fp16")];
+            tensor<int32, [4]> var_1000_begin_0 = const()[name = tensor<string, []>("op_1000_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_1000_end_0 = const()[name = tensor<string, []>("op_1000_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_1000_end_mask_0 = const()[name = tensor<string, []>("op_1000_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1000_cast_fp16 = slice_by_index(begin = var_1000_begin_0, end = var_1000_end_0, end_mask = var_1000_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1000_cast_fp16")];
+            tensor<int32, [4]> var_1004_begin_0 = const()[name = tensor<string, []>("op_1004_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_1004_end_0 = const()[name = tensor<string, []>("op_1004_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_1004_end_mask_0 = const()[name = tensor<string, []>("op_1004_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1004_cast_fp16 = slice_by_index(begin = var_1004_begin_0, end = var_1004_end_0, end_mask = var_1004_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1004_cast_fp16")];
+            tensor<int32, [4]> var_1008_begin_0 = const()[name = tensor<string, []>("op_1008_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_1008_end_0 = const()[name = tensor<string, []>("op_1008_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_1008_end_mask_0 = const()[name = tensor<string, []>("op_1008_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1008_cast_fp16 = slice_by_index(begin = var_1008_begin_0, end = var_1008_end_0, end_mask = var_1008_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1008_cast_fp16")];
+            tensor<int32, [4]> var_1012_begin_0 = const()[name = tensor<string, []>("op_1012_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_1012_end_0 = const()[name = tensor<string, []>("op_1012_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_1012_end_mask_0 = const()[name = tensor<string, []>("op_1012_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1012_cast_fp16 = slice_by_index(begin = var_1012_begin_0, end = var_1012_end_0, end_mask = var_1012_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1012_cast_fp16")];
+            tensor<int32, [4]> var_1018_begin_0 = const()[name = tensor<string, []>("op_1018_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1018_end_0 = const()[name = tensor<string, []>("op_1018_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_1018_end_mask_0 = const()[name = tensor<string, []>("op_1018_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1018_cast_fp16 = slice_by_index(begin = var_1018_begin_0, end = var_1018_end_0, end_mask = var_1018_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1018_cast_fp16")];
+            tensor<int32, [4]> var_1030_begin_0 = const()[name = tensor<string, []>("op_1030_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_1030_end_0 = const()[name = tensor<string, []>("op_1030_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_1030_end_mask_0 = const()[name = tensor<string, []>("op_1030_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1030_cast_fp16 = slice_by_index(begin = var_1030_begin_0, end = var_1030_end_0, end_mask = var_1030_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1030_cast_fp16")];
+            tensor<int32, [4]> var_1042_begin_0 = const()[name = tensor<string, []>("op_1042_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_1042_end_0 = const()[name = tensor<string, []>("op_1042_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_1042_end_mask_0 = const()[name = tensor<string, []>("op_1042_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1042_cast_fp16 = slice_by_index(begin = var_1042_begin_0, end = var_1042_end_0, end_mask = var_1042_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1042_cast_fp16")];
+            tensor<int32, [4]> var_1054_begin_0 = const()[name = tensor<string, []>("op_1054_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_1054_end_0 = const()[name = tensor<string, []>("op_1054_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1054_end_mask_0 = const()[name = tensor<string, []>("op_1054_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1054_cast_fp16 = slice_by_index(begin = var_1054_begin_0, end = var_1054_end_0, end_mask = var_1054_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1054_cast_fp16")];
+            tensor<int32, [4]> var_1066_begin_0 = const()[name = tensor<string, []>("op_1066_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_1066_end_0 = const()[name = tensor<string, []>("op_1066_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_1066_end_mask_0 = const()[name = tensor<string, []>("op_1066_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1066_cast_fp16 = slice_by_index(begin = var_1066_begin_0, end = var_1066_end_0, end_mask = var_1066_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1066_cast_fp16")];
+            tensor<int32, [4]> var_1078_begin_0 = const()[name = tensor<string, []>("op_1078_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_1078_end_0 = const()[name = tensor<string, []>("op_1078_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_1078_end_mask_0 = const()[name = tensor<string, []>("op_1078_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1078_cast_fp16 = slice_by_index(begin = var_1078_begin_0, end = var_1078_end_0, end_mask = var_1078_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1078_cast_fp16")];
+            tensor<int32, [4]> var_1090_begin_0 = const()[name = tensor<string, []>("op_1090_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_1090_end_0 = const()[name = tensor<string, []>("op_1090_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_1090_end_mask_0 = const()[name = tensor<string, []>("op_1090_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1090_cast_fp16 = slice_by_index(begin = var_1090_begin_0, end = var_1090_end_0, end_mask = var_1090_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1090_cast_fp16")];
+            tensor<int32, [4]> var_1102_begin_0 = const()[name = tensor<string, []>("op_1102_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_1102_end_0 = const()[name = tensor<string, []>("op_1102_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_1102_end_mask_0 = const()[name = tensor<string, []>("op_1102_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1102_cast_fp16 = slice_by_index(begin = var_1102_begin_0, end = var_1102_end_0, end_mask = var_1102_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1102_cast_fp16")];
+            tensor<int32, [4]> var_1112_begin_0 = const()[name = tensor<string, []>("op_1112_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1112_end_0 = const()[name = tensor<string, []>("op_1112_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_1112_end_mask_0 = const()[name = tensor<string, []>("op_1112_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1112_cast_fp16 = slice_by_index(begin = var_1112_begin_0, end = var_1112_end_0, end_mask = var_1112_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1112_cast_fp16")];
+            tensor<int32, [4]> var_1124_begin_0 = const()[name = tensor<string, []>("op_1124_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_1124_end_0 = const()[name = tensor<string, []>("op_1124_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_1124_end_mask_0 = const()[name = tensor<string, []>("op_1124_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1124_cast_fp16 = slice_by_index(begin = var_1124_begin_0, end = var_1124_end_0, end_mask = var_1124_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1124_cast_fp16")];
+            tensor<int32, [4]> var_1136_begin_0 = const()[name = tensor<string, []>("op_1136_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_1136_end_0 = const()[name = tensor<string, []>("op_1136_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_1136_end_mask_0 = const()[name = tensor<string, []>("op_1136_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1136_cast_fp16 = slice_by_index(begin = var_1136_begin_0, end = var_1136_end_0, end_mask = var_1136_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1136_cast_fp16")];
+            tensor<int32, [4]> var_1148_begin_0 = const()[name = tensor<string, []>("op_1148_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_1148_end_0 = const()[name = tensor<string, []>("op_1148_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1148_end_mask_0 = const()[name = tensor<string, []>("op_1148_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1148_cast_fp16 = slice_by_index(begin = var_1148_begin_0, end = var_1148_end_0, end_mask = var_1148_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1148_cast_fp16")];
+            tensor<int32, [4]> var_1160_begin_0 = const()[name = tensor<string, []>("op_1160_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_1160_end_0 = const()[name = tensor<string, []>("op_1160_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_1160_end_mask_0 = const()[name = tensor<string, []>("op_1160_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1160_cast_fp16 = slice_by_index(begin = var_1160_begin_0, end = var_1160_end_0, end_mask = var_1160_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1160_cast_fp16")];
+            tensor<int32, [4]> var_1172_begin_0 = const()[name = tensor<string, []>("op_1172_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_1172_end_0 = const()[name = tensor<string, []>("op_1172_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_1172_end_mask_0 = const()[name = tensor<string, []>("op_1172_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1172_cast_fp16 = slice_by_index(begin = var_1172_begin_0, end = var_1172_end_0, end_mask = var_1172_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1172_cast_fp16")];
+            tensor<int32, [4]> var_1184_begin_0 = const()[name = tensor<string, []>("op_1184_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_1184_end_0 = const()[name = tensor<string, []>("op_1184_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_1184_end_mask_0 = const()[name = tensor<string, []>("op_1184_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1184_cast_fp16 = slice_by_index(begin = var_1184_begin_0, end = var_1184_end_0, end_mask = var_1184_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1184_cast_fp16")];
+            tensor<int32, [4]> var_1196_begin_0 = const()[name = tensor<string, []>("op_1196_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_1196_end_0 = const()[name = tensor<string, []>("op_1196_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_1196_end_mask_0 = const()[name = tensor<string, []>("op_1196_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1196_cast_fp16 = slice_by_index(begin = var_1196_begin_0, end = var_1196_end_0, end_mask = var_1196_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1196_cast_fp16")];
+            tensor<string, []> var_1208_equation_0 = const()[name = tensor<string, []>("op_1208_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1208_cast_fp16 = einsum(equation = var_1208_equation_0, values = (var_1018_cast_fp16, var_920_cast_fp16))[name = tensor<string, []>("op_1208_cast_fp16")];
+            tensor<fp16, []> var_1209_to_fp16 = const()[name = tensor<string, []>("op_1209_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1210_cast_fp16 = mul(x = var_1208_cast_fp16, y = var_1209_to_fp16)[name = tensor<string, []>("op_1210_cast_fp16")];
+            tensor<string, []> var_1212_equation_0 = const()[name = tensor<string, []>("op_1212_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1212_cast_fp16 = einsum(equation = var_1212_equation_0, values = (var_1018_cast_fp16, var_924_cast_fp16))[name = tensor<string, []>("op_1212_cast_fp16")];
+            tensor<fp16, []> var_1213_to_fp16 = const()[name = tensor<string, []>("op_1213_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1214_cast_fp16 = mul(x = var_1212_cast_fp16, y = var_1213_to_fp16)[name = tensor<string, []>("op_1214_cast_fp16")];
+            tensor<string, []> var_1216_equation_0 = const()[name = tensor<string, []>("op_1216_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1216_cast_fp16 = einsum(equation = var_1216_equation_0, values = (var_1018_cast_fp16, var_928_cast_fp16))[name = tensor<string, []>("op_1216_cast_fp16")];
+            tensor<fp16, []> var_1217_to_fp16 = const()[name = tensor<string, []>("op_1217_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1218_cast_fp16 = mul(x = var_1216_cast_fp16, y = var_1217_to_fp16)[name = tensor<string, []>("op_1218_cast_fp16")];
+            tensor<string, []> var_1220_equation_0 = const()[name = tensor<string, []>("op_1220_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1220_cast_fp16 = einsum(equation = var_1220_equation_0, values = (var_1030_cast_fp16, var_932_cast_fp16))[name = tensor<string, []>("op_1220_cast_fp16")];
+            tensor<fp16, []> var_1221_to_fp16 = const()[name = tensor<string, []>("op_1221_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1222_cast_fp16 = mul(x = var_1220_cast_fp16, y = var_1221_to_fp16)[name = tensor<string, []>("op_1222_cast_fp16")];
+            tensor<string, []> var_1224_equation_0 = const()[name = tensor<string, []>("op_1224_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1224_cast_fp16 = einsum(equation = var_1224_equation_0, values = (var_1030_cast_fp16, var_936_cast_fp16))[name = tensor<string, []>("op_1224_cast_fp16")];
+            tensor<fp16, []> var_1225_to_fp16 = const()[name = tensor<string, []>("op_1225_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1226_cast_fp16 = mul(x = var_1224_cast_fp16, y = var_1225_to_fp16)[name = tensor<string, []>("op_1226_cast_fp16")];
+            tensor<string, []> var_1228_equation_0 = const()[name = tensor<string, []>("op_1228_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1228_cast_fp16 = einsum(equation = var_1228_equation_0, values = (var_1030_cast_fp16, var_940_cast_fp16))[name = tensor<string, []>("op_1228_cast_fp16")];
+            tensor<fp16, []> var_1229_to_fp16 = const()[name = tensor<string, []>("op_1229_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1230_cast_fp16 = mul(x = var_1228_cast_fp16, y = var_1229_to_fp16)[name = tensor<string, []>("op_1230_cast_fp16")];
+            tensor<string, []> var_1232_equation_0 = const()[name = tensor<string, []>("op_1232_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1232_cast_fp16 = einsum(equation = var_1232_equation_0, values = (var_1042_cast_fp16, var_944_cast_fp16))[name = tensor<string, []>("op_1232_cast_fp16")];
+            tensor<fp16, []> var_1233_to_fp16 = const()[name = tensor<string, []>("op_1233_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1234_cast_fp16 = mul(x = var_1232_cast_fp16, y = var_1233_to_fp16)[name = tensor<string, []>("op_1234_cast_fp16")];
+            tensor<string, []> var_1236_equation_0 = const()[name = tensor<string, []>("op_1236_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1236_cast_fp16 = einsum(equation = var_1236_equation_0, values = (var_1042_cast_fp16, var_948_cast_fp16))[name = tensor<string, []>("op_1236_cast_fp16")];
+            tensor<fp16, []> var_1237_to_fp16 = const()[name = tensor<string, []>("op_1237_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1238_cast_fp16 = mul(x = var_1236_cast_fp16, y = var_1237_to_fp16)[name = tensor<string, []>("op_1238_cast_fp16")];
+            tensor<string, []> var_1240_equation_0 = const()[name = tensor<string, []>("op_1240_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1240_cast_fp16 = einsum(equation = var_1240_equation_0, values = (var_1042_cast_fp16, var_952_cast_fp16))[name = tensor<string, []>("op_1240_cast_fp16")];
+            tensor<fp16, []> var_1241_to_fp16 = const()[name = tensor<string, []>("op_1241_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1242_cast_fp16 = mul(x = var_1240_cast_fp16, y = var_1241_to_fp16)[name = tensor<string, []>("op_1242_cast_fp16")];
+            tensor<string, []> var_1244_equation_0 = const()[name = tensor<string, []>("op_1244_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1244_cast_fp16 = einsum(equation = var_1244_equation_0, values = (var_1054_cast_fp16, var_956_cast_fp16))[name = tensor<string, []>("op_1244_cast_fp16")];
+            tensor<fp16, []> var_1245_to_fp16 = const()[name = tensor<string, []>("op_1245_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1246_cast_fp16 = mul(x = var_1244_cast_fp16, y = var_1245_to_fp16)[name = tensor<string, []>("op_1246_cast_fp16")];
+            tensor<string, []> var_1248_equation_0 = const()[name = tensor<string, []>("op_1248_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1248_cast_fp16 = einsum(equation = var_1248_equation_0, values = (var_1054_cast_fp16, var_960_cast_fp16))[name = tensor<string, []>("op_1248_cast_fp16")];
+            tensor<fp16, []> var_1249_to_fp16 = const()[name = tensor<string, []>("op_1249_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1250_cast_fp16 = mul(x = var_1248_cast_fp16, y = var_1249_to_fp16)[name = tensor<string, []>("op_1250_cast_fp16")];
+            tensor<string, []> var_1252_equation_0 = const()[name = tensor<string, []>("op_1252_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1252_cast_fp16 = einsum(equation = var_1252_equation_0, values = (var_1054_cast_fp16, var_964_cast_fp16))[name = tensor<string, []>("op_1252_cast_fp16")];
+            tensor<fp16, []> var_1253_to_fp16 = const()[name = tensor<string, []>("op_1253_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1254_cast_fp16 = mul(x = var_1252_cast_fp16, y = var_1253_to_fp16)[name = tensor<string, []>("op_1254_cast_fp16")];
+            tensor<string, []> var_1256_equation_0 = const()[name = tensor<string, []>("op_1256_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1256_cast_fp16 = einsum(equation = var_1256_equation_0, values = (var_1066_cast_fp16, var_968_cast_fp16))[name = tensor<string, []>("op_1256_cast_fp16")];
+            tensor<fp16, []> var_1257_to_fp16 = const()[name = tensor<string, []>("op_1257_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1258_cast_fp16 = mul(x = var_1256_cast_fp16, y = var_1257_to_fp16)[name = tensor<string, []>("op_1258_cast_fp16")];
+            tensor<string, []> var_1260_equation_0 = const()[name = tensor<string, []>("op_1260_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1260_cast_fp16 = einsum(equation = var_1260_equation_0, values = (var_1066_cast_fp16, var_972_cast_fp16))[name = tensor<string, []>("op_1260_cast_fp16")];
+            tensor<fp16, []> var_1261_to_fp16 = const()[name = tensor<string, []>("op_1261_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1262_cast_fp16 = mul(x = var_1260_cast_fp16, y = var_1261_to_fp16)[name = tensor<string, []>("op_1262_cast_fp16")];
+            tensor<string, []> var_1264_equation_0 = const()[name = tensor<string, []>("op_1264_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1264_cast_fp16 = einsum(equation = var_1264_equation_0, values = (var_1066_cast_fp16, var_976_cast_fp16))[name = tensor<string, []>("op_1264_cast_fp16")];
+            tensor<fp16, []> var_1265_to_fp16 = const()[name = tensor<string, []>("op_1265_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1266_cast_fp16 = mul(x = var_1264_cast_fp16, y = var_1265_to_fp16)[name = tensor<string, []>("op_1266_cast_fp16")];
+            tensor<string, []> var_1268_equation_0 = const()[name = tensor<string, []>("op_1268_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1268_cast_fp16 = einsum(equation = var_1268_equation_0, values = (var_1078_cast_fp16, var_980_cast_fp16))[name = tensor<string, []>("op_1268_cast_fp16")];
+            tensor<fp16, []> var_1269_to_fp16 = const()[name = tensor<string, []>("op_1269_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1270_cast_fp16 = mul(x = var_1268_cast_fp16, y = var_1269_to_fp16)[name = tensor<string, []>("op_1270_cast_fp16")];
+            tensor<string, []> var_1272_equation_0 = const()[name = tensor<string, []>("op_1272_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1272_cast_fp16 = einsum(equation = var_1272_equation_0, values = (var_1078_cast_fp16, var_984_cast_fp16))[name = tensor<string, []>("op_1272_cast_fp16")];
+            tensor<fp16, []> var_1273_to_fp16 = const()[name = tensor<string, []>("op_1273_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1274_cast_fp16 = mul(x = var_1272_cast_fp16, y = var_1273_to_fp16)[name = tensor<string, []>("op_1274_cast_fp16")];
+            tensor<string, []> var_1276_equation_0 = const()[name = tensor<string, []>("op_1276_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1276_cast_fp16 = einsum(equation = var_1276_equation_0, values = (var_1078_cast_fp16, var_988_cast_fp16))[name = tensor<string, []>("op_1276_cast_fp16")];
+            tensor<fp16, []> var_1277_to_fp16 = const()[name = tensor<string, []>("op_1277_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1278_cast_fp16 = mul(x = var_1276_cast_fp16, y = var_1277_to_fp16)[name = tensor<string, []>("op_1278_cast_fp16")];
+            tensor<string, []> var_1280_equation_0 = const()[name = tensor<string, []>("op_1280_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1280_cast_fp16 = einsum(equation = var_1280_equation_0, values = (var_1090_cast_fp16, var_992_cast_fp16))[name = tensor<string, []>("op_1280_cast_fp16")];
+            tensor<fp16, []> var_1281_to_fp16 = const()[name = tensor<string, []>("op_1281_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1282_cast_fp16 = mul(x = var_1280_cast_fp16, y = var_1281_to_fp16)[name = tensor<string, []>("op_1282_cast_fp16")];
+            tensor<string, []> var_1284_equation_0 = const()[name = tensor<string, []>("op_1284_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1284_cast_fp16 = einsum(equation = var_1284_equation_0, values = (var_1090_cast_fp16, var_996_cast_fp16))[name = tensor<string, []>("op_1284_cast_fp16")];
+            tensor<fp16, []> var_1285_to_fp16 = const()[name = tensor<string, []>("op_1285_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1286_cast_fp16 = mul(x = var_1284_cast_fp16, y = var_1285_to_fp16)[name = tensor<string, []>("op_1286_cast_fp16")];
+            tensor<string, []> var_1288_equation_0 = const()[name = tensor<string, []>("op_1288_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1288_cast_fp16 = einsum(equation = var_1288_equation_0, values = (var_1090_cast_fp16, var_1000_cast_fp16))[name = tensor<string, []>("op_1288_cast_fp16")];
+            tensor<fp16, []> var_1289_to_fp16 = const()[name = tensor<string, []>("op_1289_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1290_cast_fp16 = mul(x = var_1288_cast_fp16, y = var_1289_to_fp16)[name = tensor<string, []>("op_1290_cast_fp16")];
+            tensor<string, []> var_1292_equation_0 = const()[name = tensor<string, []>("op_1292_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1292_cast_fp16 = einsum(equation = var_1292_equation_0, values = (var_1102_cast_fp16, var_1004_cast_fp16))[name = tensor<string, []>("op_1292_cast_fp16")];
+            tensor<fp16, []> var_1293_to_fp16 = const()[name = tensor<string, []>("op_1293_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1294_cast_fp16 = mul(x = var_1292_cast_fp16, y = var_1293_to_fp16)[name = tensor<string, []>("op_1294_cast_fp16")];
+            tensor<string, []> var_1296_equation_0 = const()[name = tensor<string, []>("op_1296_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1296_cast_fp16 = einsum(equation = var_1296_equation_0, values = (var_1102_cast_fp16, var_1008_cast_fp16))[name = tensor<string, []>("op_1296_cast_fp16")];
+            tensor<fp16, []> var_1297_to_fp16 = const()[name = tensor<string, []>("op_1297_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1298_cast_fp16 = mul(x = var_1296_cast_fp16, y = var_1297_to_fp16)[name = tensor<string, []>("op_1298_cast_fp16")];
+            tensor<string, []> var_1300_equation_0 = const()[name = tensor<string, []>("op_1300_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1300_cast_fp16 = einsum(equation = var_1300_equation_0, values = (var_1102_cast_fp16, var_1012_cast_fp16))[name = tensor<string, []>("op_1300_cast_fp16")];
+            tensor<fp16, []> var_1301_to_fp16 = const()[name = tensor<string, []>("op_1301_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1302_cast_fp16 = mul(x = var_1300_cast_fp16, y = var_1301_to_fp16)[name = tensor<string, []>("op_1302_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_49_cast_fp16 = add(x = var_1210_cast_fp16, y = mask)[name = tensor<string, []>("aw_49_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_51_cast_fp16 = add(x = var_1214_cast_fp16, y = mask)[name = tensor<string, []>("aw_51_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_53_cast_fp16 = add(x = var_1218_cast_fp16, y = mask)[name = tensor<string, []>("aw_53_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_55_cast_fp16 = add(x = var_1222_cast_fp16, y = mask)[name = tensor<string, []>("aw_55_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_57_cast_fp16 = add(x = var_1226_cast_fp16, y = mask)[name = tensor<string, []>("aw_57_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_59_cast_fp16 = add(x = var_1230_cast_fp16, y = mask)[name = tensor<string, []>("aw_59_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_61_cast_fp16 = add(x = var_1234_cast_fp16, y = mask)[name = tensor<string, []>("aw_61_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_63_cast_fp16 = add(x = var_1238_cast_fp16, y = mask)[name = tensor<string, []>("aw_63_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_65_cast_fp16 = add(x = var_1242_cast_fp16, y = mask)[name = tensor<string, []>("aw_65_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_67_cast_fp16 = add(x = var_1246_cast_fp16, y = mask)[name = tensor<string, []>("aw_67_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_69_cast_fp16 = add(x = var_1250_cast_fp16, y = mask)[name = tensor<string, []>("aw_69_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_71_cast_fp16 = add(x = var_1254_cast_fp16, y = mask)[name = tensor<string, []>("aw_71_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_73_cast_fp16 = add(x = var_1258_cast_fp16, y = mask)[name = tensor<string, []>("aw_73_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_75_cast_fp16 = add(x = var_1262_cast_fp16, y = mask)[name = tensor<string, []>("aw_75_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_77_cast_fp16 = add(x = var_1266_cast_fp16, y = mask)[name = tensor<string, []>("aw_77_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_79_cast_fp16 = add(x = var_1270_cast_fp16, y = mask)[name = tensor<string, []>("aw_79_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_81_cast_fp16 = add(x = var_1274_cast_fp16, y = mask)[name = tensor<string, []>("aw_81_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_83_cast_fp16 = add(x = var_1278_cast_fp16, y = mask)[name = tensor<string, []>("aw_83_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_85_cast_fp16 = add(x = var_1282_cast_fp16, y = mask)[name = tensor<string, []>("aw_85_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_87_cast_fp16 = add(x = var_1286_cast_fp16, y = mask)[name = tensor<string, []>("aw_87_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_89_cast_fp16 = add(x = var_1290_cast_fp16, y = mask)[name = tensor<string, []>("aw_89_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_91_cast_fp16 = add(x = var_1294_cast_fp16, y = mask)[name = tensor<string, []>("aw_91_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_93_cast_fp16 = add(x = var_1298_cast_fp16, y = mask)[name = tensor<string, []>("aw_93_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_cast_fp16 = add(x = var_1302_cast_fp16, y = mask)[name = tensor<string, []>("aw_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1327_cast_fp16 = softmax(axis = var_779, x = aw_49_cast_fp16)[name = tensor<string, []>("op_1327_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1328_cast_fp16 = softmax(axis = var_779, x = aw_51_cast_fp16)[name = tensor<string, []>("op_1328_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1329_cast_fp16 = softmax(axis = var_779, x = aw_53_cast_fp16)[name = tensor<string, []>("op_1329_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1330_cast_fp16 = softmax(axis = var_779, x = aw_55_cast_fp16)[name = tensor<string, []>("op_1330_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1331_cast_fp16 = softmax(axis = var_779, x = aw_57_cast_fp16)[name = tensor<string, []>("op_1331_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1332_cast_fp16 = softmax(axis = var_779, x = aw_59_cast_fp16)[name = tensor<string, []>("op_1332_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1333_cast_fp16 = softmax(axis = var_779, x = aw_61_cast_fp16)[name = tensor<string, []>("op_1333_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1334_cast_fp16 = softmax(axis = var_779, x = aw_63_cast_fp16)[name = tensor<string, []>("op_1334_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1335_cast_fp16 = softmax(axis = var_779, x = aw_65_cast_fp16)[name = tensor<string, []>("op_1335_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1336_cast_fp16 = softmax(axis = var_779, x = aw_67_cast_fp16)[name = tensor<string, []>("op_1336_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1337_cast_fp16 = softmax(axis = var_779, x = aw_69_cast_fp16)[name = tensor<string, []>("op_1337_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1338_cast_fp16 = softmax(axis = var_779, x = aw_71_cast_fp16)[name = tensor<string, []>("op_1338_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1339_cast_fp16 = softmax(axis = var_779, x = aw_73_cast_fp16)[name = tensor<string, []>("op_1339_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1340_cast_fp16 = softmax(axis = var_779, x = aw_75_cast_fp16)[name = tensor<string, []>("op_1340_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1341_cast_fp16 = softmax(axis = var_779, x = aw_77_cast_fp16)[name = tensor<string, []>("op_1341_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1342_cast_fp16 = softmax(axis = var_779, x = aw_79_cast_fp16)[name = tensor<string, []>("op_1342_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1343_cast_fp16 = softmax(axis = var_779, x = aw_81_cast_fp16)[name = tensor<string, []>("op_1343_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1344_cast_fp16 = softmax(axis = var_779, x = aw_83_cast_fp16)[name = tensor<string, []>("op_1344_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1345_cast_fp16 = softmax(axis = var_779, x = aw_85_cast_fp16)[name = tensor<string, []>("op_1345_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1346_cast_fp16 = softmax(axis = var_779, x = aw_87_cast_fp16)[name = tensor<string, []>("op_1346_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1347_cast_fp16 = softmax(axis = var_779, x = aw_89_cast_fp16)[name = tensor<string, []>("op_1347_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1348_cast_fp16 = softmax(axis = var_779, x = aw_91_cast_fp16)[name = tensor<string, []>("op_1348_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1349_cast_fp16 = softmax(axis = var_779, x = aw_93_cast_fp16)[name = tensor<string, []>("op_1349_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1350_cast_fp16 = softmax(axis = var_779, x = aw_cast_fp16)[name = tensor<string, []>("op_1350_cast_fp16")];
+            tensor<string, []> var_1352_equation_0 = const()[name = tensor<string, []>("op_1352_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1352_cast_fp16 = einsum(equation = var_1352_equation_0, values = (var_1112_cast_fp16, var_1327_cast_fp16))[name = tensor<string, []>("op_1352_cast_fp16")];
+            tensor<string, []> var_1354_equation_0 = const()[name = tensor<string, []>("op_1354_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1354_cast_fp16 = einsum(equation = var_1354_equation_0, values = (var_1112_cast_fp16, var_1328_cast_fp16))[name = tensor<string, []>("op_1354_cast_fp16")];
+            tensor<string, []> var_1356_equation_0 = const()[name = tensor<string, []>("op_1356_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1356_cast_fp16 = einsum(equation = var_1356_equation_0, values = (var_1112_cast_fp16, var_1329_cast_fp16))[name = tensor<string, []>("op_1356_cast_fp16")];
+            tensor<string, []> var_1358_equation_0 = const()[name = tensor<string, []>("op_1358_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1358_cast_fp16 = einsum(equation = var_1358_equation_0, values = (var_1124_cast_fp16, var_1330_cast_fp16))[name = tensor<string, []>("op_1358_cast_fp16")];
+            tensor<string, []> var_1360_equation_0 = const()[name = tensor<string, []>("op_1360_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1360_cast_fp16 = einsum(equation = var_1360_equation_0, values = (var_1124_cast_fp16, var_1331_cast_fp16))[name = tensor<string, []>("op_1360_cast_fp16")];
+            tensor<string, []> var_1362_equation_0 = const()[name = tensor<string, []>("op_1362_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1362_cast_fp16 = einsum(equation = var_1362_equation_0, values = (var_1124_cast_fp16, var_1332_cast_fp16))[name = tensor<string, []>("op_1362_cast_fp16")];
+            tensor<string, []> var_1364_equation_0 = const()[name = tensor<string, []>("op_1364_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1364_cast_fp16 = einsum(equation = var_1364_equation_0, values = (var_1136_cast_fp16, var_1333_cast_fp16))[name = tensor<string, []>("op_1364_cast_fp16")];
+            tensor<string, []> var_1366_equation_0 = const()[name = tensor<string, []>("op_1366_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1366_cast_fp16 = einsum(equation = var_1366_equation_0, values = (var_1136_cast_fp16, var_1334_cast_fp16))[name = tensor<string, []>("op_1366_cast_fp16")];
+            tensor<string, []> var_1368_equation_0 = const()[name = tensor<string, []>("op_1368_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1368_cast_fp16 = einsum(equation = var_1368_equation_0, values = (var_1136_cast_fp16, var_1335_cast_fp16))[name = tensor<string, []>("op_1368_cast_fp16")];
+            tensor<string, []> var_1370_equation_0 = const()[name = tensor<string, []>("op_1370_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1370_cast_fp16 = einsum(equation = var_1370_equation_0, values = (var_1148_cast_fp16, var_1336_cast_fp16))[name = tensor<string, []>("op_1370_cast_fp16")];
+            tensor<string, []> var_1372_equation_0 = const()[name = tensor<string, []>("op_1372_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1372_cast_fp16 = einsum(equation = var_1372_equation_0, values = (var_1148_cast_fp16, var_1337_cast_fp16))[name = tensor<string, []>("op_1372_cast_fp16")];
+            tensor<string, []> var_1374_equation_0 = const()[name = tensor<string, []>("op_1374_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1374_cast_fp16 = einsum(equation = var_1374_equation_0, values = (var_1148_cast_fp16, var_1338_cast_fp16))[name = tensor<string, []>("op_1374_cast_fp16")];
+            tensor<string, []> var_1376_equation_0 = const()[name = tensor<string, []>("op_1376_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1376_cast_fp16 = einsum(equation = var_1376_equation_0, values = (var_1160_cast_fp16, var_1339_cast_fp16))[name = tensor<string, []>("op_1376_cast_fp16")];
+            tensor<string, []> var_1378_equation_0 = const()[name = tensor<string, []>("op_1378_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1378_cast_fp16 = einsum(equation = var_1378_equation_0, values = (var_1160_cast_fp16, var_1340_cast_fp16))[name = tensor<string, []>("op_1378_cast_fp16")];
+            tensor<string, []> var_1380_equation_0 = const()[name = tensor<string, []>("op_1380_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1380_cast_fp16 = einsum(equation = var_1380_equation_0, values = (var_1160_cast_fp16, var_1341_cast_fp16))[name = tensor<string, []>("op_1380_cast_fp16")];
+            tensor<string, []> var_1382_equation_0 = const()[name = tensor<string, []>("op_1382_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1382_cast_fp16 = einsum(equation = var_1382_equation_0, values = (var_1172_cast_fp16, var_1342_cast_fp16))[name = tensor<string, []>("op_1382_cast_fp16")];
+            tensor<string, []> var_1384_equation_0 = const()[name = tensor<string, []>("op_1384_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1384_cast_fp16 = einsum(equation = var_1384_equation_0, values = (var_1172_cast_fp16, var_1343_cast_fp16))[name = tensor<string, []>("op_1384_cast_fp16")];
+            tensor<string, []> var_1386_equation_0 = const()[name = tensor<string, []>("op_1386_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1386_cast_fp16 = einsum(equation = var_1386_equation_0, values = (var_1172_cast_fp16, var_1344_cast_fp16))[name = tensor<string, []>("op_1386_cast_fp16")];
+            tensor<string, []> var_1388_equation_0 = const()[name = tensor<string, []>("op_1388_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1388_cast_fp16 = einsum(equation = var_1388_equation_0, values = (var_1184_cast_fp16, var_1345_cast_fp16))[name = tensor<string, []>("op_1388_cast_fp16")];
+            tensor<string, []> var_1390_equation_0 = const()[name = tensor<string, []>("op_1390_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1390_cast_fp16 = einsum(equation = var_1390_equation_0, values = (var_1184_cast_fp16, var_1346_cast_fp16))[name = tensor<string, []>("op_1390_cast_fp16")];
+            tensor<string, []> var_1392_equation_0 = const()[name = tensor<string, []>("op_1392_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1392_cast_fp16 = einsum(equation = var_1392_equation_0, values = (var_1184_cast_fp16, var_1347_cast_fp16))[name = tensor<string, []>("op_1392_cast_fp16")];
+            tensor<string, []> var_1394_equation_0 = const()[name = tensor<string, []>("op_1394_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1394_cast_fp16 = einsum(equation = var_1394_equation_0, values = (var_1196_cast_fp16, var_1348_cast_fp16))[name = tensor<string, []>("op_1394_cast_fp16")];
+            tensor<string, []> var_1396_equation_0 = const()[name = tensor<string, []>("op_1396_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1396_cast_fp16 = einsum(equation = var_1396_equation_0, values = (var_1196_cast_fp16, var_1349_cast_fp16))[name = tensor<string, []>("op_1396_cast_fp16")];
+            tensor<string, []> var_1398_equation_0 = const()[name = tensor<string, []>("op_1398_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1398_cast_fp16 = einsum(equation = var_1398_equation_0, values = (var_1196_cast_fp16, var_1350_cast_fp16))[name = tensor<string, []>("op_1398_cast_fp16")];
+            tensor<bool, []> x_27_interleave_0 = const()[name = tensor<string, []>("x_27_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_27_cast_fp16 = concat(axis = var_779, interleave = x_27_interleave_0, values = (var_1352_cast_fp16, var_1354_cast_fp16, var_1356_cast_fp16, var_1358_cast_fp16, var_1360_cast_fp16, var_1362_cast_fp16, var_1364_cast_fp16, var_1366_cast_fp16, var_1368_cast_fp16, var_1370_cast_fp16, var_1372_cast_fp16, var_1374_cast_fp16, var_1376_cast_fp16, var_1378_cast_fp16, var_1380_cast_fp16, var_1382_cast_fp16, var_1384_cast_fp16, var_1386_cast_fp16, var_1388_cast_fp16, var_1390_cast_fp16, var_1392_cast_fp16, var_1394_cast_fp16, var_1396_cast_fp16, var_1398_cast_fp16))[name = tensor<string, []>("x_27_cast_fp16")];
+            tensor<int32, [4]> var_1403 = const()[name = tensor<string, []>("op_1403"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_13_cast_fp16 = reshape(shape = var_1403, x = x_27_cast_fp16)[name = tensor<string, []>("input_13_cast_fp16")];
+            tensor<int32, [2]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1408 = const()[name = tensor<string, []>("op_1408"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_pad_type_0 = const()[name = tensor<string, []>("attention_output_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_pad_0 = const()[name = tensor<string, []>("attention_output_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(232803776)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_cast_fp16 = conv(dilations = var_1408, groups = var_779, pad = attention_output_pad_0, pad_type = attention_output_pad_type_0, strides = var_1406, weight = blocks_1_attn_proj_weight_to_fp16, x = input_13_cast_fp16)[name = tensor<string, []>("attention_output_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_29_cast_fp16 = add(x = attention_output_cast_fp16, y = x_17_cast_fp16)[name = tensor<string, []>("x_29_cast_fp16")];
+            tensor<bool, []> x_eps_interleave_0 = const()[name = tensor<string, []>("x_eps_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_to_fp16 = const()[name = tensor<string, []>("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678208)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_cast_fp16 = concat(axis = var_779, interleave = x_eps_interleave_0, values = (x_29_cast_fp16, eps_chan_to_fp16))[name = tensor<string, []>("x_eps_cast_fp16")];
+            tensor<int32, [1]> norm_x_axes_0 = const()[name = tensor<string, []>("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_782, x = x_eps_cast_fp16)[name = tensor<string, []>("norm_x_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_19_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_cast_fp16)[name = tensor<string, []>("x_normed_19_cast_fp16")];
+            tensor<fp16, []> var_1434_to_fp16 = const()[name = tensor<string, []>("op_1434_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_1434_to_fp16)[name = tensor<string, []>("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678400)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_15_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = tensor<string, []>("input_15_cast_fp16")];
+            tensor<int32, [2]> var_1445 = const()[name = tensor<string, []>("op_1445"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1447 = const()[name = tensor<string, []>("op_1447"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_17_pad_type_0 = const()[name = tensor<string, []>("input_17_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_17_pad_0 = const()[name = tensor<string, []>("input_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251684608)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_17_cast_fp16 = conv(dilations = var_1447, groups = var_779, pad = input_17_pad_0, pad_type = input_17_pad_type_0, strides = var_1445, weight = blocks_1_mlp_fc_1_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("input_17_cast_fp16")];
+            tensor<int32, [2]> var_1451 = const()[name = tensor<string, []>("op_1451"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1453 = const()[name = tensor<string, []>("op_1453"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_pad_0 = const()[name = tensor<string, []>("x_fc_2_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(302016320)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_cast_fp16 = conv(dilations = var_1453, groups = var_779, pad = x_fc_2_pad_0, pad_type = x_fc_2_pad_type_0, strides = var_1451, weight = blocks_1_mlp_fc_2_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_1456_cast_fp16 = silu(x = input_17_cast_fp16)[name = tensor<string, []>("op_1456_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_cast_fp16 = mul(x = var_1456_cast_fp16, y = x_fc_2_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
+            tensor<int32, [2]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1463_pad_type_0 = const()[name = tensor<string, []>("op_1463_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1463_pad_0 = const()[name = tensor<string, []>("op_1463_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_1_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(352348032)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_1463_cast_fp16 = conv(dilations = var_1461, groups = var_779, pad = var_1463_pad_0, pad_type = var_1463_pad_type_0, strides = var_1459, weight = blocks_1_mlp_proj_weight_to_fp16, x = input_cast_fp16)[name = tensor<string, []>("op_1463_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> new_x = add(x = var_1463_cast_fp16, y = x_29_cast_fp16)[name = tensor<string, []>("op_1464_cast_fp16")];
+        } -> (new_x, new_k_cache_0, new_v_cache_0, new_k_cache_1, new_v_cache_1);
+}
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk5.mlmodelc/weights/weight.bin b/Llama-3.2-3B-Instruct_chunk5.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5c591a4d84403719c2261df43f73889f84514e4b
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk5.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65a458c63e278bc37e0b52a60e57c256764913cdd7e72c783179a10c9df8a554
+size 402679744
diff --git a/Llama-3.2-3B-Instruct_chunk6.mlmodelc/analytics/coremldata.bin b/Llama-3.2-3B-Instruct_chunk6.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a63af39cde8e590e41fffd270ab8aede737490d
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk6.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf21e446e7587de3fd840eae95f3e79729298df568725552f7ef5fd8f954e58c
+size 243
diff --git a/Llama-3.2-3B-Instruct_chunk6.mlmodelc/coremldata.bin b/Llama-3.2-3B-Instruct_chunk6.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ef844658693d8a7fc2951abf2761f8f5f9bc62c3
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk6.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8129d684aa1ea8b76708a186fe44f7ffc4aa08b4854907105fe41c0825e71875
+size 653
diff --git a/Llama-3.2-3B-Instruct_chunk6.mlmodelc/metadata.json b/Llama-3.2-3B-Instruct_chunk6.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..88ef5c7d29a3edb397ad09f9fbc49d61a6194f0d
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk6.mlmodelc/metadata.json
@@ -0,0 +1,178 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "new_x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 7,
+    "mlProgramOperationTypeHistogram" : {
+      "Concat" : 14,
+      "Ios16.mul" : 70,
+      "SliceByIndex" : 88,
+      "Transpose" : 2,
+      "Ios16.einsum" : 96,
+      "Ios16.conv" : 14,
+      "Ios16.add" : 56,
+      "Ios16.realDiv" : 4,
+      "Ios16.softmax" : 48,
+      "Ios16.reduceL2Norm" : 4,
+      "Ios16.reshape" : 14,
+      "Ios16.silu" : 2
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "13.0",
+      "tvOS" : "16.0",
+      "visionOS" : "1.0",
+      "watchOS" : "9.0",
+      "iOS" : "16.0",
+      "macCatalyst" : "16.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.source" : "torch==2.1.0",
+      "com.github.apple.coremltools.version" : "8.0b1"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "cos",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "sin",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 512 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 512, 1, 64]",
+        "name" : "mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "Llama_3_2_3B_Instruct_2024_11_09_16_14_37_chunk6",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk6.mlmodelc/model.mil b/Llama-3.2-3B-Instruct_chunk6.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..78594b4291dc45ae43652f9a31200581b19ad3c6
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk6.mlmodelc/model.mil
@@ -0,0 +1,956 @@
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3304.5.2"}, {"coremlc-version", "3304.6.2"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
+{
+    func main<ios16>(tensor<fp16, [128, 64]> cos, tensor<fp16, [1, 448, 1, 1024]> k_cache_0, tensor<fp16, [1, 448, 1, 1024]> k_cache_1, tensor<fp16, [1, 512, 1, 64]> mask, tensor<fp16, [128, 64]> sin, tensor<fp16, [1, 1024, 1, 448]> v_cache_0, tensor<fp16, [1, 1024, 1, 448]> v_cache_1, tensor<fp16, [1, 3072, 8, 8]> x) [CoreML_InputDefaultValues = dict<tensor<string, []>, tensor<fp32, []>>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}})] {
+            tensor<int32, []> var_13 = const()[name = tensor<string, []>("op_13"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_19 = const()[name = tensor<string, []>("op_19"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_55 = const()[name = tensor<string, []>("op_55"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_1_interleave_0 = const()[name = tensor<string, []>("x_eps_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_1_to_fp16 = const()[name = tensor<string, []>("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_1_cast_fp16 = concat(axis = var_52, interleave = x_eps_1_interleave_0, values = (x, eps_chan_1_to_fp16))[name = tensor<string, []>("x_eps_1_cast_fp16")];
+            tensor<int32, [1]> norm_x_1_axes_0 = const()[name = tensor<string, []>("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_55, x = x_eps_1_cast_fp16)[name = tensor<string, []>("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = tensor<string, []>("x_normed_1_cast_fp16")];
+            tensor<fp16, []> var_79_to_fp16 = const()[name = tensor<string, []>("op_79_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_79_to_fp16)[name = tensor<string, []>("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(256)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = tensor<string, []>("x_5_cast_fp16")];
+            tensor<int32, [4]> var_100 = const()[name = tensor<string, []>("op_100"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_1_cast_fp16 = reshape(shape = var_100, x = x_5_cast_fp16)[name = tensor<string, []>("input_1_cast_fp16")];
+            tensor<int32, [2]> var_103 = const()[name = tensor<string, []>("op_103"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_105 = const()[name = tensor<string, []>("op_105"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_1_pad_type_0 = const()[name = tensor<string, []>("q_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_1_pad_0 = const()[name = tensor<string, []>("q_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6464)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_1_cast_fp16 = conv(dilations = var_105, groups = var_52, pad = q_1_pad_0, pad_type = q_1_pad_type_0, strides = var_103, weight = blocks_0_attn_q_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("q_1_cast_fp16")];
+            tensor<int32, [2]> var_109 = const()[name = tensor<string, []>("op_109"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_111 = const()[name = tensor<string, []>("op_111"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_1_pad_type_0 = const()[name = tensor<string, []>("k_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_1_pad_0 = const()[name = tensor<string, []>("k_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18880896)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_1_cast_fp16 = conv(dilations = var_111, groups = var_52, pad = k_1_pad_0, pad_type = k_1_pad_type_0, strides = var_109, weight = blocks_0_attn_k_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("k_1_cast_fp16")];
+            tensor<int32, [2]> var_115 = const()[name = tensor<string, []>("op_115"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_117 = const()[name = tensor<string, []>("op_117"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_1_pad_type_0 = const()[name = tensor<string, []>("v_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_1_pad_0 = const()[name = tensor<string, []>("v_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25172416)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_1_cast_fp16 = conv(dilations = var_117, groups = var_52, pad = v_1_pad_0, pad_type = v_1_pad_type_0, strides = var_115, weight = blocks_0_attn_v_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("v_1_cast_fp16")];
+            tensor<int32, [4]> var_120 = const()[name = tensor<string, []>("op_120"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_3_cast_fp16 = reshape(shape = var_120, x = q_1_cast_fp16)[name = tensor<string, []>("q_3_cast_fp16")];
+            tensor<int32, [4]> var_122 = const()[name = tensor<string, []>("op_122"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_3_cast_fp16 = reshape(shape = var_122, x = k_1_cast_fp16)[name = tensor<string, []>("k_3_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = tensor<string, []>("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = tensor<string, []>("op_136_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = tensor<string, []>("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = tensor<string, []>("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = tensor<string, []>("op_142_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = tensor<string, []>("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_142_cast_fp16")];
+            tensor<fp16, []> const_10_promoted_to_fp16 = const()[name = tensor<string, []>("const_10_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_10_promoted_to_fp16)[name = tensor<string, []>("op_144_cast_fp16")];
+            tensor<bool, []> rotated_1_interleave_0 = const()[name = tensor<string, []>("rotated_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_1_cast_fp16 = concat(axis = var_17, interleave = rotated_1_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = tensor<string, []>("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_147_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = tensor<string, []>("op_147_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_148_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = tensor<string, []>("op_148_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_1_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = tensor<string, []>("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_161_begin_0 = const()[name = tensor<string, []>("op_161_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_161_end_0 = const()[name = tensor<string, []>("op_161_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_161_end_mask_0 = const()[name = tensor<string, []>("op_161_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_161_cast_fp16 = slice_by_index(begin = var_161_begin_0, end = var_161_end_0, end_mask = var_161_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_161_cast_fp16")];
+            tensor<int32, [4]> var_167_begin_0 = const()[name = tensor<string, []>("op_167_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_167_end_0 = const()[name = tensor<string, []>("op_167_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_167_end_mask_0 = const()[name = tensor<string, []>("op_167_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_167_cast_fp16 = slice_by_index(begin = var_167_begin_0, end = var_167_end_0, end_mask = var_167_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_167_cast_fp16")];
+            tensor<fp16, []> const_12_promoted_to_fp16 = const()[name = tensor<string, []>("const_12_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_169_cast_fp16 = mul(x = var_167_cast_fp16, y = const_12_promoted_to_fp16)[name = tensor<string, []>("op_169_cast_fp16")];
+            tensor<bool, []> rotated_3_interleave_0 = const()[name = tensor<string, []>("rotated_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_3_cast_fp16 = concat(axis = var_17, interleave = rotated_3_interleave_0, values = (var_169_cast_fp16, var_161_cast_fp16))[name = tensor<string, []>("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_172_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = tensor<string, []>("op_172_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_173_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = tensor<string, []>("op_173_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_3_cast_fp16 = add(x = var_172_cast_fp16, y = var_173_cast_fp16)[name = tensor<string, []>("roped_3_cast_fp16")];
+            tensor<int32, [4]> var_176 = const()[name = tensor<string, []>("op_176"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_7_cast_fp16 = reshape(shape = var_176, x = roped_3_cast_fp16)[name = tensor<string, []>("k_7_cast_fp16")];
+            tensor<int32, [4]> var_178 = const()[name = tensor<string, []>("op_178"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_0 = reshape(shape = var_178, x = v_1_cast_fp16)[name = tensor<string, []>("new_v_cache_0_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_11_interleave_0 = const()[name = tensor<string, []>("k_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_0 = transpose(perm = k_9_perm_0, x = k_7_cast_fp16)[name = tensor<string, []>("transpose_1")];
+            tensor<fp16, [1, 512, 1, 1024]> k_11_cast_fp16 = concat(axis = var_19, interleave = k_11_interleave_0, values = (k_cache_0, new_k_cache_0))[name = tensor<string, []>("k_11_cast_fp16")];
+            tensor<bool, []> v_7_interleave_0 = const()[name = tensor<string, []>("v_7_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_7_cast_fp16 = concat(axis = var_13, interleave = v_7_interleave_0, values = (v_cache_0, new_v_cache_0))[name = tensor<string, []>("v_7_cast_fp16")];
+            tensor<int32, [4]> var_186 = const()[name = tensor<string, []>("op_186"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_7_cast_fp16 = reshape(shape = var_186, x = roped_1_cast_fp16)[name = tensor<string, []>("q_7_cast_fp16")];
+            tensor<int32, [4]> var_191_begin_0 = const()[name = tensor<string, []>("op_191_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_191_end_0 = const()[name = tensor<string, []>("op_191_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_191_end_mask_0 = const()[name = tensor<string, []>("op_191_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_191_cast_fp16 = slice_by_index(begin = var_191_begin_0, end = var_191_end_0, end_mask = var_191_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_191_cast_fp16")];
+            tensor<int32, [4]> var_195_begin_0 = const()[name = tensor<string, []>("op_195_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_195_end_0 = const()[name = tensor<string, []>("op_195_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_195_end_mask_0 = const()[name = tensor<string, []>("op_195_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_195_cast_fp16 = slice_by_index(begin = var_195_begin_0, end = var_195_end_0, end_mask = var_195_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_195_cast_fp16")];
+            tensor<int32, [4]> var_199_begin_0 = const()[name = tensor<string, []>("op_199_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_199_end_0 = const()[name = tensor<string, []>("op_199_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_199_end_mask_0 = const()[name = tensor<string, []>("op_199_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_199_cast_fp16 = slice_by_index(begin = var_199_begin_0, end = var_199_end_0, end_mask = var_199_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_199_cast_fp16")];
+            tensor<int32, [4]> var_203_begin_0 = const()[name = tensor<string, []>("op_203_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_203_end_0 = const()[name = tensor<string, []>("op_203_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_203_end_mask_0 = const()[name = tensor<string, []>("op_203_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_203_cast_fp16 = slice_by_index(begin = var_203_begin_0, end = var_203_end_0, end_mask = var_203_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_203_cast_fp16")];
+            tensor<int32, [4]> var_207_begin_0 = const()[name = tensor<string, []>("op_207_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_207_end_0 = const()[name = tensor<string, []>("op_207_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_207_end_mask_0 = const()[name = tensor<string, []>("op_207_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_207_cast_fp16 = slice_by_index(begin = var_207_begin_0, end = var_207_end_0, end_mask = var_207_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_207_cast_fp16")];
+            tensor<int32, [4]> var_211_begin_0 = const()[name = tensor<string, []>("op_211_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_211_end_0 = const()[name = tensor<string, []>("op_211_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_211_end_mask_0 = const()[name = tensor<string, []>("op_211_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_211_cast_fp16 = slice_by_index(begin = var_211_begin_0, end = var_211_end_0, end_mask = var_211_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_211_cast_fp16")];
+            tensor<int32, [4]> var_215_begin_0 = const()[name = tensor<string, []>("op_215_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_215_end_0 = const()[name = tensor<string, []>("op_215_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_215_end_mask_0 = const()[name = tensor<string, []>("op_215_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_215_cast_fp16 = slice_by_index(begin = var_215_begin_0, end = var_215_end_0, end_mask = var_215_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_215_cast_fp16")];
+            tensor<int32, [4]> var_219_begin_0 = const()[name = tensor<string, []>("op_219_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_219_end_0 = const()[name = tensor<string, []>("op_219_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_219_end_mask_0 = const()[name = tensor<string, []>("op_219_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_219_cast_fp16 = slice_by_index(begin = var_219_begin_0, end = var_219_end_0, end_mask = var_219_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_219_cast_fp16")];
+            tensor<int32, [4]> var_223_begin_0 = const()[name = tensor<string, []>("op_223_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_223_end_0 = const()[name = tensor<string, []>("op_223_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_223_end_mask_0 = const()[name = tensor<string, []>("op_223_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_223_cast_fp16 = slice_by_index(begin = var_223_begin_0, end = var_223_end_0, end_mask = var_223_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_223_cast_fp16")];
+            tensor<int32, [4]> var_227_begin_0 = const()[name = tensor<string, []>("op_227_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_227_end_0 = const()[name = tensor<string, []>("op_227_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_227_end_mask_0 = const()[name = tensor<string, []>("op_227_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_227_cast_fp16 = slice_by_index(begin = var_227_begin_0, end = var_227_end_0, end_mask = var_227_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_227_cast_fp16")];
+            tensor<int32, [4]> var_231_begin_0 = const()[name = tensor<string, []>("op_231_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_231_end_0 = const()[name = tensor<string, []>("op_231_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_231_end_mask_0 = const()[name = tensor<string, []>("op_231_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_231_cast_fp16 = slice_by_index(begin = var_231_begin_0, end = var_231_end_0, end_mask = var_231_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_231_cast_fp16")];
+            tensor<int32, [4]> var_235_begin_0 = const()[name = tensor<string, []>("op_235_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_235_end_0 = const()[name = tensor<string, []>("op_235_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_235_end_mask_0 = const()[name = tensor<string, []>("op_235_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_235_cast_fp16 = slice_by_index(begin = var_235_begin_0, end = var_235_end_0, end_mask = var_235_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_235_cast_fp16")];
+            tensor<int32, [4]> var_239_begin_0 = const()[name = tensor<string, []>("op_239_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_239_end_0 = const()[name = tensor<string, []>("op_239_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_239_end_mask_0 = const()[name = tensor<string, []>("op_239_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_239_cast_fp16 = slice_by_index(begin = var_239_begin_0, end = var_239_end_0, end_mask = var_239_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_239_cast_fp16")];
+            tensor<int32, [4]> var_243_begin_0 = const()[name = tensor<string, []>("op_243_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_243_end_0 = const()[name = tensor<string, []>("op_243_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_243_end_mask_0 = const()[name = tensor<string, []>("op_243_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_243_cast_fp16 = slice_by_index(begin = var_243_begin_0, end = var_243_end_0, end_mask = var_243_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_243_cast_fp16")];
+            tensor<int32, [4]> var_247_begin_0 = const()[name = tensor<string, []>("op_247_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_247_end_0 = const()[name = tensor<string, []>("op_247_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_247_end_mask_0 = const()[name = tensor<string, []>("op_247_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_247_cast_fp16 = slice_by_index(begin = var_247_begin_0, end = var_247_end_0, end_mask = var_247_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_247_cast_fp16")];
+            tensor<int32, [4]> var_251_begin_0 = const()[name = tensor<string, []>("op_251_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_251_end_0 = const()[name = tensor<string, []>("op_251_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_251_end_mask_0 = const()[name = tensor<string, []>("op_251_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_251_cast_fp16 = slice_by_index(begin = var_251_begin_0, end = var_251_end_0, end_mask = var_251_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_251_cast_fp16")];
+            tensor<int32, [4]> var_255_begin_0 = const()[name = tensor<string, []>("op_255_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_255_end_0 = const()[name = tensor<string, []>("op_255_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_255_end_mask_0 = const()[name = tensor<string, []>("op_255_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_255_cast_fp16 = slice_by_index(begin = var_255_begin_0, end = var_255_end_0, end_mask = var_255_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_255_cast_fp16")];
+            tensor<int32, [4]> var_259_begin_0 = const()[name = tensor<string, []>("op_259_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_259_end_0 = const()[name = tensor<string, []>("op_259_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_259_end_mask_0 = const()[name = tensor<string, []>("op_259_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_259_cast_fp16 = slice_by_index(begin = var_259_begin_0, end = var_259_end_0, end_mask = var_259_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_259_cast_fp16")];
+            tensor<int32, [4]> var_263_begin_0 = const()[name = tensor<string, []>("op_263_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_263_end_0 = const()[name = tensor<string, []>("op_263_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_263_end_mask_0 = const()[name = tensor<string, []>("op_263_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_263_cast_fp16 = slice_by_index(begin = var_263_begin_0, end = var_263_end_0, end_mask = var_263_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_263_cast_fp16")];
+            tensor<int32, [4]> var_267_begin_0 = const()[name = tensor<string, []>("op_267_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_267_end_0 = const()[name = tensor<string, []>("op_267_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_267_end_mask_0 = const()[name = tensor<string, []>("op_267_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_267_cast_fp16 = slice_by_index(begin = var_267_begin_0, end = var_267_end_0, end_mask = var_267_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_267_cast_fp16")];
+            tensor<int32, [4]> var_271_begin_0 = const()[name = tensor<string, []>("op_271_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_271_end_0 = const()[name = tensor<string, []>("op_271_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_271_end_mask_0 = const()[name = tensor<string, []>("op_271_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_271_cast_fp16 = slice_by_index(begin = var_271_begin_0, end = var_271_end_0, end_mask = var_271_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_271_cast_fp16")];
+            tensor<int32, [4]> var_275_begin_0 = const()[name = tensor<string, []>("op_275_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_275_end_0 = const()[name = tensor<string, []>("op_275_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_275_end_mask_0 = const()[name = tensor<string, []>("op_275_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_275_cast_fp16 = slice_by_index(begin = var_275_begin_0, end = var_275_end_0, end_mask = var_275_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_275_cast_fp16")];
+            tensor<int32, [4]> var_279_begin_0 = const()[name = tensor<string, []>("op_279_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_279_end_0 = const()[name = tensor<string, []>("op_279_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_279_end_mask_0 = const()[name = tensor<string, []>("op_279_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_279_cast_fp16 = slice_by_index(begin = var_279_begin_0, end = var_279_end_0, end_mask = var_279_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_279_cast_fp16")];
+            tensor<int32, [4]> var_283_begin_0 = const()[name = tensor<string, []>("op_283_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_283_end_0 = const()[name = tensor<string, []>("op_283_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_283_end_mask_0 = const()[name = tensor<string, []>("op_283_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_283_cast_fp16 = slice_by_index(begin = var_283_begin_0, end = var_283_end_0, end_mask = var_283_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_283_cast_fp16")];
+            tensor<int32, [4]> var_289_begin_0 = const()[name = tensor<string, []>("op_289_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_289_end_0 = const()[name = tensor<string, []>("op_289_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_289_end_mask_0 = const()[name = tensor<string, []>("op_289_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_289_cast_fp16 = slice_by_index(begin = var_289_begin_0, end = var_289_end_0, end_mask = var_289_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_289_cast_fp16")];
+            tensor<int32, [4]> var_301_begin_0 = const()[name = tensor<string, []>("op_301_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_301_end_0 = const()[name = tensor<string, []>("op_301_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_301_end_mask_0 = const()[name = tensor<string, []>("op_301_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_301_cast_fp16 = slice_by_index(begin = var_301_begin_0, end = var_301_end_0, end_mask = var_301_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_301_cast_fp16")];
+            tensor<int32, [4]> var_313_begin_0 = const()[name = tensor<string, []>("op_313_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_313_end_0 = const()[name = tensor<string, []>("op_313_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_313_end_mask_0 = const()[name = tensor<string, []>("op_313_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_313_cast_fp16 = slice_by_index(begin = var_313_begin_0, end = var_313_end_0, end_mask = var_313_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_313_cast_fp16")];
+            tensor<int32, [4]> var_325_begin_0 = const()[name = tensor<string, []>("op_325_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_325_end_0 = const()[name = tensor<string, []>("op_325_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_325_end_mask_0 = const()[name = tensor<string, []>("op_325_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_325_cast_fp16 = slice_by_index(begin = var_325_begin_0, end = var_325_end_0, end_mask = var_325_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_325_cast_fp16")];
+            tensor<int32, [4]> var_337_begin_0 = const()[name = tensor<string, []>("op_337_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_337_end_0 = const()[name = tensor<string, []>("op_337_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_337_end_mask_0 = const()[name = tensor<string, []>("op_337_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_337_cast_fp16 = slice_by_index(begin = var_337_begin_0, end = var_337_end_0, end_mask = var_337_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_337_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = tensor<string, []>("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = tensor<string, []>("op_349_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = tensor<string, []>("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_349_cast_fp16")];
+            tensor<int32, [4]> var_361_begin_0 = const()[name = tensor<string, []>("op_361_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_361_end_0 = const()[name = tensor<string, []>("op_361_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_361_end_mask_0 = const()[name = tensor<string, []>("op_361_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_361_cast_fp16 = slice_by_index(begin = var_361_begin_0, end = var_361_end_0, end_mask = var_361_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_361_cast_fp16")];
+            tensor<int32, [4]> var_373_begin_0 = const()[name = tensor<string, []>("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_373_end_0 = const()[name = tensor<string, []>("op_373_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_373_end_mask_0 = const()[name = tensor<string, []>("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_373_cast_fp16")];
+            tensor<int32, [4]> var_383_begin_0 = const()[name = tensor<string, []>("op_383_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_383_end_0 = const()[name = tensor<string, []>("op_383_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_383_end_mask_0 = const()[name = tensor<string, []>("op_383_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_383_cast_fp16 = slice_by_index(begin = var_383_begin_0, end = var_383_end_0, end_mask = var_383_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_383_cast_fp16")];
+            tensor<int32, [4]> var_395_begin_0 = const()[name = tensor<string, []>("op_395_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_395_end_0 = const()[name = tensor<string, []>("op_395_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_395_end_mask_0 = const()[name = tensor<string, []>("op_395_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_395_cast_fp16 = slice_by_index(begin = var_395_begin_0, end = var_395_end_0, end_mask = var_395_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_395_cast_fp16")];
+            tensor<int32, [4]> var_407_begin_0 = const()[name = tensor<string, []>("op_407_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_407_end_0 = const()[name = tensor<string, []>("op_407_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_407_end_mask_0 = const()[name = tensor<string, []>("op_407_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_407_cast_fp16 = slice_by_index(begin = var_407_begin_0, end = var_407_end_0, end_mask = var_407_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_407_cast_fp16")];
+            tensor<int32, [4]> var_419_begin_0 = const()[name = tensor<string, []>("op_419_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_419_end_0 = const()[name = tensor<string, []>("op_419_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_419_end_mask_0 = const()[name = tensor<string, []>("op_419_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_419_cast_fp16 = slice_by_index(begin = var_419_begin_0, end = var_419_end_0, end_mask = var_419_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_419_cast_fp16")];
+            tensor<int32, [4]> var_431_begin_0 = const()[name = tensor<string, []>("op_431_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_431_end_0 = const()[name = tensor<string, []>("op_431_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_431_end_mask_0 = const()[name = tensor<string, []>("op_431_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_431_cast_fp16 = slice_by_index(begin = var_431_begin_0, end = var_431_end_0, end_mask = var_431_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_431_cast_fp16")];
+            tensor<int32, [4]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_443_cast_fp16 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_443_cast_fp16")];
+            tensor<int32, [4]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_455_cast_fp16 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_455_cast_fp16")];
+            tensor<int32, [4]> var_467_begin_0 = const()[name = tensor<string, []>("op_467_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_467_end_0 = const()[name = tensor<string, []>("op_467_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_467_end_mask_0 = const()[name = tensor<string, []>("op_467_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_467_cast_fp16 = slice_by_index(begin = var_467_begin_0, end = var_467_end_0, end_mask = var_467_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_467_cast_fp16")];
+            tensor<string, []> var_479_equation_0 = const()[name = tensor<string, []>("op_479_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_479_cast_fp16 = einsum(equation = var_479_equation_0, values = (var_289_cast_fp16, var_191_cast_fp16))[name = tensor<string, []>("op_479_cast_fp16")];
+            tensor<fp16, []> var_480_to_fp16 = const()[name = tensor<string, []>("op_480_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_481_cast_fp16 = mul(x = var_479_cast_fp16, y = var_480_to_fp16)[name = tensor<string, []>("op_481_cast_fp16")];
+            tensor<string, []> var_483_equation_0 = const()[name = tensor<string, []>("op_483_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_483_cast_fp16 = einsum(equation = var_483_equation_0, values = (var_289_cast_fp16, var_195_cast_fp16))[name = tensor<string, []>("op_483_cast_fp16")];
+            tensor<fp16, []> var_484_to_fp16 = const()[name = tensor<string, []>("op_484_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_485_cast_fp16 = mul(x = var_483_cast_fp16, y = var_484_to_fp16)[name = tensor<string, []>("op_485_cast_fp16")];
+            tensor<string, []> var_487_equation_0 = const()[name = tensor<string, []>("op_487_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_487_cast_fp16 = einsum(equation = var_487_equation_0, values = (var_289_cast_fp16, var_199_cast_fp16))[name = tensor<string, []>("op_487_cast_fp16")];
+            tensor<fp16, []> var_488_to_fp16 = const()[name = tensor<string, []>("op_488_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_489_cast_fp16 = mul(x = var_487_cast_fp16, y = var_488_to_fp16)[name = tensor<string, []>("op_489_cast_fp16")];
+            tensor<string, []> var_491_equation_0 = const()[name = tensor<string, []>("op_491_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_491_cast_fp16 = einsum(equation = var_491_equation_0, values = (var_301_cast_fp16, var_203_cast_fp16))[name = tensor<string, []>("op_491_cast_fp16")];
+            tensor<fp16, []> var_492_to_fp16 = const()[name = tensor<string, []>("op_492_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_493_cast_fp16 = mul(x = var_491_cast_fp16, y = var_492_to_fp16)[name = tensor<string, []>("op_493_cast_fp16")];
+            tensor<string, []> var_495_equation_0 = const()[name = tensor<string, []>("op_495_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_495_cast_fp16 = einsum(equation = var_495_equation_0, values = (var_301_cast_fp16, var_207_cast_fp16))[name = tensor<string, []>("op_495_cast_fp16")];
+            tensor<fp16, []> var_496_to_fp16 = const()[name = tensor<string, []>("op_496_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_497_cast_fp16 = mul(x = var_495_cast_fp16, y = var_496_to_fp16)[name = tensor<string, []>("op_497_cast_fp16")];
+            tensor<string, []> var_499_equation_0 = const()[name = tensor<string, []>("op_499_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_499_cast_fp16 = einsum(equation = var_499_equation_0, values = (var_301_cast_fp16, var_211_cast_fp16))[name = tensor<string, []>("op_499_cast_fp16")];
+            tensor<fp16, []> var_500_to_fp16 = const()[name = tensor<string, []>("op_500_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_501_cast_fp16 = mul(x = var_499_cast_fp16, y = var_500_to_fp16)[name = tensor<string, []>("op_501_cast_fp16")];
+            tensor<string, []> var_503_equation_0 = const()[name = tensor<string, []>("op_503_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_503_cast_fp16 = einsum(equation = var_503_equation_0, values = (var_313_cast_fp16, var_215_cast_fp16))[name = tensor<string, []>("op_503_cast_fp16")];
+            tensor<fp16, []> var_504_to_fp16 = const()[name = tensor<string, []>("op_504_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_505_cast_fp16 = mul(x = var_503_cast_fp16, y = var_504_to_fp16)[name = tensor<string, []>("op_505_cast_fp16")];
+            tensor<string, []> var_507_equation_0 = const()[name = tensor<string, []>("op_507_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_507_cast_fp16 = einsum(equation = var_507_equation_0, values = (var_313_cast_fp16, var_219_cast_fp16))[name = tensor<string, []>("op_507_cast_fp16")];
+            tensor<fp16, []> var_508_to_fp16 = const()[name = tensor<string, []>("op_508_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_509_cast_fp16 = mul(x = var_507_cast_fp16, y = var_508_to_fp16)[name = tensor<string, []>("op_509_cast_fp16")];
+            tensor<string, []> var_511_equation_0 = const()[name = tensor<string, []>("op_511_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_511_cast_fp16 = einsum(equation = var_511_equation_0, values = (var_313_cast_fp16, var_223_cast_fp16))[name = tensor<string, []>("op_511_cast_fp16")];
+            tensor<fp16, []> var_512_to_fp16 = const()[name = tensor<string, []>("op_512_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_513_cast_fp16 = mul(x = var_511_cast_fp16, y = var_512_to_fp16)[name = tensor<string, []>("op_513_cast_fp16")];
+            tensor<string, []> var_515_equation_0 = const()[name = tensor<string, []>("op_515_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_515_cast_fp16 = einsum(equation = var_515_equation_0, values = (var_325_cast_fp16, var_227_cast_fp16))[name = tensor<string, []>("op_515_cast_fp16")];
+            tensor<fp16, []> var_516_to_fp16 = const()[name = tensor<string, []>("op_516_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_517_cast_fp16 = mul(x = var_515_cast_fp16, y = var_516_to_fp16)[name = tensor<string, []>("op_517_cast_fp16")];
+            tensor<string, []> var_519_equation_0 = const()[name = tensor<string, []>("op_519_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_519_cast_fp16 = einsum(equation = var_519_equation_0, values = (var_325_cast_fp16, var_231_cast_fp16))[name = tensor<string, []>("op_519_cast_fp16")];
+            tensor<fp16, []> var_520_to_fp16 = const()[name = tensor<string, []>("op_520_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_521_cast_fp16 = mul(x = var_519_cast_fp16, y = var_520_to_fp16)[name = tensor<string, []>("op_521_cast_fp16")];
+            tensor<string, []> var_523_equation_0 = const()[name = tensor<string, []>("op_523_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_523_cast_fp16 = einsum(equation = var_523_equation_0, values = (var_325_cast_fp16, var_235_cast_fp16))[name = tensor<string, []>("op_523_cast_fp16")];
+            tensor<fp16, []> var_524_to_fp16 = const()[name = tensor<string, []>("op_524_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_525_cast_fp16 = mul(x = var_523_cast_fp16, y = var_524_to_fp16)[name = tensor<string, []>("op_525_cast_fp16")];
+            tensor<string, []> var_527_equation_0 = const()[name = tensor<string, []>("op_527_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_527_cast_fp16 = einsum(equation = var_527_equation_0, values = (var_337_cast_fp16, var_239_cast_fp16))[name = tensor<string, []>("op_527_cast_fp16")];
+            tensor<fp16, []> var_528_to_fp16 = const()[name = tensor<string, []>("op_528_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_529_cast_fp16 = mul(x = var_527_cast_fp16, y = var_528_to_fp16)[name = tensor<string, []>("op_529_cast_fp16")];
+            tensor<string, []> var_531_equation_0 = const()[name = tensor<string, []>("op_531_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_531_cast_fp16 = einsum(equation = var_531_equation_0, values = (var_337_cast_fp16, var_243_cast_fp16))[name = tensor<string, []>("op_531_cast_fp16")];
+            tensor<fp16, []> var_532_to_fp16 = const()[name = tensor<string, []>("op_532_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_533_cast_fp16 = mul(x = var_531_cast_fp16, y = var_532_to_fp16)[name = tensor<string, []>("op_533_cast_fp16")];
+            tensor<string, []> var_535_equation_0 = const()[name = tensor<string, []>("op_535_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_535_cast_fp16 = einsum(equation = var_535_equation_0, values = (var_337_cast_fp16, var_247_cast_fp16))[name = tensor<string, []>("op_535_cast_fp16")];
+            tensor<fp16, []> var_536_to_fp16 = const()[name = tensor<string, []>("op_536_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_537_cast_fp16 = mul(x = var_535_cast_fp16, y = var_536_to_fp16)[name = tensor<string, []>("op_537_cast_fp16")];
+            tensor<string, []> var_539_equation_0 = const()[name = tensor<string, []>("op_539_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_539_cast_fp16 = einsum(equation = var_539_equation_0, values = (var_349_cast_fp16, var_251_cast_fp16))[name = tensor<string, []>("op_539_cast_fp16")];
+            tensor<fp16, []> var_540_to_fp16 = const()[name = tensor<string, []>("op_540_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_541_cast_fp16 = mul(x = var_539_cast_fp16, y = var_540_to_fp16)[name = tensor<string, []>("op_541_cast_fp16")];
+            tensor<string, []> var_543_equation_0 = const()[name = tensor<string, []>("op_543_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_543_cast_fp16 = einsum(equation = var_543_equation_0, values = (var_349_cast_fp16, var_255_cast_fp16))[name = tensor<string, []>("op_543_cast_fp16")];
+            tensor<fp16, []> var_544_to_fp16 = const()[name = tensor<string, []>("op_544_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_545_cast_fp16 = mul(x = var_543_cast_fp16, y = var_544_to_fp16)[name = tensor<string, []>("op_545_cast_fp16")];
+            tensor<string, []> var_547_equation_0 = const()[name = tensor<string, []>("op_547_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_547_cast_fp16 = einsum(equation = var_547_equation_0, values = (var_349_cast_fp16, var_259_cast_fp16))[name = tensor<string, []>("op_547_cast_fp16")];
+            tensor<fp16, []> var_548_to_fp16 = const()[name = tensor<string, []>("op_548_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_549_cast_fp16 = mul(x = var_547_cast_fp16, y = var_548_to_fp16)[name = tensor<string, []>("op_549_cast_fp16")];
+            tensor<string, []> var_551_equation_0 = const()[name = tensor<string, []>("op_551_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_551_cast_fp16 = einsum(equation = var_551_equation_0, values = (var_361_cast_fp16, var_263_cast_fp16))[name = tensor<string, []>("op_551_cast_fp16")];
+            tensor<fp16, []> var_552_to_fp16 = const()[name = tensor<string, []>("op_552_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_553_cast_fp16 = mul(x = var_551_cast_fp16, y = var_552_to_fp16)[name = tensor<string, []>("op_553_cast_fp16")];
+            tensor<string, []> var_555_equation_0 = const()[name = tensor<string, []>("op_555_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_555_cast_fp16 = einsum(equation = var_555_equation_0, values = (var_361_cast_fp16, var_267_cast_fp16))[name = tensor<string, []>("op_555_cast_fp16")];
+            tensor<fp16, []> var_556_to_fp16 = const()[name = tensor<string, []>("op_556_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_557_cast_fp16 = mul(x = var_555_cast_fp16, y = var_556_to_fp16)[name = tensor<string, []>("op_557_cast_fp16")];
+            tensor<string, []> var_559_equation_0 = const()[name = tensor<string, []>("op_559_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_559_cast_fp16 = einsum(equation = var_559_equation_0, values = (var_361_cast_fp16, var_271_cast_fp16))[name = tensor<string, []>("op_559_cast_fp16")];
+            tensor<fp16, []> var_560_to_fp16 = const()[name = tensor<string, []>("op_560_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_561_cast_fp16 = mul(x = var_559_cast_fp16, y = var_560_to_fp16)[name = tensor<string, []>("op_561_cast_fp16")];
+            tensor<string, []> var_563_equation_0 = const()[name = tensor<string, []>("op_563_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_563_cast_fp16 = einsum(equation = var_563_equation_0, values = (var_373_cast_fp16, var_275_cast_fp16))[name = tensor<string, []>("op_563_cast_fp16")];
+            tensor<fp16, []> var_564_to_fp16 = const()[name = tensor<string, []>("op_564_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_565_cast_fp16 = mul(x = var_563_cast_fp16, y = var_564_to_fp16)[name = tensor<string, []>("op_565_cast_fp16")];
+            tensor<string, []> var_567_equation_0 = const()[name = tensor<string, []>("op_567_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_567_cast_fp16 = einsum(equation = var_567_equation_0, values = (var_373_cast_fp16, var_279_cast_fp16))[name = tensor<string, []>("op_567_cast_fp16")];
+            tensor<fp16, []> var_568_to_fp16 = const()[name = tensor<string, []>("op_568_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_569_cast_fp16 = mul(x = var_567_cast_fp16, y = var_568_to_fp16)[name = tensor<string, []>("op_569_cast_fp16")];
+            tensor<string, []> var_571_equation_0 = const()[name = tensor<string, []>("op_571_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_571_cast_fp16 = einsum(equation = var_571_equation_0, values = (var_373_cast_fp16, var_283_cast_fp16))[name = tensor<string, []>("op_571_cast_fp16")];
+            tensor<fp16, []> var_572_to_fp16 = const()[name = tensor<string, []>("op_572_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_573_cast_fp16 = mul(x = var_571_cast_fp16, y = var_572_to_fp16)[name = tensor<string, []>("op_573_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_1_cast_fp16 = add(x = var_481_cast_fp16, y = mask)[name = tensor<string, []>("aw_1_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_3_cast_fp16 = add(x = var_485_cast_fp16, y = mask)[name = tensor<string, []>("aw_3_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_5_cast_fp16 = add(x = var_489_cast_fp16, y = mask)[name = tensor<string, []>("aw_5_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_7_cast_fp16 = add(x = var_493_cast_fp16, y = mask)[name = tensor<string, []>("aw_7_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_9_cast_fp16 = add(x = var_497_cast_fp16, y = mask)[name = tensor<string, []>("aw_9_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_11_cast_fp16 = add(x = var_501_cast_fp16, y = mask)[name = tensor<string, []>("aw_11_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_13_cast_fp16 = add(x = var_505_cast_fp16, y = mask)[name = tensor<string, []>("aw_13_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_15_cast_fp16 = add(x = var_509_cast_fp16, y = mask)[name = tensor<string, []>("aw_15_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_17_cast_fp16 = add(x = var_513_cast_fp16, y = mask)[name = tensor<string, []>("aw_17_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_19_cast_fp16 = add(x = var_517_cast_fp16, y = mask)[name = tensor<string, []>("aw_19_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_21_cast_fp16 = add(x = var_521_cast_fp16, y = mask)[name = tensor<string, []>("aw_21_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_23_cast_fp16 = add(x = var_525_cast_fp16, y = mask)[name = tensor<string, []>("aw_23_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_25_cast_fp16 = add(x = var_529_cast_fp16, y = mask)[name = tensor<string, []>("aw_25_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_27_cast_fp16 = add(x = var_533_cast_fp16, y = mask)[name = tensor<string, []>("aw_27_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_29_cast_fp16 = add(x = var_537_cast_fp16, y = mask)[name = tensor<string, []>("aw_29_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_31_cast_fp16 = add(x = var_541_cast_fp16, y = mask)[name = tensor<string, []>("aw_31_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_33_cast_fp16 = add(x = var_545_cast_fp16, y = mask)[name = tensor<string, []>("aw_33_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_35_cast_fp16 = add(x = var_549_cast_fp16, y = mask)[name = tensor<string, []>("aw_35_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_37_cast_fp16 = add(x = var_553_cast_fp16, y = mask)[name = tensor<string, []>("aw_37_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_39_cast_fp16 = add(x = var_557_cast_fp16, y = mask)[name = tensor<string, []>("aw_39_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_41_cast_fp16 = add(x = var_561_cast_fp16, y = mask)[name = tensor<string, []>("aw_41_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_43_cast_fp16 = add(x = var_565_cast_fp16, y = mask)[name = tensor<string, []>("aw_43_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_45_cast_fp16 = add(x = var_569_cast_fp16, y = mask)[name = tensor<string, []>("aw_45_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_47_cast_fp16 = add(x = var_573_cast_fp16, y = mask)[name = tensor<string, []>("aw_47_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_598_cast_fp16 = softmax(axis = var_52, x = aw_1_cast_fp16)[name = tensor<string, []>("op_598_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_599_cast_fp16 = softmax(axis = var_52, x = aw_3_cast_fp16)[name = tensor<string, []>("op_599_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_600_cast_fp16 = softmax(axis = var_52, x = aw_5_cast_fp16)[name = tensor<string, []>("op_600_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_601_cast_fp16 = softmax(axis = var_52, x = aw_7_cast_fp16)[name = tensor<string, []>("op_601_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_602_cast_fp16 = softmax(axis = var_52, x = aw_9_cast_fp16)[name = tensor<string, []>("op_602_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_603_cast_fp16 = softmax(axis = var_52, x = aw_11_cast_fp16)[name = tensor<string, []>("op_603_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_604_cast_fp16 = softmax(axis = var_52, x = aw_13_cast_fp16)[name = tensor<string, []>("op_604_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_605_cast_fp16 = softmax(axis = var_52, x = aw_15_cast_fp16)[name = tensor<string, []>("op_605_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_606_cast_fp16 = softmax(axis = var_52, x = aw_17_cast_fp16)[name = tensor<string, []>("op_606_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_607_cast_fp16 = softmax(axis = var_52, x = aw_19_cast_fp16)[name = tensor<string, []>("op_607_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_608_cast_fp16 = softmax(axis = var_52, x = aw_21_cast_fp16)[name = tensor<string, []>("op_608_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_609_cast_fp16 = softmax(axis = var_52, x = aw_23_cast_fp16)[name = tensor<string, []>("op_609_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_610_cast_fp16 = softmax(axis = var_52, x = aw_25_cast_fp16)[name = tensor<string, []>("op_610_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_611_cast_fp16 = softmax(axis = var_52, x = aw_27_cast_fp16)[name = tensor<string, []>("op_611_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_612_cast_fp16 = softmax(axis = var_52, x = aw_29_cast_fp16)[name = tensor<string, []>("op_612_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_613_cast_fp16 = softmax(axis = var_52, x = aw_31_cast_fp16)[name = tensor<string, []>("op_613_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_614_cast_fp16 = softmax(axis = var_52, x = aw_33_cast_fp16)[name = tensor<string, []>("op_614_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_615_cast_fp16 = softmax(axis = var_52, x = aw_35_cast_fp16)[name = tensor<string, []>("op_615_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_616_cast_fp16 = softmax(axis = var_52, x = aw_37_cast_fp16)[name = tensor<string, []>("op_616_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_617_cast_fp16 = softmax(axis = var_52, x = aw_39_cast_fp16)[name = tensor<string, []>("op_617_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_618_cast_fp16 = softmax(axis = var_52, x = aw_41_cast_fp16)[name = tensor<string, []>("op_618_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_619_cast_fp16 = softmax(axis = var_52, x = aw_43_cast_fp16)[name = tensor<string, []>("op_619_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_620_cast_fp16 = softmax(axis = var_52, x = aw_45_cast_fp16)[name = tensor<string, []>("op_620_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_621_cast_fp16 = softmax(axis = var_52, x = aw_47_cast_fp16)[name = tensor<string, []>("op_621_cast_fp16")];
+            tensor<string, []> var_623_equation_0 = const()[name = tensor<string, []>("op_623_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_623_cast_fp16 = einsum(equation = var_623_equation_0, values = (var_383_cast_fp16, var_598_cast_fp16))[name = tensor<string, []>("op_623_cast_fp16")];
+            tensor<string, []> var_625_equation_0 = const()[name = tensor<string, []>("op_625_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_625_cast_fp16 = einsum(equation = var_625_equation_0, values = (var_383_cast_fp16, var_599_cast_fp16))[name = tensor<string, []>("op_625_cast_fp16")];
+            tensor<string, []> var_627_equation_0 = const()[name = tensor<string, []>("op_627_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_627_cast_fp16 = einsum(equation = var_627_equation_0, values = (var_383_cast_fp16, var_600_cast_fp16))[name = tensor<string, []>("op_627_cast_fp16")];
+            tensor<string, []> var_629_equation_0 = const()[name = tensor<string, []>("op_629_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_629_cast_fp16 = einsum(equation = var_629_equation_0, values = (var_395_cast_fp16, var_601_cast_fp16))[name = tensor<string, []>("op_629_cast_fp16")];
+            tensor<string, []> var_631_equation_0 = const()[name = tensor<string, []>("op_631_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_631_cast_fp16 = einsum(equation = var_631_equation_0, values = (var_395_cast_fp16, var_602_cast_fp16))[name = tensor<string, []>("op_631_cast_fp16")];
+            tensor<string, []> var_633_equation_0 = const()[name = tensor<string, []>("op_633_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_633_cast_fp16 = einsum(equation = var_633_equation_0, values = (var_395_cast_fp16, var_603_cast_fp16))[name = tensor<string, []>("op_633_cast_fp16")];
+            tensor<string, []> var_635_equation_0 = const()[name = tensor<string, []>("op_635_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_635_cast_fp16 = einsum(equation = var_635_equation_0, values = (var_407_cast_fp16, var_604_cast_fp16))[name = tensor<string, []>("op_635_cast_fp16")];
+            tensor<string, []> var_637_equation_0 = const()[name = tensor<string, []>("op_637_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_637_cast_fp16 = einsum(equation = var_637_equation_0, values = (var_407_cast_fp16, var_605_cast_fp16))[name = tensor<string, []>("op_637_cast_fp16")];
+            tensor<string, []> var_639_equation_0 = const()[name = tensor<string, []>("op_639_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_639_cast_fp16 = einsum(equation = var_639_equation_0, values = (var_407_cast_fp16, var_606_cast_fp16))[name = tensor<string, []>("op_639_cast_fp16")];
+            tensor<string, []> var_641_equation_0 = const()[name = tensor<string, []>("op_641_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_641_cast_fp16 = einsum(equation = var_641_equation_0, values = (var_419_cast_fp16, var_607_cast_fp16))[name = tensor<string, []>("op_641_cast_fp16")];
+            tensor<string, []> var_643_equation_0 = const()[name = tensor<string, []>("op_643_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_643_cast_fp16 = einsum(equation = var_643_equation_0, values = (var_419_cast_fp16, var_608_cast_fp16))[name = tensor<string, []>("op_643_cast_fp16")];
+            tensor<string, []> var_645_equation_0 = const()[name = tensor<string, []>("op_645_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_645_cast_fp16 = einsum(equation = var_645_equation_0, values = (var_419_cast_fp16, var_609_cast_fp16))[name = tensor<string, []>("op_645_cast_fp16")];
+            tensor<string, []> var_647_equation_0 = const()[name = tensor<string, []>("op_647_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_647_cast_fp16 = einsum(equation = var_647_equation_0, values = (var_431_cast_fp16, var_610_cast_fp16))[name = tensor<string, []>("op_647_cast_fp16")];
+            tensor<string, []> var_649_equation_0 = const()[name = tensor<string, []>("op_649_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_649_cast_fp16 = einsum(equation = var_649_equation_0, values = (var_431_cast_fp16, var_611_cast_fp16))[name = tensor<string, []>("op_649_cast_fp16")];
+            tensor<string, []> var_651_equation_0 = const()[name = tensor<string, []>("op_651_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_651_cast_fp16 = einsum(equation = var_651_equation_0, values = (var_431_cast_fp16, var_612_cast_fp16))[name = tensor<string, []>("op_651_cast_fp16")];
+            tensor<string, []> var_653_equation_0 = const()[name = tensor<string, []>("op_653_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_653_cast_fp16 = einsum(equation = var_653_equation_0, values = (var_443_cast_fp16, var_613_cast_fp16))[name = tensor<string, []>("op_653_cast_fp16")];
+            tensor<string, []> var_655_equation_0 = const()[name = tensor<string, []>("op_655_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_655_cast_fp16 = einsum(equation = var_655_equation_0, values = (var_443_cast_fp16, var_614_cast_fp16))[name = tensor<string, []>("op_655_cast_fp16")];
+            tensor<string, []> var_657_equation_0 = const()[name = tensor<string, []>("op_657_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_657_cast_fp16 = einsum(equation = var_657_equation_0, values = (var_443_cast_fp16, var_615_cast_fp16))[name = tensor<string, []>("op_657_cast_fp16")];
+            tensor<string, []> var_659_equation_0 = const()[name = tensor<string, []>("op_659_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_659_cast_fp16 = einsum(equation = var_659_equation_0, values = (var_455_cast_fp16, var_616_cast_fp16))[name = tensor<string, []>("op_659_cast_fp16")];
+            tensor<string, []> var_661_equation_0 = const()[name = tensor<string, []>("op_661_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_661_cast_fp16 = einsum(equation = var_661_equation_0, values = (var_455_cast_fp16, var_617_cast_fp16))[name = tensor<string, []>("op_661_cast_fp16")];
+            tensor<string, []> var_663_equation_0 = const()[name = tensor<string, []>("op_663_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_663_cast_fp16 = einsum(equation = var_663_equation_0, values = (var_455_cast_fp16, var_618_cast_fp16))[name = tensor<string, []>("op_663_cast_fp16")];
+            tensor<string, []> var_665_equation_0 = const()[name = tensor<string, []>("op_665_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_665_cast_fp16 = einsum(equation = var_665_equation_0, values = (var_467_cast_fp16, var_619_cast_fp16))[name = tensor<string, []>("op_665_cast_fp16")];
+            tensor<string, []> var_667_equation_0 = const()[name = tensor<string, []>("op_667_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_667_cast_fp16 = einsum(equation = var_667_equation_0, values = (var_467_cast_fp16, var_620_cast_fp16))[name = tensor<string, []>("op_667_cast_fp16")];
+            tensor<string, []> var_669_equation_0 = const()[name = tensor<string, []>("op_669_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_669_cast_fp16 = einsum(equation = var_669_equation_0, values = (var_467_cast_fp16, var_621_cast_fp16))[name = tensor<string, []>("op_669_cast_fp16")];
+            tensor<bool, []> x_11_interleave_0 = const()[name = tensor<string, []>("x_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_11_cast_fp16 = concat(axis = var_52, interleave = x_11_interleave_0, values = (var_623_cast_fp16, var_625_cast_fp16, var_627_cast_fp16, var_629_cast_fp16, var_631_cast_fp16, var_633_cast_fp16, var_635_cast_fp16, var_637_cast_fp16, var_639_cast_fp16, var_641_cast_fp16, var_643_cast_fp16, var_645_cast_fp16, var_647_cast_fp16, var_649_cast_fp16, var_651_cast_fp16, var_653_cast_fp16, var_655_cast_fp16, var_657_cast_fp16, var_659_cast_fp16, var_661_cast_fp16, var_663_cast_fp16, var_665_cast_fp16, var_667_cast_fp16, var_669_cast_fp16))[name = tensor<string, []>("x_11_cast_fp16")];
+            tensor<int32, [4]> var_674 = const()[name = tensor<string, []>("op_674"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_3_cast_fp16 = reshape(shape = var_674, x = x_11_cast_fp16)[name = tensor<string, []>("input_3_cast_fp16")];
+            tensor<int32, [2]> var_677 = const()[name = tensor<string, []>("op_677"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_679 = const()[name = tensor<string, []>("op_679"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_1_pad_type_0 = const()[name = tensor<string, []>("attention_output_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_1_pad_0 = const()[name = tensor<string, []>("attention_output_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31463936)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_1_cast_fp16 = conv(dilations = var_679, groups = var_52, pad = attention_output_1_pad_0, pad_type = attention_output_1_pad_type_0, strides = var_677, weight = blocks_0_attn_proj_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_13_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = tensor<string, []>("x_13_cast_fp16")];
+            tensor<bool, []> x_eps_3_interleave_0 = const()[name = tensor<string, []>("x_eps_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_3_to_fp16 = const()[name = tensor<string, []>("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338368)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_3_cast_fp16 = concat(axis = var_52, interleave = x_eps_3_interleave_0, values = (x_13_cast_fp16, eps_chan_3_to_fp16))[name = tensor<string, []>("x_eps_3_cast_fp16")];
+            tensor<int32, [1]> norm_x_3_axes_0 = const()[name = tensor<string, []>("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_55, x = x_eps_3_cast_fp16)[name = tensor<string, []>("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_7_cast_fp16 = real_div(x = x_13_cast_fp16, y = norm_x_3_cast_fp16)[name = tensor<string, []>("x_normed_7_cast_fp16")];
+            tensor<fp16, []> var_705_to_fp16 = const()[name = tensor<string, []>("op_705_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_705_to_fp16)[name = tensor<string, []>("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338560)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_5_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
+            tensor<int32, [2]> var_716 = const()[name = tensor<string, []>("op_716"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_718 = const()[name = tensor<string, []>("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_7_pad_type_0 = const()[name = tensor<string, []>("input_7_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_7_pad_0 = const()[name = tensor<string, []>("input_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50344768)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_7_cast_fp16 = conv(dilations = var_718, groups = var_52, pad = input_7_pad_0, pad_type = input_7_pad_type_0, strides = var_716, weight = blocks_0_mlp_fc_1_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
+            tensor<int32, [2]> var_722 = const()[name = tensor<string, []>("op_722"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_724 = const()[name = tensor<string, []>("op_724"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_1_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_1_pad_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(100676480)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_1_cast_fp16 = conv(dilations = var_724, groups = var_52, pad = x_fc_2_1_pad_0, pad_type = x_fc_2_1_pad_type_0, strides = var_722, weight = blocks_0_mlp_fc_2_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_727_cast_fp16 = silu(x = input_7_cast_fp16)[name = tensor<string, []>("op_727_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_9_cast_fp16 = mul(x = var_727_cast_fp16, y = x_fc_2_1_cast_fp16)[name = tensor<string, []>("input_9_cast_fp16")];
+            tensor<int32, [2]> var_730 = const()[name = tensor<string, []>("op_730"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_732 = const()[name = tensor<string, []>("op_732"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_734_pad_type_0 = const()[name = tensor<string, []>("op_734_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_734_pad_0 = const()[name = tensor<string, []>("op_734_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_0_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151008192)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_734_cast_fp16 = conv(dilations = var_732, groups = var_52, pad = var_734_pad_0, pad_type = var_734_pad_type_0, strides = var_730, weight = blocks_0_mlp_proj_weight_to_fp16, x = input_9_cast_fp16)[name = tensor<string, []>("op_734_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_17_cast_fp16 = add(x = var_734_cast_fp16, y = x_13_cast_fp16)[name = tensor<string, []>("x_17_cast_fp16")];
+            tensor<int32, []> var_740 = const()[name = tensor<string, []>("op_740"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_744 = const()[name = tensor<string, []>("op_744"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_746 = const()[name = tensor<string, []>("op_746"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_779 = const()[name = tensor<string, []>("op_779"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_782 = const()[name = tensor<string, []>("op_782"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_5_interleave_0 = const()[name = tensor<string, []>("x_eps_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_5_to_fp16 = const()[name = tensor<string, []>("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201339904)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_5_cast_fp16 = concat(axis = var_779, interleave = x_eps_5_interleave_0, values = (x_17_cast_fp16, eps_chan_5_to_fp16))[name = tensor<string, []>("x_eps_5_cast_fp16")];
+            tensor<int32, [1]> norm_x_5_axes_0 = const()[name = tensor<string, []>("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_782, x = x_eps_5_cast_fp16)[name = tensor<string, []>("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_13_cast_fp16 = real_div(x = x_17_cast_fp16, y = norm_x_5_cast_fp16)[name = tensor<string, []>("x_normed_13_cast_fp16")];
+            tensor<fp16, []> var_805_to_fp16 = const()[name = tensor<string, []>("op_805_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_805_to_fp16)[name = tensor<string, []>("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201340096)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_21_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = tensor<string, []>("x_21_cast_fp16")];
+            tensor<int32, [4]> var_829 = const()[name = tensor<string, []>("op_829"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_11_cast_fp16 = reshape(shape = var_829, x = x_21_cast_fp16)[name = tensor<string, []>("input_11_cast_fp16")];
+            tensor<int32, [2]> var_832 = const()[name = tensor<string, []>("op_832"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_834 = const()[name = tensor<string, []>("op_834"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_9_pad_type_0 = const()[name = tensor<string, []>("q_9_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_9_pad_0 = const()[name = tensor<string, []>("q_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201346304)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_9_cast_fp16 = conv(dilations = var_834, groups = var_779, pad = q_9_pad_0, pad_type = q_9_pad_type_0, strides = var_832, weight = blocks_1_attn_q_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("q_9_cast_fp16")];
+            tensor<int32, [2]> var_838 = const()[name = tensor<string, []>("op_838"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_840 = const()[name = tensor<string, []>("op_840"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_13_pad_type_0 = const()[name = tensor<string, []>("k_13_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_13_pad_0 = const()[name = tensor<string, []>("k_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(220220736)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_13_cast_fp16 = conv(dilations = var_840, groups = var_779, pad = k_13_pad_0, pad_type = k_13_pad_type_0, strides = var_838, weight = blocks_1_attn_k_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("k_13_cast_fp16")];
+            tensor<int32, [2]> var_844 = const()[name = tensor<string, []>("op_844"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_846 = const()[name = tensor<string, []>("op_846"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_11_pad_type_0 = const()[name = tensor<string, []>("v_11_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_11_pad_0 = const()[name = tensor<string, []>("v_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226512256)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_11_cast_fp16 = conv(dilations = var_846, groups = var_779, pad = v_11_pad_0, pad_type = v_11_pad_type_0, strides = var_844, weight = blocks_1_attn_v_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("v_11_cast_fp16")];
+            tensor<int32, [4]> var_849 = const()[name = tensor<string, []>("op_849"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_11_cast_fp16 = reshape(shape = var_849, x = q_9_cast_fp16)[name = tensor<string, []>("q_11_cast_fp16")];
+            tensor<int32, [4]> var_851 = const()[name = tensor<string, []>("op_851"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_15_cast_fp16 = reshape(shape = var_851, x = k_13_cast_fp16)[name = tensor<string, []>("k_15_cast_fp16")];
+            tensor<int32, [4]> var_865_begin_0 = const()[name = tensor<string, []>("op_865_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_865_end_0 = const()[name = tensor<string, []>("op_865_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_865_end_mask_0 = const()[name = tensor<string, []>("op_865_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_865_cast_fp16 = slice_by_index(begin = var_865_begin_0, end = var_865_end_0, end_mask = var_865_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_865_cast_fp16")];
+            tensor<int32, [4]> var_871_begin_0 = const()[name = tensor<string, []>("op_871_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_871_end_0 = const()[name = tensor<string, []>("op_871_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_871_end_mask_0 = const()[name = tensor<string, []>("op_871_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_871_cast_fp16 = slice_by_index(begin = var_871_begin_0, end = var_871_end_0, end_mask = var_871_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_871_cast_fp16")];
+            tensor<fp16, []> const_30_promoted_to_fp16 = const()[name = tensor<string, []>("const_30_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_873_cast_fp16 = mul(x = var_871_cast_fp16, y = const_30_promoted_to_fp16)[name = tensor<string, []>("op_873_cast_fp16")];
+            tensor<bool, []> rotated_5_interleave_0 = const()[name = tensor<string, []>("rotated_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_5_cast_fp16 = concat(axis = var_744, interleave = rotated_5_interleave_0, values = (var_873_cast_fp16, var_865_cast_fp16))[name = tensor<string, []>("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_876_cast_fp16 = mul(x = q_11_cast_fp16, y = cos)[name = tensor<string, []>("op_876_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_877_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = tensor<string, []>("op_877_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_5_cast_fp16 = add(x = var_876_cast_fp16, y = var_877_cast_fp16)[name = tensor<string, []>("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_890_begin_0 = const()[name = tensor<string, []>("op_890_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_890_end_0 = const()[name = tensor<string, []>("op_890_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_890_end_mask_0 = const()[name = tensor<string, []>("op_890_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_890_cast_fp16 = slice_by_index(begin = var_890_begin_0, end = var_890_end_0, end_mask = var_890_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_890_cast_fp16")];
+            tensor<int32, [4]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_896_cast_fp16 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_896_cast_fp16")];
+            tensor<fp16, []> const_32_promoted_to_fp16 = const()[name = tensor<string, []>("const_32_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_898_cast_fp16 = mul(x = var_896_cast_fp16, y = const_32_promoted_to_fp16)[name = tensor<string, []>("op_898_cast_fp16")];
+            tensor<bool, []> rotated_interleave_0 = const()[name = tensor<string, []>("rotated_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_cast_fp16 = concat(axis = var_744, interleave = rotated_interleave_0, values = (var_898_cast_fp16, var_890_cast_fp16))[name = tensor<string, []>("rotated_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_901_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = tensor<string, []>("op_901_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_902_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = tensor<string, []>("op_902_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_cast_fp16 = add(x = var_901_cast_fp16, y = var_902_cast_fp16)[name = tensor<string, []>("roped_cast_fp16")];
+            tensor<int32, [4]> var_905 = const()[name = tensor<string, []>("op_905"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_19_cast_fp16 = reshape(shape = var_905, x = roped_cast_fp16)[name = tensor<string, []>("k_19_cast_fp16")];
+            tensor<int32, [4]> var_907 = const()[name = tensor<string, []>("op_907"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_1 = reshape(shape = var_907, x = v_11_cast_fp16)[name = tensor<string, []>("new_v_cache_1_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_interleave_0 = const()[name = tensor<string, []>("k_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_1 = transpose(perm = k_21_perm_0, x = k_19_cast_fp16)[name = tensor<string, []>("transpose_0")];
+            tensor<fp16, [1, 512, 1, 1024]> k_cast_fp16 = concat(axis = var_746, interleave = k_interleave_0, values = (k_cache_1, new_k_cache_1))[name = tensor<string, []>("k_cast_fp16")];
+            tensor<bool, []> v_17_interleave_0 = const()[name = tensor<string, []>("v_17_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_17_cast_fp16 = concat(axis = var_740, interleave = v_17_interleave_0, values = (v_cache_1, new_v_cache_1))[name = tensor<string, []>("v_17_cast_fp16")];
+            tensor<int32, [4]> var_915 = const()[name = tensor<string, []>("op_915"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_cast_fp16 = reshape(shape = var_915, x = roped_5_cast_fp16)[name = tensor<string, []>("q_cast_fp16")];
+            tensor<int32, [4]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_920_cast_fp16 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_920_cast_fp16")];
+            tensor<int32, [4]> var_924_begin_0 = const()[name = tensor<string, []>("op_924_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_924_end_0 = const()[name = tensor<string, []>("op_924_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_924_end_mask_0 = const()[name = tensor<string, []>("op_924_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_924_cast_fp16 = slice_by_index(begin = var_924_begin_0, end = var_924_end_0, end_mask = var_924_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_924_cast_fp16")];
+            tensor<int32, [4]> var_928_begin_0 = const()[name = tensor<string, []>("op_928_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_928_end_0 = const()[name = tensor<string, []>("op_928_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_928_end_mask_0 = const()[name = tensor<string, []>("op_928_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_928_cast_fp16 = slice_by_index(begin = var_928_begin_0, end = var_928_end_0, end_mask = var_928_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_928_cast_fp16")];
+            tensor<int32, [4]> var_932_begin_0 = const()[name = tensor<string, []>("op_932_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_932_end_0 = const()[name = tensor<string, []>("op_932_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_932_end_mask_0 = const()[name = tensor<string, []>("op_932_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_932_cast_fp16 = slice_by_index(begin = var_932_begin_0, end = var_932_end_0, end_mask = var_932_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_932_cast_fp16")];
+            tensor<int32, [4]> var_936_begin_0 = const()[name = tensor<string, []>("op_936_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_936_end_0 = const()[name = tensor<string, []>("op_936_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_936_end_mask_0 = const()[name = tensor<string, []>("op_936_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_936_cast_fp16 = slice_by_index(begin = var_936_begin_0, end = var_936_end_0, end_mask = var_936_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_936_cast_fp16")];
+            tensor<int32, [4]> var_940_begin_0 = const()[name = tensor<string, []>("op_940_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_940_end_0 = const()[name = tensor<string, []>("op_940_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_940_end_mask_0 = const()[name = tensor<string, []>("op_940_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_940_cast_fp16 = slice_by_index(begin = var_940_begin_0, end = var_940_end_0, end_mask = var_940_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_940_cast_fp16")];
+            tensor<int32, [4]> var_944_begin_0 = const()[name = tensor<string, []>("op_944_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_944_end_0 = const()[name = tensor<string, []>("op_944_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_944_end_mask_0 = const()[name = tensor<string, []>("op_944_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_944_cast_fp16 = slice_by_index(begin = var_944_begin_0, end = var_944_end_0, end_mask = var_944_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_944_cast_fp16")];
+            tensor<int32, [4]> var_948_begin_0 = const()[name = tensor<string, []>("op_948_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_948_end_0 = const()[name = tensor<string, []>("op_948_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_948_end_mask_0 = const()[name = tensor<string, []>("op_948_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_948_cast_fp16 = slice_by_index(begin = var_948_begin_0, end = var_948_end_0, end_mask = var_948_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_948_cast_fp16")];
+            tensor<int32, [4]> var_952_begin_0 = const()[name = tensor<string, []>("op_952_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_952_end_0 = const()[name = tensor<string, []>("op_952_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_952_end_mask_0 = const()[name = tensor<string, []>("op_952_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_952_cast_fp16 = slice_by_index(begin = var_952_begin_0, end = var_952_end_0, end_mask = var_952_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_952_cast_fp16")];
+            tensor<int32, [4]> var_956_begin_0 = const()[name = tensor<string, []>("op_956_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_956_end_0 = const()[name = tensor<string, []>("op_956_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_956_end_mask_0 = const()[name = tensor<string, []>("op_956_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_956_cast_fp16 = slice_by_index(begin = var_956_begin_0, end = var_956_end_0, end_mask = var_956_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_956_cast_fp16")];
+            tensor<int32, [4]> var_960_begin_0 = const()[name = tensor<string, []>("op_960_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_960_end_0 = const()[name = tensor<string, []>("op_960_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_960_end_mask_0 = const()[name = tensor<string, []>("op_960_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_960_cast_fp16 = slice_by_index(begin = var_960_begin_0, end = var_960_end_0, end_mask = var_960_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_960_cast_fp16")];
+            tensor<int32, [4]> var_964_begin_0 = const()[name = tensor<string, []>("op_964_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_964_end_0 = const()[name = tensor<string, []>("op_964_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_964_end_mask_0 = const()[name = tensor<string, []>("op_964_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_964_cast_fp16 = slice_by_index(begin = var_964_begin_0, end = var_964_end_0, end_mask = var_964_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_964_cast_fp16")];
+            tensor<int32, [4]> var_968_begin_0 = const()[name = tensor<string, []>("op_968_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_968_end_0 = const()[name = tensor<string, []>("op_968_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_968_end_mask_0 = const()[name = tensor<string, []>("op_968_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_968_cast_fp16 = slice_by_index(begin = var_968_begin_0, end = var_968_end_0, end_mask = var_968_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_968_cast_fp16")];
+            tensor<int32, [4]> var_972_begin_0 = const()[name = tensor<string, []>("op_972_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_972_end_0 = const()[name = tensor<string, []>("op_972_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_972_end_mask_0 = const()[name = tensor<string, []>("op_972_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_972_cast_fp16 = slice_by_index(begin = var_972_begin_0, end = var_972_end_0, end_mask = var_972_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_972_cast_fp16")];
+            tensor<int32, [4]> var_976_begin_0 = const()[name = tensor<string, []>("op_976_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_976_end_0 = const()[name = tensor<string, []>("op_976_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_976_end_mask_0 = const()[name = tensor<string, []>("op_976_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_976_cast_fp16 = slice_by_index(begin = var_976_begin_0, end = var_976_end_0, end_mask = var_976_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_976_cast_fp16")];
+            tensor<int32, [4]> var_980_begin_0 = const()[name = tensor<string, []>("op_980_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_980_end_0 = const()[name = tensor<string, []>("op_980_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_980_end_mask_0 = const()[name = tensor<string, []>("op_980_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_980_cast_fp16 = slice_by_index(begin = var_980_begin_0, end = var_980_end_0, end_mask = var_980_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_980_cast_fp16")];
+            tensor<int32, [4]> var_984_begin_0 = const()[name = tensor<string, []>("op_984_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_984_end_0 = const()[name = tensor<string, []>("op_984_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_984_end_mask_0 = const()[name = tensor<string, []>("op_984_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_984_cast_fp16 = slice_by_index(begin = var_984_begin_0, end = var_984_end_0, end_mask = var_984_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_984_cast_fp16")];
+            tensor<int32, [4]> var_988_begin_0 = const()[name = tensor<string, []>("op_988_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_988_end_0 = const()[name = tensor<string, []>("op_988_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_988_end_mask_0 = const()[name = tensor<string, []>("op_988_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_988_cast_fp16 = slice_by_index(begin = var_988_begin_0, end = var_988_end_0, end_mask = var_988_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_988_cast_fp16")];
+            tensor<int32, [4]> var_992_begin_0 = const()[name = tensor<string, []>("op_992_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_992_end_0 = const()[name = tensor<string, []>("op_992_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_992_end_mask_0 = const()[name = tensor<string, []>("op_992_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_992_cast_fp16 = slice_by_index(begin = var_992_begin_0, end = var_992_end_0, end_mask = var_992_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_992_cast_fp16")];
+            tensor<int32, [4]> var_996_begin_0 = const()[name = tensor<string, []>("op_996_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_996_end_0 = const()[name = tensor<string, []>("op_996_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_996_end_mask_0 = const()[name = tensor<string, []>("op_996_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_996_cast_fp16 = slice_by_index(begin = var_996_begin_0, end = var_996_end_0, end_mask = var_996_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_996_cast_fp16")];
+            tensor<int32, [4]> var_1000_begin_0 = const()[name = tensor<string, []>("op_1000_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_1000_end_0 = const()[name = tensor<string, []>("op_1000_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_1000_end_mask_0 = const()[name = tensor<string, []>("op_1000_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1000_cast_fp16 = slice_by_index(begin = var_1000_begin_0, end = var_1000_end_0, end_mask = var_1000_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1000_cast_fp16")];
+            tensor<int32, [4]> var_1004_begin_0 = const()[name = tensor<string, []>("op_1004_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_1004_end_0 = const()[name = tensor<string, []>("op_1004_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_1004_end_mask_0 = const()[name = tensor<string, []>("op_1004_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1004_cast_fp16 = slice_by_index(begin = var_1004_begin_0, end = var_1004_end_0, end_mask = var_1004_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1004_cast_fp16")];
+            tensor<int32, [4]> var_1008_begin_0 = const()[name = tensor<string, []>("op_1008_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_1008_end_0 = const()[name = tensor<string, []>("op_1008_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_1008_end_mask_0 = const()[name = tensor<string, []>("op_1008_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1008_cast_fp16 = slice_by_index(begin = var_1008_begin_0, end = var_1008_end_0, end_mask = var_1008_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1008_cast_fp16")];
+            tensor<int32, [4]> var_1012_begin_0 = const()[name = tensor<string, []>("op_1012_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_1012_end_0 = const()[name = tensor<string, []>("op_1012_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_1012_end_mask_0 = const()[name = tensor<string, []>("op_1012_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1012_cast_fp16 = slice_by_index(begin = var_1012_begin_0, end = var_1012_end_0, end_mask = var_1012_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1012_cast_fp16")];
+            tensor<int32, [4]> var_1018_begin_0 = const()[name = tensor<string, []>("op_1018_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1018_end_0 = const()[name = tensor<string, []>("op_1018_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_1018_end_mask_0 = const()[name = tensor<string, []>("op_1018_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1018_cast_fp16 = slice_by_index(begin = var_1018_begin_0, end = var_1018_end_0, end_mask = var_1018_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1018_cast_fp16")];
+            tensor<int32, [4]> var_1030_begin_0 = const()[name = tensor<string, []>("op_1030_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_1030_end_0 = const()[name = tensor<string, []>("op_1030_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_1030_end_mask_0 = const()[name = tensor<string, []>("op_1030_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1030_cast_fp16 = slice_by_index(begin = var_1030_begin_0, end = var_1030_end_0, end_mask = var_1030_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1030_cast_fp16")];
+            tensor<int32, [4]> var_1042_begin_0 = const()[name = tensor<string, []>("op_1042_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_1042_end_0 = const()[name = tensor<string, []>("op_1042_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_1042_end_mask_0 = const()[name = tensor<string, []>("op_1042_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1042_cast_fp16 = slice_by_index(begin = var_1042_begin_0, end = var_1042_end_0, end_mask = var_1042_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1042_cast_fp16")];
+            tensor<int32, [4]> var_1054_begin_0 = const()[name = tensor<string, []>("op_1054_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_1054_end_0 = const()[name = tensor<string, []>("op_1054_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1054_end_mask_0 = const()[name = tensor<string, []>("op_1054_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1054_cast_fp16 = slice_by_index(begin = var_1054_begin_0, end = var_1054_end_0, end_mask = var_1054_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1054_cast_fp16")];
+            tensor<int32, [4]> var_1066_begin_0 = const()[name = tensor<string, []>("op_1066_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_1066_end_0 = const()[name = tensor<string, []>("op_1066_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_1066_end_mask_0 = const()[name = tensor<string, []>("op_1066_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1066_cast_fp16 = slice_by_index(begin = var_1066_begin_0, end = var_1066_end_0, end_mask = var_1066_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1066_cast_fp16")];
+            tensor<int32, [4]> var_1078_begin_0 = const()[name = tensor<string, []>("op_1078_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_1078_end_0 = const()[name = tensor<string, []>("op_1078_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_1078_end_mask_0 = const()[name = tensor<string, []>("op_1078_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1078_cast_fp16 = slice_by_index(begin = var_1078_begin_0, end = var_1078_end_0, end_mask = var_1078_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1078_cast_fp16")];
+            tensor<int32, [4]> var_1090_begin_0 = const()[name = tensor<string, []>("op_1090_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_1090_end_0 = const()[name = tensor<string, []>("op_1090_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_1090_end_mask_0 = const()[name = tensor<string, []>("op_1090_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1090_cast_fp16 = slice_by_index(begin = var_1090_begin_0, end = var_1090_end_0, end_mask = var_1090_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1090_cast_fp16")];
+            tensor<int32, [4]> var_1102_begin_0 = const()[name = tensor<string, []>("op_1102_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_1102_end_0 = const()[name = tensor<string, []>("op_1102_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_1102_end_mask_0 = const()[name = tensor<string, []>("op_1102_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1102_cast_fp16 = slice_by_index(begin = var_1102_begin_0, end = var_1102_end_0, end_mask = var_1102_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1102_cast_fp16")];
+            tensor<int32, [4]> var_1112_begin_0 = const()[name = tensor<string, []>("op_1112_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1112_end_0 = const()[name = tensor<string, []>("op_1112_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_1112_end_mask_0 = const()[name = tensor<string, []>("op_1112_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1112_cast_fp16 = slice_by_index(begin = var_1112_begin_0, end = var_1112_end_0, end_mask = var_1112_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1112_cast_fp16")];
+            tensor<int32, [4]> var_1124_begin_0 = const()[name = tensor<string, []>("op_1124_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_1124_end_0 = const()[name = tensor<string, []>("op_1124_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_1124_end_mask_0 = const()[name = tensor<string, []>("op_1124_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1124_cast_fp16 = slice_by_index(begin = var_1124_begin_0, end = var_1124_end_0, end_mask = var_1124_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1124_cast_fp16")];
+            tensor<int32, [4]> var_1136_begin_0 = const()[name = tensor<string, []>("op_1136_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_1136_end_0 = const()[name = tensor<string, []>("op_1136_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_1136_end_mask_0 = const()[name = tensor<string, []>("op_1136_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1136_cast_fp16 = slice_by_index(begin = var_1136_begin_0, end = var_1136_end_0, end_mask = var_1136_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1136_cast_fp16")];
+            tensor<int32, [4]> var_1148_begin_0 = const()[name = tensor<string, []>("op_1148_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_1148_end_0 = const()[name = tensor<string, []>("op_1148_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1148_end_mask_0 = const()[name = tensor<string, []>("op_1148_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1148_cast_fp16 = slice_by_index(begin = var_1148_begin_0, end = var_1148_end_0, end_mask = var_1148_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1148_cast_fp16")];
+            tensor<int32, [4]> var_1160_begin_0 = const()[name = tensor<string, []>("op_1160_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_1160_end_0 = const()[name = tensor<string, []>("op_1160_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_1160_end_mask_0 = const()[name = tensor<string, []>("op_1160_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1160_cast_fp16 = slice_by_index(begin = var_1160_begin_0, end = var_1160_end_0, end_mask = var_1160_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1160_cast_fp16")];
+            tensor<int32, [4]> var_1172_begin_0 = const()[name = tensor<string, []>("op_1172_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_1172_end_0 = const()[name = tensor<string, []>("op_1172_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_1172_end_mask_0 = const()[name = tensor<string, []>("op_1172_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1172_cast_fp16 = slice_by_index(begin = var_1172_begin_0, end = var_1172_end_0, end_mask = var_1172_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1172_cast_fp16")];
+            tensor<int32, [4]> var_1184_begin_0 = const()[name = tensor<string, []>("op_1184_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_1184_end_0 = const()[name = tensor<string, []>("op_1184_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_1184_end_mask_0 = const()[name = tensor<string, []>("op_1184_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1184_cast_fp16 = slice_by_index(begin = var_1184_begin_0, end = var_1184_end_0, end_mask = var_1184_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1184_cast_fp16")];
+            tensor<int32, [4]> var_1196_begin_0 = const()[name = tensor<string, []>("op_1196_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_1196_end_0 = const()[name = tensor<string, []>("op_1196_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_1196_end_mask_0 = const()[name = tensor<string, []>("op_1196_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1196_cast_fp16 = slice_by_index(begin = var_1196_begin_0, end = var_1196_end_0, end_mask = var_1196_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1196_cast_fp16")];
+            tensor<string, []> var_1208_equation_0 = const()[name = tensor<string, []>("op_1208_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1208_cast_fp16 = einsum(equation = var_1208_equation_0, values = (var_1018_cast_fp16, var_920_cast_fp16))[name = tensor<string, []>("op_1208_cast_fp16")];
+            tensor<fp16, []> var_1209_to_fp16 = const()[name = tensor<string, []>("op_1209_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1210_cast_fp16 = mul(x = var_1208_cast_fp16, y = var_1209_to_fp16)[name = tensor<string, []>("op_1210_cast_fp16")];
+            tensor<string, []> var_1212_equation_0 = const()[name = tensor<string, []>("op_1212_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1212_cast_fp16 = einsum(equation = var_1212_equation_0, values = (var_1018_cast_fp16, var_924_cast_fp16))[name = tensor<string, []>("op_1212_cast_fp16")];
+            tensor<fp16, []> var_1213_to_fp16 = const()[name = tensor<string, []>("op_1213_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1214_cast_fp16 = mul(x = var_1212_cast_fp16, y = var_1213_to_fp16)[name = tensor<string, []>("op_1214_cast_fp16")];
+            tensor<string, []> var_1216_equation_0 = const()[name = tensor<string, []>("op_1216_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1216_cast_fp16 = einsum(equation = var_1216_equation_0, values = (var_1018_cast_fp16, var_928_cast_fp16))[name = tensor<string, []>("op_1216_cast_fp16")];
+            tensor<fp16, []> var_1217_to_fp16 = const()[name = tensor<string, []>("op_1217_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1218_cast_fp16 = mul(x = var_1216_cast_fp16, y = var_1217_to_fp16)[name = tensor<string, []>("op_1218_cast_fp16")];
+            tensor<string, []> var_1220_equation_0 = const()[name = tensor<string, []>("op_1220_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1220_cast_fp16 = einsum(equation = var_1220_equation_0, values = (var_1030_cast_fp16, var_932_cast_fp16))[name = tensor<string, []>("op_1220_cast_fp16")];
+            tensor<fp16, []> var_1221_to_fp16 = const()[name = tensor<string, []>("op_1221_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1222_cast_fp16 = mul(x = var_1220_cast_fp16, y = var_1221_to_fp16)[name = tensor<string, []>("op_1222_cast_fp16")];
+            tensor<string, []> var_1224_equation_0 = const()[name = tensor<string, []>("op_1224_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1224_cast_fp16 = einsum(equation = var_1224_equation_0, values = (var_1030_cast_fp16, var_936_cast_fp16))[name = tensor<string, []>("op_1224_cast_fp16")];
+            tensor<fp16, []> var_1225_to_fp16 = const()[name = tensor<string, []>("op_1225_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1226_cast_fp16 = mul(x = var_1224_cast_fp16, y = var_1225_to_fp16)[name = tensor<string, []>("op_1226_cast_fp16")];
+            tensor<string, []> var_1228_equation_0 = const()[name = tensor<string, []>("op_1228_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1228_cast_fp16 = einsum(equation = var_1228_equation_0, values = (var_1030_cast_fp16, var_940_cast_fp16))[name = tensor<string, []>("op_1228_cast_fp16")];
+            tensor<fp16, []> var_1229_to_fp16 = const()[name = tensor<string, []>("op_1229_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1230_cast_fp16 = mul(x = var_1228_cast_fp16, y = var_1229_to_fp16)[name = tensor<string, []>("op_1230_cast_fp16")];
+            tensor<string, []> var_1232_equation_0 = const()[name = tensor<string, []>("op_1232_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1232_cast_fp16 = einsum(equation = var_1232_equation_0, values = (var_1042_cast_fp16, var_944_cast_fp16))[name = tensor<string, []>("op_1232_cast_fp16")];
+            tensor<fp16, []> var_1233_to_fp16 = const()[name = tensor<string, []>("op_1233_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1234_cast_fp16 = mul(x = var_1232_cast_fp16, y = var_1233_to_fp16)[name = tensor<string, []>("op_1234_cast_fp16")];
+            tensor<string, []> var_1236_equation_0 = const()[name = tensor<string, []>("op_1236_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1236_cast_fp16 = einsum(equation = var_1236_equation_0, values = (var_1042_cast_fp16, var_948_cast_fp16))[name = tensor<string, []>("op_1236_cast_fp16")];
+            tensor<fp16, []> var_1237_to_fp16 = const()[name = tensor<string, []>("op_1237_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1238_cast_fp16 = mul(x = var_1236_cast_fp16, y = var_1237_to_fp16)[name = tensor<string, []>("op_1238_cast_fp16")];
+            tensor<string, []> var_1240_equation_0 = const()[name = tensor<string, []>("op_1240_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1240_cast_fp16 = einsum(equation = var_1240_equation_0, values = (var_1042_cast_fp16, var_952_cast_fp16))[name = tensor<string, []>("op_1240_cast_fp16")];
+            tensor<fp16, []> var_1241_to_fp16 = const()[name = tensor<string, []>("op_1241_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1242_cast_fp16 = mul(x = var_1240_cast_fp16, y = var_1241_to_fp16)[name = tensor<string, []>("op_1242_cast_fp16")];
+            tensor<string, []> var_1244_equation_0 = const()[name = tensor<string, []>("op_1244_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1244_cast_fp16 = einsum(equation = var_1244_equation_0, values = (var_1054_cast_fp16, var_956_cast_fp16))[name = tensor<string, []>("op_1244_cast_fp16")];
+            tensor<fp16, []> var_1245_to_fp16 = const()[name = tensor<string, []>("op_1245_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1246_cast_fp16 = mul(x = var_1244_cast_fp16, y = var_1245_to_fp16)[name = tensor<string, []>("op_1246_cast_fp16")];
+            tensor<string, []> var_1248_equation_0 = const()[name = tensor<string, []>("op_1248_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1248_cast_fp16 = einsum(equation = var_1248_equation_0, values = (var_1054_cast_fp16, var_960_cast_fp16))[name = tensor<string, []>("op_1248_cast_fp16")];
+            tensor<fp16, []> var_1249_to_fp16 = const()[name = tensor<string, []>("op_1249_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1250_cast_fp16 = mul(x = var_1248_cast_fp16, y = var_1249_to_fp16)[name = tensor<string, []>("op_1250_cast_fp16")];
+            tensor<string, []> var_1252_equation_0 = const()[name = tensor<string, []>("op_1252_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1252_cast_fp16 = einsum(equation = var_1252_equation_0, values = (var_1054_cast_fp16, var_964_cast_fp16))[name = tensor<string, []>("op_1252_cast_fp16")];
+            tensor<fp16, []> var_1253_to_fp16 = const()[name = tensor<string, []>("op_1253_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1254_cast_fp16 = mul(x = var_1252_cast_fp16, y = var_1253_to_fp16)[name = tensor<string, []>("op_1254_cast_fp16")];
+            tensor<string, []> var_1256_equation_0 = const()[name = tensor<string, []>("op_1256_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1256_cast_fp16 = einsum(equation = var_1256_equation_0, values = (var_1066_cast_fp16, var_968_cast_fp16))[name = tensor<string, []>("op_1256_cast_fp16")];
+            tensor<fp16, []> var_1257_to_fp16 = const()[name = tensor<string, []>("op_1257_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1258_cast_fp16 = mul(x = var_1256_cast_fp16, y = var_1257_to_fp16)[name = tensor<string, []>("op_1258_cast_fp16")];
+            tensor<string, []> var_1260_equation_0 = const()[name = tensor<string, []>("op_1260_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1260_cast_fp16 = einsum(equation = var_1260_equation_0, values = (var_1066_cast_fp16, var_972_cast_fp16))[name = tensor<string, []>("op_1260_cast_fp16")];
+            tensor<fp16, []> var_1261_to_fp16 = const()[name = tensor<string, []>("op_1261_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1262_cast_fp16 = mul(x = var_1260_cast_fp16, y = var_1261_to_fp16)[name = tensor<string, []>("op_1262_cast_fp16")];
+            tensor<string, []> var_1264_equation_0 = const()[name = tensor<string, []>("op_1264_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1264_cast_fp16 = einsum(equation = var_1264_equation_0, values = (var_1066_cast_fp16, var_976_cast_fp16))[name = tensor<string, []>("op_1264_cast_fp16")];
+            tensor<fp16, []> var_1265_to_fp16 = const()[name = tensor<string, []>("op_1265_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1266_cast_fp16 = mul(x = var_1264_cast_fp16, y = var_1265_to_fp16)[name = tensor<string, []>("op_1266_cast_fp16")];
+            tensor<string, []> var_1268_equation_0 = const()[name = tensor<string, []>("op_1268_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1268_cast_fp16 = einsum(equation = var_1268_equation_0, values = (var_1078_cast_fp16, var_980_cast_fp16))[name = tensor<string, []>("op_1268_cast_fp16")];
+            tensor<fp16, []> var_1269_to_fp16 = const()[name = tensor<string, []>("op_1269_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1270_cast_fp16 = mul(x = var_1268_cast_fp16, y = var_1269_to_fp16)[name = tensor<string, []>("op_1270_cast_fp16")];
+            tensor<string, []> var_1272_equation_0 = const()[name = tensor<string, []>("op_1272_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1272_cast_fp16 = einsum(equation = var_1272_equation_0, values = (var_1078_cast_fp16, var_984_cast_fp16))[name = tensor<string, []>("op_1272_cast_fp16")];
+            tensor<fp16, []> var_1273_to_fp16 = const()[name = tensor<string, []>("op_1273_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1274_cast_fp16 = mul(x = var_1272_cast_fp16, y = var_1273_to_fp16)[name = tensor<string, []>("op_1274_cast_fp16")];
+            tensor<string, []> var_1276_equation_0 = const()[name = tensor<string, []>("op_1276_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1276_cast_fp16 = einsum(equation = var_1276_equation_0, values = (var_1078_cast_fp16, var_988_cast_fp16))[name = tensor<string, []>("op_1276_cast_fp16")];
+            tensor<fp16, []> var_1277_to_fp16 = const()[name = tensor<string, []>("op_1277_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1278_cast_fp16 = mul(x = var_1276_cast_fp16, y = var_1277_to_fp16)[name = tensor<string, []>("op_1278_cast_fp16")];
+            tensor<string, []> var_1280_equation_0 = const()[name = tensor<string, []>("op_1280_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1280_cast_fp16 = einsum(equation = var_1280_equation_0, values = (var_1090_cast_fp16, var_992_cast_fp16))[name = tensor<string, []>("op_1280_cast_fp16")];
+            tensor<fp16, []> var_1281_to_fp16 = const()[name = tensor<string, []>("op_1281_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1282_cast_fp16 = mul(x = var_1280_cast_fp16, y = var_1281_to_fp16)[name = tensor<string, []>("op_1282_cast_fp16")];
+            tensor<string, []> var_1284_equation_0 = const()[name = tensor<string, []>("op_1284_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1284_cast_fp16 = einsum(equation = var_1284_equation_0, values = (var_1090_cast_fp16, var_996_cast_fp16))[name = tensor<string, []>("op_1284_cast_fp16")];
+            tensor<fp16, []> var_1285_to_fp16 = const()[name = tensor<string, []>("op_1285_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1286_cast_fp16 = mul(x = var_1284_cast_fp16, y = var_1285_to_fp16)[name = tensor<string, []>("op_1286_cast_fp16")];
+            tensor<string, []> var_1288_equation_0 = const()[name = tensor<string, []>("op_1288_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1288_cast_fp16 = einsum(equation = var_1288_equation_0, values = (var_1090_cast_fp16, var_1000_cast_fp16))[name = tensor<string, []>("op_1288_cast_fp16")];
+            tensor<fp16, []> var_1289_to_fp16 = const()[name = tensor<string, []>("op_1289_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1290_cast_fp16 = mul(x = var_1288_cast_fp16, y = var_1289_to_fp16)[name = tensor<string, []>("op_1290_cast_fp16")];
+            tensor<string, []> var_1292_equation_0 = const()[name = tensor<string, []>("op_1292_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1292_cast_fp16 = einsum(equation = var_1292_equation_0, values = (var_1102_cast_fp16, var_1004_cast_fp16))[name = tensor<string, []>("op_1292_cast_fp16")];
+            tensor<fp16, []> var_1293_to_fp16 = const()[name = tensor<string, []>("op_1293_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1294_cast_fp16 = mul(x = var_1292_cast_fp16, y = var_1293_to_fp16)[name = tensor<string, []>("op_1294_cast_fp16")];
+            tensor<string, []> var_1296_equation_0 = const()[name = tensor<string, []>("op_1296_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1296_cast_fp16 = einsum(equation = var_1296_equation_0, values = (var_1102_cast_fp16, var_1008_cast_fp16))[name = tensor<string, []>("op_1296_cast_fp16")];
+            tensor<fp16, []> var_1297_to_fp16 = const()[name = tensor<string, []>("op_1297_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1298_cast_fp16 = mul(x = var_1296_cast_fp16, y = var_1297_to_fp16)[name = tensor<string, []>("op_1298_cast_fp16")];
+            tensor<string, []> var_1300_equation_0 = const()[name = tensor<string, []>("op_1300_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1300_cast_fp16 = einsum(equation = var_1300_equation_0, values = (var_1102_cast_fp16, var_1012_cast_fp16))[name = tensor<string, []>("op_1300_cast_fp16")];
+            tensor<fp16, []> var_1301_to_fp16 = const()[name = tensor<string, []>("op_1301_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1302_cast_fp16 = mul(x = var_1300_cast_fp16, y = var_1301_to_fp16)[name = tensor<string, []>("op_1302_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_49_cast_fp16 = add(x = var_1210_cast_fp16, y = mask)[name = tensor<string, []>("aw_49_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_51_cast_fp16 = add(x = var_1214_cast_fp16, y = mask)[name = tensor<string, []>("aw_51_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_53_cast_fp16 = add(x = var_1218_cast_fp16, y = mask)[name = tensor<string, []>("aw_53_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_55_cast_fp16 = add(x = var_1222_cast_fp16, y = mask)[name = tensor<string, []>("aw_55_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_57_cast_fp16 = add(x = var_1226_cast_fp16, y = mask)[name = tensor<string, []>("aw_57_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_59_cast_fp16 = add(x = var_1230_cast_fp16, y = mask)[name = tensor<string, []>("aw_59_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_61_cast_fp16 = add(x = var_1234_cast_fp16, y = mask)[name = tensor<string, []>("aw_61_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_63_cast_fp16 = add(x = var_1238_cast_fp16, y = mask)[name = tensor<string, []>("aw_63_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_65_cast_fp16 = add(x = var_1242_cast_fp16, y = mask)[name = tensor<string, []>("aw_65_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_67_cast_fp16 = add(x = var_1246_cast_fp16, y = mask)[name = tensor<string, []>("aw_67_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_69_cast_fp16 = add(x = var_1250_cast_fp16, y = mask)[name = tensor<string, []>("aw_69_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_71_cast_fp16 = add(x = var_1254_cast_fp16, y = mask)[name = tensor<string, []>("aw_71_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_73_cast_fp16 = add(x = var_1258_cast_fp16, y = mask)[name = tensor<string, []>("aw_73_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_75_cast_fp16 = add(x = var_1262_cast_fp16, y = mask)[name = tensor<string, []>("aw_75_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_77_cast_fp16 = add(x = var_1266_cast_fp16, y = mask)[name = tensor<string, []>("aw_77_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_79_cast_fp16 = add(x = var_1270_cast_fp16, y = mask)[name = tensor<string, []>("aw_79_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_81_cast_fp16 = add(x = var_1274_cast_fp16, y = mask)[name = tensor<string, []>("aw_81_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_83_cast_fp16 = add(x = var_1278_cast_fp16, y = mask)[name = tensor<string, []>("aw_83_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_85_cast_fp16 = add(x = var_1282_cast_fp16, y = mask)[name = tensor<string, []>("aw_85_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_87_cast_fp16 = add(x = var_1286_cast_fp16, y = mask)[name = tensor<string, []>("aw_87_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_89_cast_fp16 = add(x = var_1290_cast_fp16, y = mask)[name = tensor<string, []>("aw_89_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_91_cast_fp16 = add(x = var_1294_cast_fp16, y = mask)[name = tensor<string, []>("aw_91_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_93_cast_fp16 = add(x = var_1298_cast_fp16, y = mask)[name = tensor<string, []>("aw_93_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_cast_fp16 = add(x = var_1302_cast_fp16, y = mask)[name = tensor<string, []>("aw_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1327_cast_fp16 = softmax(axis = var_779, x = aw_49_cast_fp16)[name = tensor<string, []>("op_1327_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1328_cast_fp16 = softmax(axis = var_779, x = aw_51_cast_fp16)[name = tensor<string, []>("op_1328_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1329_cast_fp16 = softmax(axis = var_779, x = aw_53_cast_fp16)[name = tensor<string, []>("op_1329_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1330_cast_fp16 = softmax(axis = var_779, x = aw_55_cast_fp16)[name = tensor<string, []>("op_1330_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1331_cast_fp16 = softmax(axis = var_779, x = aw_57_cast_fp16)[name = tensor<string, []>("op_1331_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1332_cast_fp16 = softmax(axis = var_779, x = aw_59_cast_fp16)[name = tensor<string, []>("op_1332_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1333_cast_fp16 = softmax(axis = var_779, x = aw_61_cast_fp16)[name = tensor<string, []>("op_1333_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1334_cast_fp16 = softmax(axis = var_779, x = aw_63_cast_fp16)[name = tensor<string, []>("op_1334_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1335_cast_fp16 = softmax(axis = var_779, x = aw_65_cast_fp16)[name = tensor<string, []>("op_1335_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1336_cast_fp16 = softmax(axis = var_779, x = aw_67_cast_fp16)[name = tensor<string, []>("op_1336_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1337_cast_fp16 = softmax(axis = var_779, x = aw_69_cast_fp16)[name = tensor<string, []>("op_1337_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1338_cast_fp16 = softmax(axis = var_779, x = aw_71_cast_fp16)[name = tensor<string, []>("op_1338_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1339_cast_fp16 = softmax(axis = var_779, x = aw_73_cast_fp16)[name = tensor<string, []>("op_1339_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1340_cast_fp16 = softmax(axis = var_779, x = aw_75_cast_fp16)[name = tensor<string, []>("op_1340_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1341_cast_fp16 = softmax(axis = var_779, x = aw_77_cast_fp16)[name = tensor<string, []>("op_1341_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1342_cast_fp16 = softmax(axis = var_779, x = aw_79_cast_fp16)[name = tensor<string, []>("op_1342_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1343_cast_fp16 = softmax(axis = var_779, x = aw_81_cast_fp16)[name = tensor<string, []>("op_1343_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1344_cast_fp16 = softmax(axis = var_779, x = aw_83_cast_fp16)[name = tensor<string, []>("op_1344_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1345_cast_fp16 = softmax(axis = var_779, x = aw_85_cast_fp16)[name = tensor<string, []>("op_1345_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1346_cast_fp16 = softmax(axis = var_779, x = aw_87_cast_fp16)[name = tensor<string, []>("op_1346_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1347_cast_fp16 = softmax(axis = var_779, x = aw_89_cast_fp16)[name = tensor<string, []>("op_1347_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1348_cast_fp16 = softmax(axis = var_779, x = aw_91_cast_fp16)[name = tensor<string, []>("op_1348_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1349_cast_fp16 = softmax(axis = var_779, x = aw_93_cast_fp16)[name = tensor<string, []>("op_1349_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1350_cast_fp16 = softmax(axis = var_779, x = aw_cast_fp16)[name = tensor<string, []>("op_1350_cast_fp16")];
+            tensor<string, []> var_1352_equation_0 = const()[name = tensor<string, []>("op_1352_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1352_cast_fp16 = einsum(equation = var_1352_equation_0, values = (var_1112_cast_fp16, var_1327_cast_fp16))[name = tensor<string, []>("op_1352_cast_fp16")];
+            tensor<string, []> var_1354_equation_0 = const()[name = tensor<string, []>("op_1354_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1354_cast_fp16 = einsum(equation = var_1354_equation_0, values = (var_1112_cast_fp16, var_1328_cast_fp16))[name = tensor<string, []>("op_1354_cast_fp16")];
+            tensor<string, []> var_1356_equation_0 = const()[name = tensor<string, []>("op_1356_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1356_cast_fp16 = einsum(equation = var_1356_equation_0, values = (var_1112_cast_fp16, var_1329_cast_fp16))[name = tensor<string, []>("op_1356_cast_fp16")];
+            tensor<string, []> var_1358_equation_0 = const()[name = tensor<string, []>("op_1358_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1358_cast_fp16 = einsum(equation = var_1358_equation_0, values = (var_1124_cast_fp16, var_1330_cast_fp16))[name = tensor<string, []>("op_1358_cast_fp16")];
+            tensor<string, []> var_1360_equation_0 = const()[name = tensor<string, []>("op_1360_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1360_cast_fp16 = einsum(equation = var_1360_equation_0, values = (var_1124_cast_fp16, var_1331_cast_fp16))[name = tensor<string, []>("op_1360_cast_fp16")];
+            tensor<string, []> var_1362_equation_0 = const()[name = tensor<string, []>("op_1362_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1362_cast_fp16 = einsum(equation = var_1362_equation_0, values = (var_1124_cast_fp16, var_1332_cast_fp16))[name = tensor<string, []>("op_1362_cast_fp16")];
+            tensor<string, []> var_1364_equation_0 = const()[name = tensor<string, []>("op_1364_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1364_cast_fp16 = einsum(equation = var_1364_equation_0, values = (var_1136_cast_fp16, var_1333_cast_fp16))[name = tensor<string, []>("op_1364_cast_fp16")];
+            tensor<string, []> var_1366_equation_0 = const()[name = tensor<string, []>("op_1366_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1366_cast_fp16 = einsum(equation = var_1366_equation_0, values = (var_1136_cast_fp16, var_1334_cast_fp16))[name = tensor<string, []>("op_1366_cast_fp16")];
+            tensor<string, []> var_1368_equation_0 = const()[name = tensor<string, []>("op_1368_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1368_cast_fp16 = einsum(equation = var_1368_equation_0, values = (var_1136_cast_fp16, var_1335_cast_fp16))[name = tensor<string, []>("op_1368_cast_fp16")];
+            tensor<string, []> var_1370_equation_0 = const()[name = tensor<string, []>("op_1370_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1370_cast_fp16 = einsum(equation = var_1370_equation_0, values = (var_1148_cast_fp16, var_1336_cast_fp16))[name = tensor<string, []>("op_1370_cast_fp16")];
+            tensor<string, []> var_1372_equation_0 = const()[name = tensor<string, []>("op_1372_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1372_cast_fp16 = einsum(equation = var_1372_equation_0, values = (var_1148_cast_fp16, var_1337_cast_fp16))[name = tensor<string, []>("op_1372_cast_fp16")];
+            tensor<string, []> var_1374_equation_0 = const()[name = tensor<string, []>("op_1374_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1374_cast_fp16 = einsum(equation = var_1374_equation_0, values = (var_1148_cast_fp16, var_1338_cast_fp16))[name = tensor<string, []>("op_1374_cast_fp16")];
+            tensor<string, []> var_1376_equation_0 = const()[name = tensor<string, []>("op_1376_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1376_cast_fp16 = einsum(equation = var_1376_equation_0, values = (var_1160_cast_fp16, var_1339_cast_fp16))[name = tensor<string, []>("op_1376_cast_fp16")];
+            tensor<string, []> var_1378_equation_0 = const()[name = tensor<string, []>("op_1378_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1378_cast_fp16 = einsum(equation = var_1378_equation_0, values = (var_1160_cast_fp16, var_1340_cast_fp16))[name = tensor<string, []>("op_1378_cast_fp16")];
+            tensor<string, []> var_1380_equation_0 = const()[name = tensor<string, []>("op_1380_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1380_cast_fp16 = einsum(equation = var_1380_equation_0, values = (var_1160_cast_fp16, var_1341_cast_fp16))[name = tensor<string, []>("op_1380_cast_fp16")];
+            tensor<string, []> var_1382_equation_0 = const()[name = tensor<string, []>("op_1382_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1382_cast_fp16 = einsum(equation = var_1382_equation_0, values = (var_1172_cast_fp16, var_1342_cast_fp16))[name = tensor<string, []>("op_1382_cast_fp16")];
+            tensor<string, []> var_1384_equation_0 = const()[name = tensor<string, []>("op_1384_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1384_cast_fp16 = einsum(equation = var_1384_equation_0, values = (var_1172_cast_fp16, var_1343_cast_fp16))[name = tensor<string, []>("op_1384_cast_fp16")];
+            tensor<string, []> var_1386_equation_0 = const()[name = tensor<string, []>("op_1386_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1386_cast_fp16 = einsum(equation = var_1386_equation_0, values = (var_1172_cast_fp16, var_1344_cast_fp16))[name = tensor<string, []>("op_1386_cast_fp16")];
+            tensor<string, []> var_1388_equation_0 = const()[name = tensor<string, []>("op_1388_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1388_cast_fp16 = einsum(equation = var_1388_equation_0, values = (var_1184_cast_fp16, var_1345_cast_fp16))[name = tensor<string, []>("op_1388_cast_fp16")];
+            tensor<string, []> var_1390_equation_0 = const()[name = tensor<string, []>("op_1390_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1390_cast_fp16 = einsum(equation = var_1390_equation_0, values = (var_1184_cast_fp16, var_1346_cast_fp16))[name = tensor<string, []>("op_1390_cast_fp16")];
+            tensor<string, []> var_1392_equation_0 = const()[name = tensor<string, []>("op_1392_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1392_cast_fp16 = einsum(equation = var_1392_equation_0, values = (var_1184_cast_fp16, var_1347_cast_fp16))[name = tensor<string, []>("op_1392_cast_fp16")];
+            tensor<string, []> var_1394_equation_0 = const()[name = tensor<string, []>("op_1394_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1394_cast_fp16 = einsum(equation = var_1394_equation_0, values = (var_1196_cast_fp16, var_1348_cast_fp16))[name = tensor<string, []>("op_1394_cast_fp16")];
+            tensor<string, []> var_1396_equation_0 = const()[name = tensor<string, []>("op_1396_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1396_cast_fp16 = einsum(equation = var_1396_equation_0, values = (var_1196_cast_fp16, var_1349_cast_fp16))[name = tensor<string, []>("op_1396_cast_fp16")];
+            tensor<string, []> var_1398_equation_0 = const()[name = tensor<string, []>("op_1398_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1398_cast_fp16 = einsum(equation = var_1398_equation_0, values = (var_1196_cast_fp16, var_1350_cast_fp16))[name = tensor<string, []>("op_1398_cast_fp16")];
+            tensor<bool, []> x_27_interleave_0 = const()[name = tensor<string, []>("x_27_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_27_cast_fp16 = concat(axis = var_779, interleave = x_27_interleave_0, values = (var_1352_cast_fp16, var_1354_cast_fp16, var_1356_cast_fp16, var_1358_cast_fp16, var_1360_cast_fp16, var_1362_cast_fp16, var_1364_cast_fp16, var_1366_cast_fp16, var_1368_cast_fp16, var_1370_cast_fp16, var_1372_cast_fp16, var_1374_cast_fp16, var_1376_cast_fp16, var_1378_cast_fp16, var_1380_cast_fp16, var_1382_cast_fp16, var_1384_cast_fp16, var_1386_cast_fp16, var_1388_cast_fp16, var_1390_cast_fp16, var_1392_cast_fp16, var_1394_cast_fp16, var_1396_cast_fp16, var_1398_cast_fp16))[name = tensor<string, []>("x_27_cast_fp16")];
+            tensor<int32, [4]> var_1403 = const()[name = tensor<string, []>("op_1403"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_13_cast_fp16 = reshape(shape = var_1403, x = x_27_cast_fp16)[name = tensor<string, []>("input_13_cast_fp16")];
+            tensor<int32, [2]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1408 = const()[name = tensor<string, []>("op_1408"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_pad_type_0 = const()[name = tensor<string, []>("attention_output_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_pad_0 = const()[name = tensor<string, []>("attention_output_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(232803776)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_cast_fp16 = conv(dilations = var_1408, groups = var_779, pad = attention_output_pad_0, pad_type = attention_output_pad_type_0, strides = var_1406, weight = blocks_1_attn_proj_weight_to_fp16, x = input_13_cast_fp16)[name = tensor<string, []>("attention_output_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_29_cast_fp16 = add(x = attention_output_cast_fp16, y = x_17_cast_fp16)[name = tensor<string, []>("x_29_cast_fp16")];
+            tensor<bool, []> x_eps_interleave_0 = const()[name = tensor<string, []>("x_eps_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_to_fp16 = const()[name = tensor<string, []>("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678208)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_cast_fp16 = concat(axis = var_779, interleave = x_eps_interleave_0, values = (x_29_cast_fp16, eps_chan_to_fp16))[name = tensor<string, []>("x_eps_cast_fp16")];
+            tensor<int32, [1]> norm_x_axes_0 = const()[name = tensor<string, []>("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_782, x = x_eps_cast_fp16)[name = tensor<string, []>("norm_x_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_19_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_cast_fp16)[name = tensor<string, []>("x_normed_19_cast_fp16")];
+            tensor<fp16, []> var_1434_to_fp16 = const()[name = tensor<string, []>("op_1434_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_1434_to_fp16)[name = tensor<string, []>("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678400)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_15_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = tensor<string, []>("input_15_cast_fp16")];
+            tensor<int32, [2]> var_1445 = const()[name = tensor<string, []>("op_1445"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1447 = const()[name = tensor<string, []>("op_1447"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_17_pad_type_0 = const()[name = tensor<string, []>("input_17_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_17_pad_0 = const()[name = tensor<string, []>("input_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251684608)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_17_cast_fp16 = conv(dilations = var_1447, groups = var_779, pad = input_17_pad_0, pad_type = input_17_pad_type_0, strides = var_1445, weight = blocks_1_mlp_fc_1_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("input_17_cast_fp16")];
+            tensor<int32, [2]> var_1451 = const()[name = tensor<string, []>("op_1451"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1453 = const()[name = tensor<string, []>("op_1453"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_pad_0 = const()[name = tensor<string, []>("x_fc_2_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(302016320)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_cast_fp16 = conv(dilations = var_1453, groups = var_779, pad = x_fc_2_pad_0, pad_type = x_fc_2_pad_type_0, strides = var_1451, weight = blocks_1_mlp_fc_2_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_1456_cast_fp16 = silu(x = input_17_cast_fp16)[name = tensor<string, []>("op_1456_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_cast_fp16 = mul(x = var_1456_cast_fp16, y = x_fc_2_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
+            tensor<int32, [2]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1463_pad_type_0 = const()[name = tensor<string, []>("op_1463_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1463_pad_0 = const()[name = tensor<string, []>("op_1463_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_1_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(352348032)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_1463_cast_fp16 = conv(dilations = var_1461, groups = var_779, pad = var_1463_pad_0, pad_type = var_1463_pad_type_0, strides = var_1459, weight = blocks_1_mlp_proj_weight_to_fp16, x = input_cast_fp16)[name = tensor<string, []>("op_1463_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> new_x = add(x = var_1463_cast_fp16, y = x_29_cast_fp16)[name = tensor<string, []>("op_1464_cast_fp16")];
+        } -> (new_x, new_k_cache_0, new_v_cache_0, new_k_cache_1, new_v_cache_1);
+}
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk6.mlmodelc/weights/weight.bin b/Llama-3.2-3B-Instruct_chunk6.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d35abe65b9bcbba35955389a8d068fb90b334461
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk6.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a03a9e7bc84b5fedc7469722a7cad217dff44099167118db3559a706ff7b701
+size 402679744
diff --git a/Llama-3.2-3B-Instruct_chunk7.mlmodelc/analytics/coremldata.bin b/Llama-3.2-3B-Instruct_chunk7.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a63af39cde8e590e41fffd270ab8aede737490d
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk7.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf21e446e7587de3fd840eae95f3e79729298df568725552f7ef5fd8f954e58c
+size 243
diff --git a/Llama-3.2-3B-Instruct_chunk7.mlmodelc/coremldata.bin b/Llama-3.2-3B-Instruct_chunk7.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ef844658693d8a7fc2951abf2761f8f5f9bc62c3
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk7.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8129d684aa1ea8b76708a186fe44f7ffc4aa08b4854907105fe41c0825e71875
+size 653
diff --git a/Llama-3.2-3B-Instruct_chunk7.mlmodelc/metadata.json b/Llama-3.2-3B-Instruct_chunk7.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..50c03439bcb5ac3b637aea87cd1a35808af38d03
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk7.mlmodelc/metadata.json
@@ -0,0 +1,178 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "new_x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 7,
+    "mlProgramOperationTypeHistogram" : {
+      "Concat" : 14,
+      "Ios16.mul" : 70,
+      "SliceByIndex" : 88,
+      "Transpose" : 2,
+      "Ios16.einsum" : 96,
+      "Ios16.conv" : 14,
+      "Ios16.add" : 56,
+      "Ios16.realDiv" : 4,
+      "Ios16.softmax" : 48,
+      "Ios16.reduceL2Norm" : 4,
+      "Ios16.reshape" : 14,
+      "Ios16.silu" : 2
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "13.0",
+      "tvOS" : "16.0",
+      "visionOS" : "1.0",
+      "watchOS" : "9.0",
+      "iOS" : "16.0",
+      "macCatalyst" : "16.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.source" : "torch==2.1.0",
+      "com.github.apple.coremltools.version" : "8.0b1"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "cos",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "sin",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 512 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 512, 1, 64]",
+        "name" : "mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "Llama_3_2_3B_Instruct_2024_11_09_16_14_37_chunk7",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk7.mlmodelc/model.mil b/Llama-3.2-3B-Instruct_chunk7.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..78594b4291dc45ae43652f9a31200581b19ad3c6
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk7.mlmodelc/model.mil
@@ -0,0 +1,956 @@
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3304.5.2"}, {"coremlc-version", "3304.6.2"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
+{
+    func main<ios16>(tensor<fp16, [128, 64]> cos, tensor<fp16, [1, 448, 1, 1024]> k_cache_0, tensor<fp16, [1, 448, 1, 1024]> k_cache_1, tensor<fp16, [1, 512, 1, 64]> mask, tensor<fp16, [128, 64]> sin, tensor<fp16, [1, 1024, 1, 448]> v_cache_0, tensor<fp16, [1, 1024, 1, 448]> v_cache_1, tensor<fp16, [1, 3072, 8, 8]> x) [CoreML_InputDefaultValues = dict<tensor<string, []>, tensor<fp32, []>>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}})] {
+            tensor<int32, []> var_13 = const()[name = tensor<string, []>("op_13"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_19 = const()[name = tensor<string, []>("op_19"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_55 = const()[name = tensor<string, []>("op_55"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_1_interleave_0 = const()[name = tensor<string, []>("x_eps_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_1_to_fp16 = const()[name = tensor<string, []>("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_1_cast_fp16 = concat(axis = var_52, interleave = x_eps_1_interleave_0, values = (x, eps_chan_1_to_fp16))[name = tensor<string, []>("x_eps_1_cast_fp16")];
+            tensor<int32, [1]> norm_x_1_axes_0 = const()[name = tensor<string, []>("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_55, x = x_eps_1_cast_fp16)[name = tensor<string, []>("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = tensor<string, []>("x_normed_1_cast_fp16")];
+            tensor<fp16, []> var_79_to_fp16 = const()[name = tensor<string, []>("op_79_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_79_to_fp16)[name = tensor<string, []>("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(256)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = tensor<string, []>("x_5_cast_fp16")];
+            tensor<int32, [4]> var_100 = const()[name = tensor<string, []>("op_100"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_1_cast_fp16 = reshape(shape = var_100, x = x_5_cast_fp16)[name = tensor<string, []>("input_1_cast_fp16")];
+            tensor<int32, [2]> var_103 = const()[name = tensor<string, []>("op_103"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_105 = const()[name = tensor<string, []>("op_105"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_1_pad_type_0 = const()[name = tensor<string, []>("q_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_1_pad_0 = const()[name = tensor<string, []>("q_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6464)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_1_cast_fp16 = conv(dilations = var_105, groups = var_52, pad = q_1_pad_0, pad_type = q_1_pad_type_0, strides = var_103, weight = blocks_0_attn_q_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("q_1_cast_fp16")];
+            tensor<int32, [2]> var_109 = const()[name = tensor<string, []>("op_109"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_111 = const()[name = tensor<string, []>("op_111"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_1_pad_type_0 = const()[name = tensor<string, []>("k_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_1_pad_0 = const()[name = tensor<string, []>("k_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18880896)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_1_cast_fp16 = conv(dilations = var_111, groups = var_52, pad = k_1_pad_0, pad_type = k_1_pad_type_0, strides = var_109, weight = blocks_0_attn_k_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("k_1_cast_fp16")];
+            tensor<int32, [2]> var_115 = const()[name = tensor<string, []>("op_115"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_117 = const()[name = tensor<string, []>("op_117"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_1_pad_type_0 = const()[name = tensor<string, []>("v_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_1_pad_0 = const()[name = tensor<string, []>("v_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25172416)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_1_cast_fp16 = conv(dilations = var_117, groups = var_52, pad = v_1_pad_0, pad_type = v_1_pad_type_0, strides = var_115, weight = blocks_0_attn_v_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("v_1_cast_fp16")];
+            tensor<int32, [4]> var_120 = const()[name = tensor<string, []>("op_120"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_3_cast_fp16 = reshape(shape = var_120, x = q_1_cast_fp16)[name = tensor<string, []>("q_3_cast_fp16")];
+            tensor<int32, [4]> var_122 = const()[name = tensor<string, []>("op_122"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_3_cast_fp16 = reshape(shape = var_122, x = k_1_cast_fp16)[name = tensor<string, []>("k_3_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = tensor<string, []>("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = tensor<string, []>("op_136_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = tensor<string, []>("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = tensor<string, []>("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = tensor<string, []>("op_142_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = tensor<string, []>("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_142_cast_fp16")];
+            tensor<fp16, []> const_10_promoted_to_fp16 = const()[name = tensor<string, []>("const_10_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_10_promoted_to_fp16)[name = tensor<string, []>("op_144_cast_fp16")];
+            tensor<bool, []> rotated_1_interleave_0 = const()[name = tensor<string, []>("rotated_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_1_cast_fp16 = concat(axis = var_17, interleave = rotated_1_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = tensor<string, []>("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_147_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = tensor<string, []>("op_147_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_148_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = tensor<string, []>("op_148_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_1_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = tensor<string, []>("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_161_begin_0 = const()[name = tensor<string, []>("op_161_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_161_end_0 = const()[name = tensor<string, []>("op_161_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_161_end_mask_0 = const()[name = tensor<string, []>("op_161_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_161_cast_fp16 = slice_by_index(begin = var_161_begin_0, end = var_161_end_0, end_mask = var_161_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_161_cast_fp16")];
+            tensor<int32, [4]> var_167_begin_0 = const()[name = tensor<string, []>("op_167_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_167_end_0 = const()[name = tensor<string, []>("op_167_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_167_end_mask_0 = const()[name = tensor<string, []>("op_167_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_167_cast_fp16 = slice_by_index(begin = var_167_begin_0, end = var_167_end_0, end_mask = var_167_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_167_cast_fp16")];
+            tensor<fp16, []> const_12_promoted_to_fp16 = const()[name = tensor<string, []>("const_12_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_169_cast_fp16 = mul(x = var_167_cast_fp16, y = const_12_promoted_to_fp16)[name = tensor<string, []>("op_169_cast_fp16")];
+            tensor<bool, []> rotated_3_interleave_0 = const()[name = tensor<string, []>("rotated_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_3_cast_fp16 = concat(axis = var_17, interleave = rotated_3_interleave_0, values = (var_169_cast_fp16, var_161_cast_fp16))[name = tensor<string, []>("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_172_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = tensor<string, []>("op_172_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_173_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = tensor<string, []>("op_173_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_3_cast_fp16 = add(x = var_172_cast_fp16, y = var_173_cast_fp16)[name = tensor<string, []>("roped_3_cast_fp16")];
+            tensor<int32, [4]> var_176 = const()[name = tensor<string, []>("op_176"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_7_cast_fp16 = reshape(shape = var_176, x = roped_3_cast_fp16)[name = tensor<string, []>("k_7_cast_fp16")];
+            tensor<int32, [4]> var_178 = const()[name = tensor<string, []>("op_178"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_0 = reshape(shape = var_178, x = v_1_cast_fp16)[name = tensor<string, []>("new_v_cache_0_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_11_interleave_0 = const()[name = tensor<string, []>("k_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_0 = transpose(perm = k_9_perm_0, x = k_7_cast_fp16)[name = tensor<string, []>("transpose_1")];
+            tensor<fp16, [1, 512, 1, 1024]> k_11_cast_fp16 = concat(axis = var_19, interleave = k_11_interleave_0, values = (k_cache_0, new_k_cache_0))[name = tensor<string, []>("k_11_cast_fp16")];
+            tensor<bool, []> v_7_interleave_0 = const()[name = tensor<string, []>("v_7_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_7_cast_fp16 = concat(axis = var_13, interleave = v_7_interleave_0, values = (v_cache_0, new_v_cache_0))[name = tensor<string, []>("v_7_cast_fp16")];
+            tensor<int32, [4]> var_186 = const()[name = tensor<string, []>("op_186"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_7_cast_fp16 = reshape(shape = var_186, x = roped_1_cast_fp16)[name = tensor<string, []>("q_7_cast_fp16")];
+            tensor<int32, [4]> var_191_begin_0 = const()[name = tensor<string, []>("op_191_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_191_end_0 = const()[name = tensor<string, []>("op_191_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_191_end_mask_0 = const()[name = tensor<string, []>("op_191_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_191_cast_fp16 = slice_by_index(begin = var_191_begin_0, end = var_191_end_0, end_mask = var_191_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_191_cast_fp16")];
+            tensor<int32, [4]> var_195_begin_0 = const()[name = tensor<string, []>("op_195_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_195_end_0 = const()[name = tensor<string, []>("op_195_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_195_end_mask_0 = const()[name = tensor<string, []>("op_195_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_195_cast_fp16 = slice_by_index(begin = var_195_begin_0, end = var_195_end_0, end_mask = var_195_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_195_cast_fp16")];
+            tensor<int32, [4]> var_199_begin_0 = const()[name = tensor<string, []>("op_199_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_199_end_0 = const()[name = tensor<string, []>("op_199_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_199_end_mask_0 = const()[name = tensor<string, []>("op_199_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_199_cast_fp16 = slice_by_index(begin = var_199_begin_0, end = var_199_end_0, end_mask = var_199_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_199_cast_fp16")];
+            tensor<int32, [4]> var_203_begin_0 = const()[name = tensor<string, []>("op_203_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_203_end_0 = const()[name = tensor<string, []>("op_203_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_203_end_mask_0 = const()[name = tensor<string, []>("op_203_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_203_cast_fp16 = slice_by_index(begin = var_203_begin_0, end = var_203_end_0, end_mask = var_203_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_203_cast_fp16")];
+            tensor<int32, [4]> var_207_begin_0 = const()[name = tensor<string, []>("op_207_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_207_end_0 = const()[name = tensor<string, []>("op_207_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_207_end_mask_0 = const()[name = tensor<string, []>("op_207_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_207_cast_fp16 = slice_by_index(begin = var_207_begin_0, end = var_207_end_0, end_mask = var_207_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_207_cast_fp16")];
+            tensor<int32, [4]> var_211_begin_0 = const()[name = tensor<string, []>("op_211_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_211_end_0 = const()[name = tensor<string, []>("op_211_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_211_end_mask_0 = const()[name = tensor<string, []>("op_211_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_211_cast_fp16 = slice_by_index(begin = var_211_begin_0, end = var_211_end_0, end_mask = var_211_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_211_cast_fp16")];
+            tensor<int32, [4]> var_215_begin_0 = const()[name = tensor<string, []>("op_215_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_215_end_0 = const()[name = tensor<string, []>("op_215_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_215_end_mask_0 = const()[name = tensor<string, []>("op_215_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_215_cast_fp16 = slice_by_index(begin = var_215_begin_0, end = var_215_end_0, end_mask = var_215_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_215_cast_fp16")];
+            tensor<int32, [4]> var_219_begin_0 = const()[name = tensor<string, []>("op_219_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_219_end_0 = const()[name = tensor<string, []>("op_219_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_219_end_mask_0 = const()[name = tensor<string, []>("op_219_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_219_cast_fp16 = slice_by_index(begin = var_219_begin_0, end = var_219_end_0, end_mask = var_219_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_219_cast_fp16")];
+            tensor<int32, [4]> var_223_begin_0 = const()[name = tensor<string, []>("op_223_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_223_end_0 = const()[name = tensor<string, []>("op_223_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_223_end_mask_0 = const()[name = tensor<string, []>("op_223_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_223_cast_fp16 = slice_by_index(begin = var_223_begin_0, end = var_223_end_0, end_mask = var_223_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_223_cast_fp16")];
+            tensor<int32, [4]> var_227_begin_0 = const()[name = tensor<string, []>("op_227_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_227_end_0 = const()[name = tensor<string, []>("op_227_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_227_end_mask_0 = const()[name = tensor<string, []>("op_227_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_227_cast_fp16 = slice_by_index(begin = var_227_begin_0, end = var_227_end_0, end_mask = var_227_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_227_cast_fp16")];
+            tensor<int32, [4]> var_231_begin_0 = const()[name = tensor<string, []>("op_231_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_231_end_0 = const()[name = tensor<string, []>("op_231_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_231_end_mask_0 = const()[name = tensor<string, []>("op_231_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_231_cast_fp16 = slice_by_index(begin = var_231_begin_0, end = var_231_end_0, end_mask = var_231_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_231_cast_fp16")];
+            tensor<int32, [4]> var_235_begin_0 = const()[name = tensor<string, []>("op_235_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_235_end_0 = const()[name = tensor<string, []>("op_235_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_235_end_mask_0 = const()[name = tensor<string, []>("op_235_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_235_cast_fp16 = slice_by_index(begin = var_235_begin_0, end = var_235_end_0, end_mask = var_235_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_235_cast_fp16")];
+            tensor<int32, [4]> var_239_begin_0 = const()[name = tensor<string, []>("op_239_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_239_end_0 = const()[name = tensor<string, []>("op_239_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_239_end_mask_0 = const()[name = tensor<string, []>("op_239_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_239_cast_fp16 = slice_by_index(begin = var_239_begin_0, end = var_239_end_0, end_mask = var_239_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_239_cast_fp16")];
+            tensor<int32, [4]> var_243_begin_0 = const()[name = tensor<string, []>("op_243_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_243_end_0 = const()[name = tensor<string, []>("op_243_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_243_end_mask_0 = const()[name = tensor<string, []>("op_243_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_243_cast_fp16 = slice_by_index(begin = var_243_begin_0, end = var_243_end_0, end_mask = var_243_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_243_cast_fp16")];
+            tensor<int32, [4]> var_247_begin_0 = const()[name = tensor<string, []>("op_247_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_247_end_0 = const()[name = tensor<string, []>("op_247_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_247_end_mask_0 = const()[name = tensor<string, []>("op_247_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_247_cast_fp16 = slice_by_index(begin = var_247_begin_0, end = var_247_end_0, end_mask = var_247_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_247_cast_fp16")];
+            tensor<int32, [4]> var_251_begin_0 = const()[name = tensor<string, []>("op_251_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_251_end_0 = const()[name = tensor<string, []>("op_251_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_251_end_mask_0 = const()[name = tensor<string, []>("op_251_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_251_cast_fp16 = slice_by_index(begin = var_251_begin_0, end = var_251_end_0, end_mask = var_251_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_251_cast_fp16")];
+            tensor<int32, [4]> var_255_begin_0 = const()[name = tensor<string, []>("op_255_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_255_end_0 = const()[name = tensor<string, []>("op_255_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_255_end_mask_0 = const()[name = tensor<string, []>("op_255_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_255_cast_fp16 = slice_by_index(begin = var_255_begin_0, end = var_255_end_0, end_mask = var_255_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_255_cast_fp16")];
+            tensor<int32, [4]> var_259_begin_0 = const()[name = tensor<string, []>("op_259_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_259_end_0 = const()[name = tensor<string, []>("op_259_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_259_end_mask_0 = const()[name = tensor<string, []>("op_259_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_259_cast_fp16 = slice_by_index(begin = var_259_begin_0, end = var_259_end_0, end_mask = var_259_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_259_cast_fp16")];
+            tensor<int32, [4]> var_263_begin_0 = const()[name = tensor<string, []>("op_263_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_263_end_0 = const()[name = tensor<string, []>("op_263_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_263_end_mask_0 = const()[name = tensor<string, []>("op_263_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_263_cast_fp16 = slice_by_index(begin = var_263_begin_0, end = var_263_end_0, end_mask = var_263_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_263_cast_fp16")];
+            tensor<int32, [4]> var_267_begin_0 = const()[name = tensor<string, []>("op_267_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_267_end_0 = const()[name = tensor<string, []>("op_267_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_267_end_mask_0 = const()[name = tensor<string, []>("op_267_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_267_cast_fp16 = slice_by_index(begin = var_267_begin_0, end = var_267_end_0, end_mask = var_267_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_267_cast_fp16")];
+            tensor<int32, [4]> var_271_begin_0 = const()[name = tensor<string, []>("op_271_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_271_end_0 = const()[name = tensor<string, []>("op_271_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_271_end_mask_0 = const()[name = tensor<string, []>("op_271_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_271_cast_fp16 = slice_by_index(begin = var_271_begin_0, end = var_271_end_0, end_mask = var_271_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_271_cast_fp16")];
+            tensor<int32, [4]> var_275_begin_0 = const()[name = tensor<string, []>("op_275_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_275_end_0 = const()[name = tensor<string, []>("op_275_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_275_end_mask_0 = const()[name = tensor<string, []>("op_275_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_275_cast_fp16 = slice_by_index(begin = var_275_begin_0, end = var_275_end_0, end_mask = var_275_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_275_cast_fp16")];
+            tensor<int32, [4]> var_279_begin_0 = const()[name = tensor<string, []>("op_279_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_279_end_0 = const()[name = tensor<string, []>("op_279_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_279_end_mask_0 = const()[name = tensor<string, []>("op_279_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_279_cast_fp16 = slice_by_index(begin = var_279_begin_0, end = var_279_end_0, end_mask = var_279_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_279_cast_fp16")];
+            tensor<int32, [4]> var_283_begin_0 = const()[name = tensor<string, []>("op_283_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_283_end_0 = const()[name = tensor<string, []>("op_283_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_283_end_mask_0 = const()[name = tensor<string, []>("op_283_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_283_cast_fp16 = slice_by_index(begin = var_283_begin_0, end = var_283_end_0, end_mask = var_283_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_283_cast_fp16")];
+            tensor<int32, [4]> var_289_begin_0 = const()[name = tensor<string, []>("op_289_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_289_end_0 = const()[name = tensor<string, []>("op_289_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_289_end_mask_0 = const()[name = tensor<string, []>("op_289_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_289_cast_fp16 = slice_by_index(begin = var_289_begin_0, end = var_289_end_0, end_mask = var_289_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_289_cast_fp16")];
+            tensor<int32, [4]> var_301_begin_0 = const()[name = tensor<string, []>("op_301_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_301_end_0 = const()[name = tensor<string, []>("op_301_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_301_end_mask_0 = const()[name = tensor<string, []>("op_301_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_301_cast_fp16 = slice_by_index(begin = var_301_begin_0, end = var_301_end_0, end_mask = var_301_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_301_cast_fp16")];
+            tensor<int32, [4]> var_313_begin_0 = const()[name = tensor<string, []>("op_313_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_313_end_0 = const()[name = tensor<string, []>("op_313_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_313_end_mask_0 = const()[name = tensor<string, []>("op_313_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_313_cast_fp16 = slice_by_index(begin = var_313_begin_0, end = var_313_end_0, end_mask = var_313_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_313_cast_fp16")];
+            tensor<int32, [4]> var_325_begin_0 = const()[name = tensor<string, []>("op_325_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_325_end_0 = const()[name = tensor<string, []>("op_325_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_325_end_mask_0 = const()[name = tensor<string, []>("op_325_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_325_cast_fp16 = slice_by_index(begin = var_325_begin_0, end = var_325_end_0, end_mask = var_325_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_325_cast_fp16")];
+            tensor<int32, [4]> var_337_begin_0 = const()[name = tensor<string, []>("op_337_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_337_end_0 = const()[name = tensor<string, []>("op_337_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_337_end_mask_0 = const()[name = tensor<string, []>("op_337_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_337_cast_fp16 = slice_by_index(begin = var_337_begin_0, end = var_337_end_0, end_mask = var_337_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_337_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = tensor<string, []>("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = tensor<string, []>("op_349_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = tensor<string, []>("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_349_cast_fp16")];
+            tensor<int32, [4]> var_361_begin_0 = const()[name = tensor<string, []>("op_361_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_361_end_0 = const()[name = tensor<string, []>("op_361_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_361_end_mask_0 = const()[name = tensor<string, []>("op_361_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_361_cast_fp16 = slice_by_index(begin = var_361_begin_0, end = var_361_end_0, end_mask = var_361_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_361_cast_fp16")];
+            tensor<int32, [4]> var_373_begin_0 = const()[name = tensor<string, []>("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_373_end_0 = const()[name = tensor<string, []>("op_373_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_373_end_mask_0 = const()[name = tensor<string, []>("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_373_cast_fp16")];
+            tensor<int32, [4]> var_383_begin_0 = const()[name = tensor<string, []>("op_383_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_383_end_0 = const()[name = tensor<string, []>("op_383_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_383_end_mask_0 = const()[name = tensor<string, []>("op_383_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_383_cast_fp16 = slice_by_index(begin = var_383_begin_0, end = var_383_end_0, end_mask = var_383_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_383_cast_fp16")];
+            tensor<int32, [4]> var_395_begin_0 = const()[name = tensor<string, []>("op_395_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_395_end_0 = const()[name = tensor<string, []>("op_395_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_395_end_mask_0 = const()[name = tensor<string, []>("op_395_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_395_cast_fp16 = slice_by_index(begin = var_395_begin_0, end = var_395_end_0, end_mask = var_395_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_395_cast_fp16")];
+            tensor<int32, [4]> var_407_begin_0 = const()[name = tensor<string, []>("op_407_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_407_end_0 = const()[name = tensor<string, []>("op_407_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_407_end_mask_0 = const()[name = tensor<string, []>("op_407_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_407_cast_fp16 = slice_by_index(begin = var_407_begin_0, end = var_407_end_0, end_mask = var_407_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_407_cast_fp16")];
+            tensor<int32, [4]> var_419_begin_0 = const()[name = tensor<string, []>("op_419_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_419_end_0 = const()[name = tensor<string, []>("op_419_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_419_end_mask_0 = const()[name = tensor<string, []>("op_419_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_419_cast_fp16 = slice_by_index(begin = var_419_begin_0, end = var_419_end_0, end_mask = var_419_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_419_cast_fp16")];
+            tensor<int32, [4]> var_431_begin_0 = const()[name = tensor<string, []>("op_431_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_431_end_0 = const()[name = tensor<string, []>("op_431_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_431_end_mask_0 = const()[name = tensor<string, []>("op_431_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_431_cast_fp16 = slice_by_index(begin = var_431_begin_0, end = var_431_end_0, end_mask = var_431_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_431_cast_fp16")];
+            tensor<int32, [4]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_443_cast_fp16 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_443_cast_fp16")];
+            tensor<int32, [4]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_455_cast_fp16 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_455_cast_fp16")];
+            tensor<int32, [4]> var_467_begin_0 = const()[name = tensor<string, []>("op_467_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_467_end_0 = const()[name = tensor<string, []>("op_467_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_467_end_mask_0 = const()[name = tensor<string, []>("op_467_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_467_cast_fp16 = slice_by_index(begin = var_467_begin_0, end = var_467_end_0, end_mask = var_467_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_467_cast_fp16")];
+            tensor<string, []> var_479_equation_0 = const()[name = tensor<string, []>("op_479_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_479_cast_fp16 = einsum(equation = var_479_equation_0, values = (var_289_cast_fp16, var_191_cast_fp16))[name = tensor<string, []>("op_479_cast_fp16")];
+            tensor<fp16, []> var_480_to_fp16 = const()[name = tensor<string, []>("op_480_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_481_cast_fp16 = mul(x = var_479_cast_fp16, y = var_480_to_fp16)[name = tensor<string, []>("op_481_cast_fp16")];
+            tensor<string, []> var_483_equation_0 = const()[name = tensor<string, []>("op_483_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_483_cast_fp16 = einsum(equation = var_483_equation_0, values = (var_289_cast_fp16, var_195_cast_fp16))[name = tensor<string, []>("op_483_cast_fp16")];
+            tensor<fp16, []> var_484_to_fp16 = const()[name = tensor<string, []>("op_484_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_485_cast_fp16 = mul(x = var_483_cast_fp16, y = var_484_to_fp16)[name = tensor<string, []>("op_485_cast_fp16")];
+            tensor<string, []> var_487_equation_0 = const()[name = tensor<string, []>("op_487_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_487_cast_fp16 = einsum(equation = var_487_equation_0, values = (var_289_cast_fp16, var_199_cast_fp16))[name = tensor<string, []>("op_487_cast_fp16")];
+            tensor<fp16, []> var_488_to_fp16 = const()[name = tensor<string, []>("op_488_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_489_cast_fp16 = mul(x = var_487_cast_fp16, y = var_488_to_fp16)[name = tensor<string, []>("op_489_cast_fp16")];
+            tensor<string, []> var_491_equation_0 = const()[name = tensor<string, []>("op_491_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_491_cast_fp16 = einsum(equation = var_491_equation_0, values = (var_301_cast_fp16, var_203_cast_fp16))[name = tensor<string, []>("op_491_cast_fp16")];
+            tensor<fp16, []> var_492_to_fp16 = const()[name = tensor<string, []>("op_492_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_493_cast_fp16 = mul(x = var_491_cast_fp16, y = var_492_to_fp16)[name = tensor<string, []>("op_493_cast_fp16")];
+            tensor<string, []> var_495_equation_0 = const()[name = tensor<string, []>("op_495_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_495_cast_fp16 = einsum(equation = var_495_equation_0, values = (var_301_cast_fp16, var_207_cast_fp16))[name = tensor<string, []>("op_495_cast_fp16")];
+            tensor<fp16, []> var_496_to_fp16 = const()[name = tensor<string, []>("op_496_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_497_cast_fp16 = mul(x = var_495_cast_fp16, y = var_496_to_fp16)[name = tensor<string, []>("op_497_cast_fp16")];
+            tensor<string, []> var_499_equation_0 = const()[name = tensor<string, []>("op_499_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_499_cast_fp16 = einsum(equation = var_499_equation_0, values = (var_301_cast_fp16, var_211_cast_fp16))[name = tensor<string, []>("op_499_cast_fp16")];
+            tensor<fp16, []> var_500_to_fp16 = const()[name = tensor<string, []>("op_500_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_501_cast_fp16 = mul(x = var_499_cast_fp16, y = var_500_to_fp16)[name = tensor<string, []>("op_501_cast_fp16")];
+            tensor<string, []> var_503_equation_0 = const()[name = tensor<string, []>("op_503_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_503_cast_fp16 = einsum(equation = var_503_equation_0, values = (var_313_cast_fp16, var_215_cast_fp16))[name = tensor<string, []>("op_503_cast_fp16")];
+            tensor<fp16, []> var_504_to_fp16 = const()[name = tensor<string, []>("op_504_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_505_cast_fp16 = mul(x = var_503_cast_fp16, y = var_504_to_fp16)[name = tensor<string, []>("op_505_cast_fp16")];
+            tensor<string, []> var_507_equation_0 = const()[name = tensor<string, []>("op_507_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_507_cast_fp16 = einsum(equation = var_507_equation_0, values = (var_313_cast_fp16, var_219_cast_fp16))[name = tensor<string, []>("op_507_cast_fp16")];
+            tensor<fp16, []> var_508_to_fp16 = const()[name = tensor<string, []>("op_508_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_509_cast_fp16 = mul(x = var_507_cast_fp16, y = var_508_to_fp16)[name = tensor<string, []>("op_509_cast_fp16")];
+            tensor<string, []> var_511_equation_0 = const()[name = tensor<string, []>("op_511_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_511_cast_fp16 = einsum(equation = var_511_equation_0, values = (var_313_cast_fp16, var_223_cast_fp16))[name = tensor<string, []>("op_511_cast_fp16")];
+            tensor<fp16, []> var_512_to_fp16 = const()[name = tensor<string, []>("op_512_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_513_cast_fp16 = mul(x = var_511_cast_fp16, y = var_512_to_fp16)[name = tensor<string, []>("op_513_cast_fp16")];
+            tensor<string, []> var_515_equation_0 = const()[name = tensor<string, []>("op_515_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_515_cast_fp16 = einsum(equation = var_515_equation_0, values = (var_325_cast_fp16, var_227_cast_fp16))[name = tensor<string, []>("op_515_cast_fp16")];
+            tensor<fp16, []> var_516_to_fp16 = const()[name = tensor<string, []>("op_516_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_517_cast_fp16 = mul(x = var_515_cast_fp16, y = var_516_to_fp16)[name = tensor<string, []>("op_517_cast_fp16")];
+            tensor<string, []> var_519_equation_0 = const()[name = tensor<string, []>("op_519_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_519_cast_fp16 = einsum(equation = var_519_equation_0, values = (var_325_cast_fp16, var_231_cast_fp16))[name = tensor<string, []>("op_519_cast_fp16")];
+            tensor<fp16, []> var_520_to_fp16 = const()[name = tensor<string, []>("op_520_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_521_cast_fp16 = mul(x = var_519_cast_fp16, y = var_520_to_fp16)[name = tensor<string, []>("op_521_cast_fp16")];
+            tensor<string, []> var_523_equation_0 = const()[name = tensor<string, []>("op_523_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_523_cast_fp16 = einsum(equation = var_523_equation_0, values = (var_325_cast_fp16, var_235_cast_fp16))[name = tensor<string, []>("op_523_cast_fp16")];
+            tensor<fp16, []> var_524_to_fp16 = const()[name = tensor<string, []>("op_524_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_525_cast_fp16 = mul(x = var_523_cast_fp16, y = var_524_to_fp16)[name = tensor<string, []>("op_525_cast_fp16")];
+            tensor<string, []> var_527_equation_0 = const()[name = tensor<string, []>("op_527_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_527_cast_fp16 = einsum(equation = var_527_equation_0, values = (var_337_cast_fp16, var_239_cast_fp16))[name = tensor<string, []>("op_527_cast_fp16")];
+            tensor<fp16, []> var_528_to_fp16 = const()[name = tensor<string, []>("op_528_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_529_cast_fp16 = mul(x = var_527_cast_fp16, y = var_528_to_fp16)[name = tensor<string, []>("op_529_cast_fp16")];
+            tensor<string, []> var_531_equation_0 = const()[name = tensor<string, []>("op_531_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_531_cast_fp16 = einsum(equation = var_531_equation_0, values = (var_337_cast_fp16, var_243_cast_fp16))[name = tensor<string, []>("op_531_cast_fp16")];
+            tensor<fp16, []> var_532_to_fp16 = const()[name = tensor<string, []>("op_532_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_533_cast_fp16 = mul(x = var_531_cast_fp16, y = var_532_to_fp16)[name = tensor<string, []>("op_533_cast_fp16")];
+            tensor<string, []> var_535_equation_0 = const()[name = tensor<string, []>("op_535_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_535_cast_fp16 = einsum(equation = var_535_equation_0, values = (var_337_cast_fp16, var_247_cast_fp16))[name = tensor<string, []>("op_535_cast_fp16")];
+            tensor<fp16, []> var_536_to_fp16 = const()[name = tensor<string, []>("op_536_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_537_cast_fp16 = mul(x = var_535_cast_fp16, y = var_536_to_fp16)[name = tensor<string, []>("op_537_cast_fp16")];
+            tensor<string, []> var_539_equation_0 = const()[name = tensor<string, []>("op_539_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_539_cast_fp16 = einsum(equation = var_539_equation_0, values = (var_349_cast_fp16, var_251_cast_fp16))[name = tensor<string, []>("op_539_cast_fp16")];
+            tensor<fp16, []> var_540_to_fp16 = const()[name = tensor<string, []>("op_540_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_541_cast_fp16 = mul(x = var_539_cast_fp16, y = var_540_to_fp16)[name = tensor<string, []>("op_541_cast_fp16")];
+            tensor<string, []> var_543_equation_0 = const()[name = tensor<string, []>("op_543_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_543_cast_fp16 = einsum(equation = var_543_equation_0, values = (var_349_cast_fp16, var_255_cast_fp16))[name = tensor<string, []>("op_543_cast_fp16")];
+            tensor<fp16, []> var_544_to_fp16 = const()[name = tensor<string, []>("op_544_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_545_cast_fp16 = mul(x = var_543_cast_fp16, y = var_544_to_fp16)[name = tensor<string, []>("op_545_cast_fp16")];
+            tensor<string, []> var_547_equation_0 = const()[name = tensor<string, []>("op_547_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_547_cast_fp16 = einsum(equation = var_547_equation_0, values = (var_349_cast_fp16, var_259_cast_fp16))[name = tensor<string, []>("op_547_cast_fp16")];
+            tensor<fp16, []> var_548_to_fp16 = const()[name = tensor<string, []>("op_548_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_549_cast_fp16 = mul(x = var_547_cast_fp16, y = var_548_to_fp16)[name = tensor<string, []>("op_549_cast_fp16")];
+            tensor<string, []> var_551_equation_0 = const()[name = tensor<string, []>("op_551_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_551_cast_fp16 = einsum(equation = var_551_equation_0, values = (var_361_cast_fp16, var_263_cast_fp16))[name = tensor<string, []>("op_551_cast_fp16")];
+            tensor<fp16, []> var_552_to_fp16 = const()[name = tensor<string, []>("op_552_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_553_cast_fp16 = mul(x = var_551_cast_fp16, y = var_552_to_fp16)[name = tensor<string, []>("op_553_cast_fp16")];
+            tensor<string, []> var_555_equation_0 = const()[name = tensor<string, []>("op_555_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_555_cast_fp16 = einsum(equation = var_555_equation_0, values = (var_361_cast_fp16, var_267_cast_fp16))[name = tensor<string, []>("op_555_cast_fp16")];
+            tensor<fp16, []> var_556_to_fp16 = const()[name = tensor<string, []>("op_556_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_557_cast_fp16 = mul(x = var_555_cast_fp16, y = var_556_to_fp16)[name = tensor<string, []>("op_557_cast_fp16")];
+            tensor<string, []> var_559_equation_0 = const()[name = tensor<string, []>("op_559_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_559_cast_fp16 = einsum(equation = var_559_equation_0, values = (var_361_cast_fp16, var_271_cast_fp16))[name = tensor<string, []>("op_559_cast_fp16")];
+            tensor<fp16, []> var_560_to_fp16 = const()[name = tensor<string, []>("op_560_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_561_cast_fp16 = mul(x = var_559_cast_fp16, y = var_560_to_fp16)[name = tensor<string, []>("op_561_cast_fp16")];
+            tensor<string, []> var_563_equation_0 = const()[name = tensor<string, []>("op_563_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_563_cast_fp16 = einsum(equation = var_563_equation_0, values = (var_373_cast_fp16, var_275_cast_fp16))[name = tensor<string, []>("op_563_cast_fp16")];
+            tensor<fp16, []> var_564_to_fp16 = const()[name = tensor<string, []>("op_564_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_565_cast_fp16 = mul(x = var_563_cast_fp16, y = var_564_to_fp16)[name = tensor<string, []>("op_565_cast_fp16")];
+            tensor<string, []> var_567_equation_0 = const()[name = tensor<string, []>("op_567_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_567_cast_fp16 = einsum(equation = var_567_equation_0, values = (var_373_cast_fp16, var_279_cast_fp16))[name = tensor<string, []>("op_567_cast_fp16")];
+            tensor<fp16, []> var_568_to_fp16 = const()[name = tensor<string, []>("op_568_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_569_cast_fp16 = mul(x = var_567_cast_fp16, y = var_568_to_fp16)[name = tensor<string, []>("op_569_cast_fp16")];
+            tensor<string, []> var_571_equation_0 = const()[name = tensor<string, []>("op_571_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_571_cast_fp16 = einsum(equation = var_571_equation_0, values = (var_373_cast_fp16, var_283_cast_fp16))[name = tensor<string, []>("op_571_cast_fp16")];
+            tensor<fp16, []> var_572_to_fp16 = const()[name = tensor<string, []>("op_572_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_573_cast_fp16 = mul(x = var_571_cast_fp16, y = var_572_to_fp16)[name = tensor<string, []>("op_573_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_1_cast_fp16 = add(x = var_481_cast_fp16, y = mask)[name = tensor<string, []>("aw_1_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_3_cast_fp16 = add(x = var_485_cast_fp16, y = mask)[name = tensor<string, []>("aw_3_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_5_cast_fp16 = add(x = var_489_cast_fp16, y = mask)[name = tensor<string, []>("aw_5_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_7_cast_fp16 = add(x = var_493_cast_fp16, y = mask)[name = tensor<string, []>("aw_7_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_9_cast_fp16 = add(x = var_497_cast_fp16, y = mask)[name = tensor<string, []>("aw_9_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_11_cast_fp16 = add(x = var_501_cast_fp16, y = mask)[name = tensor<string, []>("aw_11_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_13_cast_fp16 = add(x = var_505_cast_fp16, y = mask)[name = tensor<string, []>("aw_13_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_15_cast_fp16 = add(x = var_509_cast_fp16, y = mask)[name = tensor<string, []>("aw_15_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_17_cast_fp16 = add(x = var_513_cast_fp16, y = mask)[name = tensor<string, []>("aw_17_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_19_cast_fp16 = add(x = var_517_cast_fp16, y = mask)[name = tensor<string, []>("aw_19_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_21_cast_fp16 = add(x = var_521_cast_fp16, y = mask)[name = tensor<string, []>("aw_21_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_23_cast_fp16 = add(x = var_525_cast_fp16, y = mask)[name = tensor<string, []>("aw_23_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_25_cast_fp16 = add(x = var_529_cast_fp16, y = mask)[name = tensor<string, []>("aw_25_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_27_cast_fp16 = add(x = var_533_cast_fp16, y = mask)[name = tensor<string, []>("aw_27_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_29_cast_fp16 = add(x = var_537_cast_fp16, y = mask)[name = tensor<string, []>("aw_29_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_31_cast_fp16 = add(x = var_541_cast_fp16, y = mask)[name = tensor<string, []>("aw_31_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_33_cast_fp16 = add(x = var_545_cast_fp16, y = mask)[name = tensor<string, []>("aw_33_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_35_cast_fp16 = add(x = var_549_cast_fp16, y = mask)[name = tensor<string, []>("aw_35_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_37_cast_fp16 = add(x = var_553_cast_fp16, y = mask)[name = tensor<string, []>("aw_37_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_39_cast_fp16 = add(x = var_557_cast_fp16, y = mask)[name = tensor<string, []>("aw_39_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_41_cast_fp16 = add(x = var_561_cast_fp16, y = mask)[name = tensor<string, []>("aw_41_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_43_cast_fp16 = add(x = var_565_cast_fp16, y = mask)[name = tensor<string, []>("aw_43_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_45_cast_fp16 = add(x = var_569_cast_fp16, y = mask)[name = tensor<string, []>("aw_45_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_47_cast_fp16 = add(x = var_573_cast_fp16, y = mask)[name = tensor<string, []>("aw_47_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_598_cast_fp16 = softmax(axis = var_52, x = aw_1_cast_fp16)[name = tensor<string, []>("op_598_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_599_cast_fp16 = softmax(axis = var_52, x = aw_3_cast_fp16)[name = tensor<string, []>("op_599_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_600_cast_fp16 = softmax(axis = var_52, x = aw_5_cast_fp16)[name = tensor<string, []>("op_600_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_601_cast_fp16 = softmax(axis = var_52, x = aw_7_cast_fp16)[name = tensor<string, []>("op_601_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_602_cast_fp16 = softmax(axis = var_52, x = aw_9_cast_fp16)[name = tensor<string, []>("op_602_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_603_cast_fp16 = softmax(axis = var_52, x = aw_11_cast_fp16)[name = tensor<string, []>("op_603_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_604_cast_fp16 = softmax(axis = var_52, x = aw_13_cast_fp16)[name = tensor<string, []>("op_604_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_605_cast_fp16 = softmax(axis = var_52, x = aw_15_cast_fp16)[name = tensor<string, []>("op_605_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_606_cast_fp16 = softmax(axis = var_52, x = aw_17_cast_fp16)[name = tensor<string, []>("op_606_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_607_cast_fp16 = softmax(axis = var_52, x = aw_19_cast_fp16)[name = tensor<string, []>("op_607_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_608_cast_fp16 = softmax(axis = var_52, x = aw_21_cast_fp16)[name = tensor<string, []>("op_608_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_609_cast_fp16 = softmax(axis = var_52, x = aw_23_cast_fp16)[name = tensor<string, []>("op_609_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_610_cast_fp16 = softmax(axis = var_52, x = aw_25_cast_fp16)[name = tensor<string, []>("op_610_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_611_cast_fp16 = softmax(axis = var_52, x = aw_27_cast_fp16)[name = tensor<string, []>("op_611_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_612_cast_fp16 = softmax(axis = var_52, x = aw_29_cast_fp16)[name = tensor<string, []>("op_612_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_613_cast_fp16 = softmax(axis = var_52, x = aw_31_cast_fp16)[name = tensor<string, []>("op_613_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_614_cast_fp16 = softmax(axis = var_52, x = aw_33_cast_fp16)[name = tensor<string, []>("op_614_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_615_cast_fp16 = softmax(axis = var_52, x = aw_35_cast_fp16)[name = tensor<string, []>("op_615_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_616_cast_fp16 = softmax(axis = var_52, x = aw_37_cast_fp16)[name = tensor<string, []>("op_616_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_617_cast_fp16 = softmax(axis = var_52, x = aw_39_cast_fp16)[name = tensor<string, []>("op_617_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_618_cast_fp16 = softmax(axis = var_52, x = aw_41_cast_fp16)[name = tensor<string, []>("op_618_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_619_cast_fp16 = softmax(axis = var_52, x = aw_43_cast_fp16)[name = tensor<string, []>("op_619_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_620_cast_fp16 = softmax(axis = var_52, x = aw_45_cast_fp16)[name = tensor<string, []>("op_620_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_621_cast_fp16 = softmax(axis = var_52, x = aw_47_cast_fp16)[name = tensor<string, []>("op_621_cast_fp16")];
+            tensor<string, []> var_623_equation_0 = const()[name = tensor<string, []>("op_623_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_623_cast_fp16 = einsum(equation = var_623_equation_0, values = (var_383_cast_fp16, var_598_cast_fp16))[name = tensor<string, []>("op_623_cast_fp16")];
+            tensor<string, []> var_625_equation_0 = const()[name = tensor<string, []>("op_625_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_625_cast_fp16 = einsum(equation = var_625_equation_0, values = (var_383_cast_fp16, var_599_cast_fp16))[name = tensor<string, []>("op_625_cast_fp16")];
+            tensor<string, []> var_627_equation_0 = const()[name = tensor<string, []>("op_627_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_627_cast_fp16 = einsum(equation = var_627_equation_0, values = (var_383_cast_fp16, var_600_cast_fp16))[name = tensor<string, []>("op_627_cast_fp16")];
+            tensor<string, []> var_629_equation_0 = const()[name = tensor<string, []>("op_629_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_629_cast_fp16 = einsum(equation = var_629_equation_0, values = (var_395_cast_fp16, var_601_cast_fp16))[name = tensor<string, []>("op_629_cast_fp16")];
+            tensor<string, []> var_631_equation_0 = const()[name = tensor<string, []>("op_631_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_631_cast_fp16 = einsum(equation = var_631_equation_0, values = (var_395_cast_fp16, var_602_cast_fp16))[name = tensor<string, []>("op_631_cast_fp16")];
+            tensor<string, []> var_633_equation_0 = const()[name = tensor<string, []>("op_633_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_633_cast_fp16 = einsum(equation = var_633_equation_0, values = (var_395_cast_fp16, var_603_cast_fp16))[name = tensor<string, []>("op_633_cast_fp16")];
+            tensor<string, []> var_635_equation_0 = const()[name = tensor<string, []>("op_635_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_635_cast_fp16 = einsum(equation = var_635_equation_0, values = (var_407_cast_fp16, var_604_cast_fp16))[name = tensor<string, []>("op_635_cast_fp16")];
+            tensor<string, []> var_637_equation_0 = const()[name = tensor<string, []>("op_637_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_637_cast_fp16 = einsum(equation = var_637_equation_0, values = (var_407_cast_fp16, var_605_cast_fp16))[name = tensor<string, []>("op_637_cast_fp16")];
+            tensor<string, []> var_639_equation_0 = const()[name = tensor<string, []>("op_639_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_639_cast_fp16 = einsum(equation = var_639_equation_0, values = (var_407_cast_fp16, var_606_cast_fp16))[name = tensor<string, []>("op_639_cast_fp16")];
+            tensor<string, []> var_641_equation_0 = const()[name = tensor<string, []>("op_641_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_641_cast_fp16 = einsum(equation = var_641_equation_0, values = (var_419_cast_fp16, var_607_cast_fp16))[name = tensor<string, []>("op_641_cast_fp16")];
+            tensor<string, []> var_643_equation_0 = const()[name = tensor<string, []>("op_643_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_643_cast_fp16 = einsum(equation = var_643_equation_0, values = (var_419_cast_fp16, var_608_cast_fp16))[name = tensor<string, []>("op_643_cast_fp16")];
+            tensor<string, []> var_645_equation_0 = const()[name = tensor<string, []>("op_645_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_645_cast_fp16 = einsum(equation = var_645_equation_0, values = (var_419_cast_fp16, var_609_cast_fp16))[name = tensor<string, []>("op_645_cast_fp16")];
+            tensor<string, []> var_647_equation_0 = const()[name = tensor<string, []>("op_647_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_647_cast_fp16 = einsum(equation = var_647_equation_0, values = (var_431_cast_fp16, var_610_cast_fp16))[name = tensor<string, []>("op_647_cast_fp16")];
+            tensor<string, []> var_649_equation_0 = const()[name = tensor<string, []>("op_649_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_649_cast_fp16 = einsum(equation = var_649_equation_0, values = (var_431_cast_fp16, var_611_cast_fp16))[name = tensor<string, []>("op_649_cast_fp16")];
+            tensor<string, []> var_651_equation_0 = const()[name = tensor<string, []>("op_651_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_651_cast_fp16 = einsum(equation = var_651_equation_0, values = (var_431_cast_fp16, var_612_cast_fp16))[name = tensor<string, []>("op_651_cast_fp16")];
+            tensor<string, []> var_653_equation_0 = const()[name = tensor<string, []>("op_653_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_653_cast_fp16 = einsum(equation = var_653_equation_0, values = (var_443_cast_fp16, var_613_cast_fp16))[name = tensor<string, []>("op_653_cast_fp16")];
+            tensor<string, []> var_655_equation_0 = const()[name = tensor<string, []>("op_655_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_655_cast_fp16 = einsum(equation = var_655_equation_0, values = (var_443_cast_fp16, var_614_cast_fp16))[name = tensor<string, []>("op_655_cast_fp16")];
+            tensor<string, []> var_657_equation_0 = const()[name = tensor<string, []>("op_657_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_657_cast_fp16 = einsum(equation = var_657_equation_0, values = (var_443_cast_fp16, var_615_cast_fp16))[name = tensor<string, []>("op_657_cast_fp16")];
+            tensor<string, []> var_659_equation_0 = const()[name = tensor<string, []>("op_659_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_659_cast_fp16 = einsum(equation = var_659_equation_0, values = (var_455_cast_fp16, var_616_cast_fp16))[name = tensor<string, []>("op_659_cast_fp16")];
+            tensor<string, []> var_661_equation_0 = const()[name = tensor<string, []>("op_661_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_661_cast_fp16 = einsum(equation = var_661_equation_0, values = (var_455_cast_fp16, var_617_cast_fp16))[name = tensor<string, []>("op_661_cast_fp16")];
+            tensor<string, []> var_663_equation_0 = const()[name = tensor<string, []>("op_663_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_663_cast_fp16 = einsum(equation = var_663_equation_0, values = (var_455_cast_fp16, var_618_cast_fp16))[name = tensor<string, []>("op_663_cast_fp16")];
+            tensor<string, []> var_665_equation_0 = const()[name = tensor<string, []>("op_665_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_665_cast_fp16 = einsum(equation = var_665_equation_0, values = (var_467_cast_fp16, var_619_cast_fp16))[name = tensor<string, []>("op_665_cast_fp16")];
+            tensor<string, []> var_667_equation_0 = const()[name = tensor<string, []>("op_667_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_667_cast_fp16 = einsum(equation = var_667_equation_0, values = (var_467_cast_fp16, var_620_cast_fp16))[name = tensor<string, []>("op_667_cast_fp16")];
+            tensor<string, []> var_669_equation_0 = const()[name = tensor<string, []>("op_669_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_669_cast_fp16 = einsum(equation = var_669_equation_0, values = (var_467_cast_fp16, var_621_cast_fp16))[name = tensor<string, []>("op_669_cast_fp16")];
+            tensor<bool, []> x_11_interleave_0 = const()[name = tensor<string, []>("x_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_11_cast_fp16 = concat(axis = var_52, interleave = x_11_interleave_0, values = (var_623_cast_fp16, var_625_cast_fp16, var_627_cast_fp16, var_629_cast_fp16, var_631_cast_fp16, var_633_cast_fp16, var_635_cast_fp16, var_637_cast_fp16, var_639_cast_fp16, var_641_cast_fp16, var_643_cast_fp16, var_645_cast_fp16, var_647_cast_fp16, var_649_cast_fp16, var_651_cast_fp16, var_653_cast_fp16, var_655_cast_fp16, var_657_cast_fp16, var_659_cast_fp16, var_661_cast_fp16, var_663_cast_fp16, var_665_cast_fp16, var_667_cast_fp16, var_669_cast_fp16))[name = tensor<string, []>("x_11_cast_fp16")];
+            tensor<int32, [4]> var_674 = const()[name = tensor<string, []>("op_674"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_3_cast_fp16 = reshape(shape = var_674, x = x_11_cast_fp16)[name = tensor<string, []>("input_3_cast_fp16")];
+            tensor<int32, [2]> var_677 = const()[name = tensor<string, []>("op_677"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_679 = const()[name = tensor<string, []>("op_679"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_1_pad_type_0 = const()[name = tensor<string, []>("attention_output_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_1_pad_0 = const()[name = tensor<string, []>("attention_output_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31463936)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_1_cast_fp16 = conv(dilations = var_679, groups = var_52, pad = attention_output_1_pad_0, pad_type = attention_output_1_pad_type_0, strides = var_677, weight = blocks_0_attn_proj_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_13_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = tensor<string, []>("x_13_cast_fp16")];
+            tensor<bool, []> x_eps_3_interleave_0 = const()[name = tensor<string, []>("x_eps_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_3_to_fp16 = const()[name = tensor<string, []>("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338368)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_3_cast_fp16 = concat(axis = var_52, interleave = x_eps_3_interleave_0, values = (x_13_cast_fp16, eps_chan_3_to_fp16))[name = tensor<string, []>("x_eps_3_cast_fp16")];
+            tensor<int32, [1]> norm_x_3_axes_0 = const()[name = tensor<string, []>("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_55, x = x_eps_3_cast_fp16)[name = tensor<string, []>("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_7_cast_fp16 = real_div(x = x_13_cast_fp16, y = norm_x_3_cast_fp16)[name = tensor<string, []>("x_normed_7_cast_fp16")];
+            tensor<fp16, []> var_705_to_fp16 = const()[name = tensor<string, []>("op_705_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_705_to_fp16)[name = tensor<string, []>("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338560)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_5_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
+            tensor<int32, [2]> var_716 = const()[name = tensor<string, []>("op_716"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_718 = const()[name = tensor<string, []>("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_7_pad_type_0 = const()[name = tensor<string, []>("input_7_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_7_pad_0 = const()[name = tensor<string, []>("input_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50344768)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_7_cast_fp16 = conv(dilations = var_718, groups = var_52, pad = input_7_pad_0, pad_type = input_7_pad_type_0, strides = var_716, weight = blocks_0_mlp_fc_1_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
+            tensor<int32, [2]> var_722 = const()[name = tensor<string, []>("op_722"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_724 = const()[name = tensor<string, []>("op_724"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_1_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_1_pad_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(100676480)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_1_cast_fp16 = conv(dilations = var_724, groups = var_52, pad = x_fc_2_1_pad_0, pad_type = x_fc_2_1_pad_type_0, strides = var_722, weight = blocks_0_mlp_fc_2_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_727_cast_fp16 = silu(x = input_7_cast_fp16)[name = tensor<string, []>("op_727_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_9_cast_fp16 = mul(x = var_727_cast_fp16, y = x_fc_2_1_cast_fp16)[name = tensor<string, []>("input_9_cast_fp16")];
+            tensor<int32, [2]> var_730 = const()[name = tensor<string, []>("op_730"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_732 = const()[name = tensor<string, []>("op_732"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_734_pad_type_0 = const()[name = tensor<string, []>("op_734_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_734_pad_0 = const()[name = tensor<string, []>("op_734_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_0_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151008192)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_734_cast_fp16 = conv(dilations = var_732, groups = var_52, pad = var_734_pad_0, pad_type = var_734_pad_type_0, strides = var_730, weight = blocks_0_mlp_proj_weight_to_fp16, x = input_9_cast_fp16)[name = tensor<string, []>("op_734_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_17_cast_fp16 = add(x = var_734_cast_fp16, y = x_13_cast_fp16)[name = tensor<string, []>("x_17_cast_fp16")];
+            tensor<int32, []> var_740 = const()[name = tensor<string, []>("op_740"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_744 = const()[name = tensor<string, []>("op_744"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_746 = const()[name = tensor<string, []>("op_746"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_779 = const()[name = tensor<string, []>("op_779"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_782 = const()[name = tensor<string, []>("op_782"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_5_interleave_0 = const()[name = tensor<string, []>("x_eps_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_5_to_fp16 = const()[name = tensor<string, []>("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201339904)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_5_cast_fp16 = concat(axis = var_779, interleave = x_eps_5_interleave_0, values = (x_17_cast_fp16, eps_chan_5_to_fp16))[name = tensor<string, []>("x_eps_5_cast_fp16")];
+            tensor<int32, [1]> norm_x_5_axes_0 = const()[name = tensor<string, []>("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_782, x = x_eps_5_cast_fp16)[name = tensor<string, []>("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_13_cast_fp16 = real_div(x = x_17_cast_fp16, y = norm_x_5_cast_fp16)[name = tensor<string, []>("x_normed_13_cast_fp16")];
+            tensor<fp16, []> var_805_to_fp16 = const()[name = tensor<string, []>("op_805_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_805_to_fp16)[name = tensor<string, []>("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201340096)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_21_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = tensor<string, []>("x_21_cast_fp16")];
+            tensor<int32, [4]> var_829 = const()[name = tensor<string, []>("op_829"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_11_cast_fp16 = reshape(shape = var_829, x = x_21_cast_fp16)[name = tensor<string, []>("input_11_cast_fp16")];
+            tensor<int32, [2]> var_832 = const()[name = tensor<string, []>("op_832"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_834 = const()[name = tensor<string, []>("op_834"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_9_pad_type_0 = const()[name = tensor<string, []>("q_9_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_9_pad_0 = const()[name = tensor<string, []>("q_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201346304)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_9_cast_fp16 = conv(dilations = var_834, groups = var_779, pad = q_9_pad_0, pad_type = q_9_pad_type_0, strides = var_832, weight = blocks_1_attn_q_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("q_9_cast_fp16")];
+            tensor<int32, [2]> var_838 = const()[name = tensor<string, []>("op_838"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_840 = const()[name = tensor<string, []>("op_840"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_13_pad_type_0 = const()[name = tensor<string, []>("k_13_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_13_pad_0 = const()[name = tensor<string, []>("k_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(220220736)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_13_cast_fp16 = conv(dilations = var_840, groups = var_779, pad = k_13_pad_0, pad_type = k_13_pad_type_0, strides = var_838, weight = blocks_1_attn_k_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("k_13_cast_fp16")];
+            tensor<int32, [2]> var_844 = const()[name = tensor<string, []>("op_844"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_846 = const()[name = tensor<string, []>("op_846"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_11_pad_type_0 = const()[name = tensor<string, []>("v_11_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_11_pad_0 = const()[name = tensor<string, []>("v_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226512256)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_11_cast_fp16 = conv(dilations = var_846, groups = var_779, pad = v_11_pad_0, pad_type = v_11_pad_type_0, strides = var_844, weight = blocks_1_attn_v_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("v_11_cast_fp16")];
+            tensor<int32, [4]> var_849 = const()[name = tensor<string, []>("op_849"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_11_cast_fp16 = reshape(shape = var_849, x = q_9_cast_fp16)[name = tensor<string, []>("q_11_cast_fp16")];
+            tensor<int32, [4]> var_851 = const()[name = tensor<string, []>("op_851"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_15_cast_fp16 = reshape(shape = var_851, x = k_13_cast_fp16)[name = tensor<string, []>("k_15_cast_fp16")];
+            tensor<int32, [4]> var_865_begin_0 = const()[name = tensor<string, []>("op_865_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_865_end_0 = const()[name = tensor<string, []>("op_865_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_865_end_mask_0 = const()[name = tensor<string, []>("op_865_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_865_cast_fp16 = slice_by_index(begin = var_865_begin_0, end = var_865_end_0, end_mask = var_865_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_865_cast_fp16")];
+            tensor<int32, [4]> var_871_begin_0 = const()[name = tensor<string, []>("op_871_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_871_end_0 = const()[name = tensor<string, []>("op_871_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_871_end_mask_0 = const()[name = tensor<string, []>("op_871_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_871_cast_fp16 = slice_by_index(begin = var_871_begin_0, end = var_871_end_0, end_mask = var_871_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_871_cast_fp16")];
+            tensor<fp16, []> const_30_promoted_to_fp16 = const()[name = tensor<string, []>("const_30_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_873_cast_fp16 = mul(x = var_871_cast_fp16, y = const_30_promoted_to_fp16)[name = tensor<string, []>("op_873_cast_fp16")];
+            tensor<bool, []> rotated_5_interleave_0 = const()[name = tensor<string, []>("rotated_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_5_cast_fp16 = concat(axis = var_744, interleave = rotated_5_interleave_0, values = (var_873_cast_fp16, var_865_cast_fp16))[name = tensor<string, []>("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_876_cast_fp16 = mul(x = q_11_cast_fp16, y = cos)[name = tensor<string, []>("op_876_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_877_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = tensor<string, []>("op_877_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_5_cast_fp16 = add(x = var_876_cast_fp16, y = var_877_cast_fp16)[name = tensor<string, []>("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_890_begin_0 = const()[name = tensor<string, []>("op_890_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_890_end_0 = const()[name = tensor<string, []>("op_890_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_890_end_mask_0 = const()[name = tensor<string, []>("op_890_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_890_cast_fp16 = slice_by_index(begin = var_890_begin_0, end = var_890_end_0, end_mask = var_890_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_890_cast_fp16")];
+            tensor<int32, [4]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_896_cast_fp16 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_896_cast_fp16")];
+            tensor<fp16, []> const_32_promoted_to_fp16 = const()[name = tensor<string, []>("const_32_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_898_cast_fp16 = mul(x = var_896_cast_fp16, y = const_32_promoted_to_fp16)[name = tensor<string, []>("op_898_cast_fp16")];
+            tensor<bool, []> rotated_interleave_0 = const()[name = tensor<string, []>("rotated_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_cast_fp16 = concat(axis = var_744, interleave = rotated_interleave_0, values = (var_898_cast_fp16, var_890_cast_fp16))[name = tensor<string, []>("rotated_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_901_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = tensor<string, []>("op_901_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_902_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = tensor<string, []>("op_902_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_cast_fp16 = add(x = var_901_cast_fp16, y = var_902_cast_fp16)[name = tensor<string, []>("roped_cast_fp16")];
+            tensor<int32, [4]> var_905 = const()[name = tensor<string, []>("op_905"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_19_cast_fp16 = reshape(shape = var_905, x = roped_cast_fp16)[name = tensor<string, []>("k_19_cast_fp16")];
+            tensor<int32, [4]> var_907 = const()[name = tensor<string, []>("op_907"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_1 = reshape(shape = var_907, x = v_11_cast_fp16)[name = tensor<string, []>("new_v_cache_1_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_interleave_0 = const()[name = tensor<string, []>("k_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_1 = transpose(perm = k_21_perm_0, x = k_19_cast_fp16)[name = tensor<string, []>("transpose_0")];
+            tensor<fp16, [1, 512, 1, 1024]> k_cast_fp16 = concat(axis = var_746, interleave = k_interleave_0, values = (k_cache_1, new_k_cache_1))[name = tensor<string, []>("k_cast_fp16")];
+            tensor<bool, []> v_17_interleave_0 = const()[name = tensor<string, []>("v_17_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_17_cast_fp16 = concat(axis = var_740, interleave = v_17_interleave_0, values = (v_cache_1, new_v_cache_1))[name = tensor<string, []>("v_17_cast_fp16")];
+            tensor<int32, [4]> var_915 = const()[name = tensor<string, []>("op_915"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_cast_fp16 = reshape(shape = var_915, x = roped_5_cast_fp16)[name = tensor<string, []>("q_cast_fp16")];
+            tensor<int32, [4]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_920_cast_fp16 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_920_cast_fp16")];
+            tensor<int32, [4]> var_924_begin_0 = const()[name = tensor<string, []>("op_924_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_924_end_0 = const()[name = tensor<string, []>("op_924_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_924_end_mask_0 = const()[name = tensor<string, []>("op_924_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_924_cast_fp16 = slice_by_index(begin = var_924_begin_0, end = var_924_end_0, end_mask = var_924_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_924_cast_fp16")];
+            tensor<int32, [4]> var_928_begin_0 = const()[name = tensor<string, []>("op_928_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_928_end_0 = const()[name = tensor<string, []>("op_928_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_928_end_mask_0 = const()[name = tensor<string, []>("op_928_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_928_cast_fp16 = slice_by_index(begin = var_928_begin_0, end = var_928_end_0, end_mask = var_928_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_928_cast_fp16")];
+            tensor<int32, [4]> var_932_begin_0 = const()[name = tensor<string, []>("op_932_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_932_end_0 = const()[name = tensor<string, []>("op_932_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_932_end_mask_0 = const()[name = tensor<string, []>("op_932_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_932_cast_fp16 = slice_by_index(begin = var_932_begin_0, end = var_932_end_0, end_mask = var_932_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_932_cast_fp16")];
+            tensor<int32, [4]> var_936_begin_0 = const()[name = tensor<string, []>("op_936_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_936_end_0 = const()[name = tensor<string, []>("op_936_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_936_end_mask_0 = const()[name = tensor<string, []>("op_936_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_936_cast_fp16 = slice_by_index(begin = var_936_begin_0, end = var_936_end_0, end_mask = var_936_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_936_cast_fp16")];
+            tensor<int32, [4]> var_940_begin_0 = const()[name = tensor<string, []>("op_940_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_940_end_0 = const()[name = tensor<string, []>("op_940_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_940_end_mask_0 = const()[name = tensor<string, []>("op_940_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_940_cast_fp16 = slice_by_index(begin = var_940_begin_0, end = var_940_end_0, end_mask = var_940_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_940_cast_fp16")];
+            tensor<int32, [4]> var_944_begin_0 = const()[name = tensor<string, []>("op_944_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_944_end_0 = const()[name = tensor<string, []>("op_944_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_944_end_mask_0 = const()[name = tensor<string, []>("op_944_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_944_cast_fp16 = slice_by_index(begin = var_944_begin_0, end = var_944_end_0, end_mask = var_944_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_944_cast_fp16")];
+            tensor<int32, [4]> var_948_begin_0 = const()[name = tensor<string, []>("op_948_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_948_end_0 = const()[name = tensor<string, []>("op_948_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_948_end_mask_0 = const()[name = tensor<string, []>("op_948_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_948_cast_fp16 = slice_by_index(begin = var_948_begin_0, end = var_948_end_0, end_mask = var_948_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_948_cast_fp16")];
+            tensor<int32, [4]> var_952_begin_0 = const()[name = tensor<string, []>("op_952_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_952_end_0 = const()[name = tensor<string, []>("op_952_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_952_end_mask_0 = const()[name = tensor<string, []>("op_952_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_952_cast_fp16 = slice_by_index(begin = var_952_begin_0, end = var_952_end_0, end_mask = var_952_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_952_cast_fp16")];
+            tensor<int32, [4]> var_956_begin_0 = const()[name = tensor<string, []>("op_956_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_956_end_0 = const()[name = tensor<string, []>("op_956_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_956_end_mask_0 = const()[name = tensor<string, []>("op_956_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_956_cast_fp16 = slice_by_index(begin = var_956_begin_0, end = var_956_end_0, end_mask = var_956_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_956_cast_fp16")];
+            tensor<int32, [4]> var_960_begin_0 = const()[name = tensor<string, []>("op_960_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_960_end_0 = const()[name = tensor<string, []>("op_960_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_960_end_mask_0 = const()[name = tensor<string, []>("op_960_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_960_cast_fp16 = slice_by_index(begin = var_960_begin_0, end = var_960_end_0, end_mask = var_960_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_960_cast_fp16")];
+            tensor<int32, [4]> var_964_begin_0 = const()[name = tensor<string, []>("op_964_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_964_end_0 = const()[name = tensor<string, []>("op_964_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_964_end_mask_0 = const()[name = tensor<string, []>("op_964_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_964_cast_fp16 = slice_by_index(begin = var_964_begin_0, end = var_964_end_0, end_mask = var_964_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_964_cast_fp16")];
+            tensor<int32, [4]> var_968_begin_0 = const()[name = tensor<string, []>("op_968_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_968_end_0 = const()[name = tensor<string, []>("op_968_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_968_end_mask_0 = const()[name = tensor<string, []>("op_968_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_968_cast_fp16 = slice_by_index(begin = var_968_begin_0, end = var_968_end_0, end_mask = var_968_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_968_cast_fp16")];
+            tensor<int32, [4]> var_972_begin_0 = const()[name = tensor<string, []>("op_972_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_972_end_0 = const()[name = tensor<string, []>("op_972_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_972_end_mask_0 = const()[name = tensor<string, []>("op_972_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_972_cast_fp16 = slice_by_index(begin = var_972_begin_0, end = var_972_end_0, end_mask = var_972_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_972_cast_fp16")];
+            tensor<int32, [4]> var_976_begin_0 = const()[name = tensor<string, []>("op_976_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_976_end_0 = const()[name = tensor<string, []>("op_976_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_976_end_mask_0 = const()[name = tensor<string, []>("op_976_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_976_cast_fp16 = slice_by_index(begin = var_976_begin_0, end = var_976_end_0, end_mask = var_976_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_976_cast_fp16")];
+            tensor<int32, [4]> var_980_begin_0 = const()[name = tensor<string, []>("op_980_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_980_end_0 = const()[name = tensor<string, []>("op_980_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_980_end_mask_0 = const()[name = tensor<string, []>("op_980_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_980_cast_fp16 = slice_by_index(begin = var_980_begin_0, end = var_980_end_0, end_mask = var_980_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_980_cast_fp16")];
+            tensor<int32, [4]> var_984_begin_0 = const()[name = tensor<string, []>("op_984_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_984_end_0 = const()[name = tensor<string, []>("op_984_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_984_end_mask_0 = const()[name = tensor<string, []>("op_984_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_984_cast_fp16 = slice_by_index(begin = var_984_begin_0, end = var_984_end_0, end_mask = var_984_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_984_cast_fp16")];
+            tensor<int32, [4]> var_988_begin_0 = const()[name = tensor<string, []>("op_988_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_988_end_0 = const()[name = tensor<string, []>("op_988_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_988_end_mask_0 = const()[name = tensor<string, []>("op_988_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_988_cast_fp16 = slice_by_index(begin = var_988_begin_0, end = var_988_end_0, end_mask = var_988_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_988_cast_fp16")];
+            tensor<int32, [4]> var_992_begin_0 = const()[name = tensor<string, []>("op_992_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_992_end_0 = const()[name = tensor<string, []>("op_992_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_992_end_mask_0 = const()[name = tensor<string, []>("op_992_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_992_cast_fp16 = slice_by_index(begin = var_992_begin_0, end = var_992_end_0, end_mask = var_992_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_992_cast_fp16")];
+            tensor<int32, [4]> var_996_begin_0 = const()[name = tensor<string, []>("op_996_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_996_end_0 = const()[name = tensor<string, []>("op_996_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_996_end_mask_0 = const()[name = tensor<string, []>("op_996_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_996_cast_fp16 = slice_by_index(begin = var_996_begin_0, end = var_996_end_0, end_mask = var_996_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_996_cast_fp16")];
+            tensor<int32, [4]> var_1000_begin_0 = const()[name = tensor<string, []>("op_1000_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_1000_end_0 = const()[name = tensor<string, []>("op_1000_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_1000_end_mask_0 = const()[name = tensor<string, []>("op_1000_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1000_cast_fp16 = slice_by_index(begin = var_1000_begin_0, end = var_1000_end_0, end_mask = var_1000_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1000_cast_fp16")];
+            tensor<int32, [4]> var_1004_begin_0 = const()[name = tensor<string, []>("op_1004_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_1004_end_0 = const()[name = tensor<string, []>("op_1004_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_1004_end_mask_0 = const()[name = tensor<string, []>("op_1004_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1004_cast_fp16 = slice_by_index(begin = var_1004_begin_0, end = var_1004_end_0, end_mask = var_1004_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1004_cast_fp16")];
+            tensor<int32, [4]> var_1008_begin_0 = const()[name = tensor<string, []>("op_1008_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_1008_end_0 = const()[name = tensor<string, []>("op_1008_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_1008_end_mask_0 = const()[name = tensor<string, []>("op_1008_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1008_cast_fp16 = slice_by_index(begin = var_1008_begin_0, end = var_1008_end_0, end_mask = var_1008_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1008_cast_fp16")];
+            tensor<int32, [4]> var_1012_begin_0 = const()[name = tensor<string, []>("op_1012_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_1012_end_0 = const()[name = tensor<string, []>("op_1012_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_1012_end_mask_0 = const()[name = tensor<string, []>("op_1012_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1012_cast_fp16 = slice_by_index(begin = var_1012_begin_0, end = var_1012_end_0, end_mask = var_1012_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1012_cast_fp16")];
+            tensor<int32, [4]> var_1018_begin_0 = const()[name = tensor<string, []>("op_1018_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1018_end_0 = const()[name = tensor<string, []>("op_1018_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_1018_end_mask_0 = const()[name = tensor<string, []>("op_1018_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1018_cast_fp16 = slice_by_index(begin = var_1018_begin_0, end = var_1018_end_0, end_mask = var_1018_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1018_cast_fp16")];
+            tensor<int32, [4]> var_1030_begin_0 = const()[name = tensor<string, []>("op_1030_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_1030_end_0 = const()[name = tensor<string, []>("op_1030_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_1030_end_mask_0 = const()[name = tensor<string, []>("op_1030_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1030_cast_fp16 = slice_by_index(begin = var_1030_begin_0, end = var_1030_end_0, end_mask = var_1030_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1030_cast_fp16")];
+            tensor<int32, [4]> var_1042_begin_0 = const()[name = tensor<string, []>("op_1042_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_1042_end_0 = const()[name = tensor<string, []>("op_1042_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_1042_end_mask_0 = const()[name = tensor<string, []>("op_1042_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1042_cast_fp16 = slice_by_index(begin = var_1042_begin_0, end = var_1042_end_0, end_mask = var_1042_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1042_cast_fp16")];
+            tensor<int32, [4]> var_1054_begin_0 = const()[name = tensor<string, []>("op_1054_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_1054_end_0 = const()[name = tensor<string, []>("op_1054_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1054_end_mask_0 = const()[name = tensor<string, []>("op_1054_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1054_cast_fp16 = slice_by_index(begin = var_1054_begin_0, end = var_1054_end_0, end_mask = var_1054_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1054_cast_fp16")];
+            tensor<int32, [4]> var_1066_begin_0 = const()[name = tensor<string, []>("op_1066_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_1066_end_0 = const()[name = tensor<string, []>("op_1066_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_1066_end_mask_0 = const()[name = tensor<string, []>("op_1066_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1066_cast_fp16 = slice_by_index(begin = var_1066_begin_0, end = var_1066_end_0, end_mask = var_1066_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1066_cast_fp16")];
+            tensor<int32, [4]> var_1078_begin_0 = const()[name = tensor<string, []>("op_1078_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_1078_end_0 = const()[name = tensor<string, []>("op_1078_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_1078_end_mask_0 = const()[name = tensor<string, []>("op_1078_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1078_cast_fp16 = slice_by_index(begin = var_1078_begin_0, end = var_1078_end_0, end_mask = var_1078_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1078_cast_fp16")];
+            tensor<int32, [4]> var_1090_begin_0 = const()[name = tensor<string, []>("op_1090_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_1090_end_0 = const()[name = tensor<string, []>("op_1090_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_1090_end_mask_0 = const()[name = tensor<string, []>("op_1090_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1090_cast_fp16 = slice_by_index(begin = var_1090_begin_0, end = var_1090_end_0, end_mask = var_1090_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1090_cast_fp16")];
+            tensor<int32, [4]> var_1102_begin_0 = const()[name = tensor<string, []>("op_1102_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_1102_end_0 = const()[name = tensor<string, []>("op_1102_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_1102_end_mask_0 = const()[name = tensor<string, []>("op_1102_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1102_cast_fp16 = slice_by_index(begin = var_1102_begin_0, end = var_1102_end_0, end_mask = var_1102_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1102_cast_fp16")];
+            tensor<int32, [4]> var_1112_begin_0 = const()[name = tensor<string, []>("op_1112_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1112_end_0 = const()[name = tensor<string, []>("op_1112_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_1112_end_mask_0 = const()[name = tensor<string, []>("op_1112_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1112_cast_fp16 = slice_by_index(begin = var_1112_begin_0, end = var_1112_end_0, end_mask = var_1112_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1112_cast_fp16")];
+            tensor<int32, [4]> var_1124_begin_0 = const()[name = tensor<string, []>("op_1124_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_1124_end_0 = const()[name = tensor<string, []>("op_1124_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_1124_end_mask_0 = const()[name = tensor<string, []>("op_1124_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1124_cast_fp16 = slice_by_index(begin = var_1124_begin_0, end = var_1124_end_0, end_mask = var_1124_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1124_cast_fp16")];
+            tensor<int32, [4]> var_1136_begin_0 = const()[name = tensor<string, []>("op_1136_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_1136_end_0 = const()[name = tensor<string, []>("op_1136_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_1136_end_mask_0 = const()[name = tensor<string, []>("op_1136_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1136_cast_fp16 = slice_by_index(begin = var_1136_begin_0, end = var_1136_end_0, end_mask = var_1136_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1136_cast_fp16")];
+            tensor<int32, [4]> var_1148_begin_0 = const()[name = tensor<string, []>("op_1148_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_1148_end_0 = const()[name = tensor<string, []>("op_1148_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1148_end_mask_0 = const()[name = tensor<string, []>("op_1148_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1148_cast_fp16 = slice_by_index(begin = var_1148_begin_0, end = var_1148_end_0, end_mask = var_1148_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1148_cast_fp16")];
+            tensor<int32, [4]> var_1160_begin_0 = const()[name = tensor<string, []>("op_1160_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_1160_end_0 = const()[name = tensor<string, []>("op_1160_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_1160_end_mask_0 = const()[name = tensor<string, []>("op_1160_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1160_cast_fp16 = slice_by_index(begin = var_1160_begin_0, end = var_1160_end_0, end_mask = var_1160_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1160_cast_fp16")];
+            tensor<int32, [4]> var_1172_begin_0 = const()[name = tensor<string, []>("op_1172_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_1172_end_0 = const()[name = tensor<string, []>("op_1172_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_1172_end_mask_0 = const()[name = tensor<string, []>("op_1172_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1172_cast_fp16 = slice_by_index(begin = var_1172_begin_0, end = var_1172_end_0, end_mask = var_1172_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1172_cast_fp16")];
+            tensor<int32, [4]> var_1184_begin_0 = const()[name = tensor<string, []>("op_1184_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_1184_end_0 = const()[name = tensor<string, []>("op_1184_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_1184_end_mask_0 = const()[name = tensor<string, []>("op_1184_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1184_cast_fp16 = slice_by_index(begin = var_1184_begin_0, end = var_1184_end_0, end_mask = var_1184_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1184_cast_fp16")];
+            tensor<int32, [4]> var_1196_begin_0 = const()[name = tensor<string, []>("op_1196_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_1196_end_0 = const()[name = tensor<string, []>("op_1196_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_1196_end_mask_0 = const()[name = tensor<string, []>("op_1196_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1196_cast_fp16 = slice_by_index(begin = var_1196_begin_0, end = var_1196_end_0, end_mask = var_1196_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1196_cast_fp16")];
+            tensor<string, []> var_1208_equation_0 = const()[name = tensor<string, []>("op_1208_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1208_cast_fp16 = einsum(equation = var_1208_equation_0, values = (var_1018_cast_fp16, var_920_cast_fp16))[name = tensor<string, []>("op_1208_cast_fp16")];
+            tensor<fp16, []> var_1209_to_fp16 = const()[name = tensor<string, []>("op_1209_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1210_cast_fp16 = mul(x = var_1208_cast_fp16, y = var_1209_to_fp16)[name = tensor<string, []>("op_1210_cast_fp16")];
+            tensor<string, []> var_1212_equation_0 = const()[name = tensor<string, []>("op_1212_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1212_cast_fp16 = einsum(equation = var_1212_equation_0, values = (var_1018_cast_fp16, var_924_cast_fp16))[name = tensor<string, []>("op_1212_cast_fp16")];
+            tensor<fp16, []> var_1213_to_fp16 = const()[name = tensor<string, []>("op_1213_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1214_cast_fp16 = mul(x = var_1212_cast_fp16, y = var_1213_to_fp16)[name = tensor<string, []>("op_1214_cast_fp16")];
+            tensor<string, []> var_1216_equation_0 = const()[name = tensor<string, []>("op_1216_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1216_cast_fp16 = einsum(equation = var_1216_equation_0, values = (var_1018_cast_fp16, var_928_cast_fp16))[name = tensor<string, []>("op_1216_cast_fp16")];
+            tensor<fp16, []> var_1217_to_fp16 = const()[name = tensor<string, []>("op_1217_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1218_cast_fp16 = mul(x = var_1216_cast_fp16, y = var_1217_to_fp16)[name = tensor<string, []>("op_1218_cast_fp16")];
+            tensor<string, []> var_1220_equation_0 = const()[name = tensor<string, []>("op_1220_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1220_cast_fp16 = einsum(equation = var_1220_equation_0, values = (var_1030_cast_fp16, var_932_cast_fp16))[name = tensor<string, []>("op_1220_cast_fp16")];
+            tensor<fp16, []> var_1221_to_fp16 = const()[name = tensor<string, []>("op_1221_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1222_cast_fp16 = mul(x = var_1220_cast_fp16, y = var_1221_to_fp16)[name = tensor<string, []>("op_1222_cast_fp16")];
+            tensor<string, []> var_1224_equation_0 = const()[name = tensor<string, []>("op_1224_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1224_cast_fp16 = einsum(equation = var_1224_equation_0, values = (var_1030_cast_fp16, var_936_cast_fp16))[name = tensor<string, []>("op_1224_cast_fp16")];
+            tensor<fp16, []> var_1225_to_fp16 = const()[name = tensor<string, []>("op_1225_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1226_cast_fp16 = mul(x = var_1224_cast_fp16, y = var_1225_to_fp16)[name = tensor<string, []>("op_1226_cast_fp16")];
+            tensor<string, []> var_1228_equation_0 = const()[name = tensor<string, []>("op_1228_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1228_cast_fp16 = einsum(equation = var_1228_equation_0, values = (var_1030_cast_fp16, var_940_cast_fp16))[name = tensor<string, []>("op_1228_cast_fp16")];
+            tensor<fp16, []> var_1229_to_fp16 = const()[name = tensor<string, []>("op_1229_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1230_cast_fp16 = mul(x = var_1228_cast_fp16, y = var_1229_to_fp16)[name = tensor<string, []>("op_1230_cast_fp16")];
+            tensor<string, []> var_1232_equation_0 = const()[name = tensor<string, []>("op_1232_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1232_cast_fp16 = einsum(equation = var_1232_equation_0, values = (var_1042_cast_fp16, var_944_cast_fp16))[name = tensor<string, []>("op_1232_cast_fp16")];
+            tensor<fp16, []> var_1233_to_fp16 = const()[name = tensor<string, []>("op_1233_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1234_cast_fp16 = mul(x = var_1232_cast_fp16, y = var_1233_to_fp16)[name = tensor<string, []>("op_1234_cast_fp16")];
+            tensor<string, []> var_1236_equation_0 = const()[name = tensor<string, []>("op_1236_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1236_cast_fp16 = einsum(equation = var_1236_equation_0, values = (var_1042_cast_fp16, var_948_cast_fp16))[name = tensor<string, []>("op_1236_cast_fp16")];
+            tensor<fp16, []> var_1237_to_fp16 = const()[name = tensor<string, []>("op_1237_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1238_cast_fp16 = mul(x = var_1236_cast_fp16, y = var_1237_to_fp16)[name = tensor<string, []>("op_1238_cast_fp16")];
+            tensor<string, []> var_1240_equation_0 = const()[name = tensor<string, []>("op_1240_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1240_cast_fp16 = einsum(equation = var_1240_equation_0, values = (var_1042_cast_fp16, var_952_cast_fp16))[name = tensor<string, []>("op_1240_cast_fp16")];
+            tensor<fp16, []> var_1241_to_fp16 = const()[name = tensor<string, []>("op_1241_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1242_cast_fp16 = mul(x = var_1240_cast_fp16, y = var_1241_to_fp16)[name = tensor<string, []>("op_1242_cast_fp16")];
+            tensor<string, []> var_1244_equation_0 = const()[name = tensor<string, []>("op_1244_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1244_cast_fp16 = einsum(equation = var_1244_equation_0, values = (var_1054_cast_fp16, var_956_cast_fp16))[name = tensor<string, []>("op_1244_cast_fp16")];
+            tensor<fp16, []> var_1245_to_fp16 = const()[name = tensor<string, []>("op_1245_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1246_cast_fp16 = mul(x = var_1244_cast_fp16, y = var_1245_to_fp16)[name = tensor<string, []>("op_1246_cast_fp16")];
+            tensor<string, []> var_1248_equation_0 = const()[name = tensor<string, []>("op_1248_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1248_cast_fp16 = einsum(equation = var_1248_equation_0, values = (var_1054_cast_fp16, var_960_cast_fp16))[name = tensor<string, []>("op_1248_cast_fp16")];
+            tensor<fp16, []> var_1249_to_fp16 = const()[name = tensor<string, []>("op_1249_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1250_cast_fp16 = mul(x = var_1248_cast_fp16, y = var_1249_to_fp16)[name = tensor<string, []>("op_1250_cast_fp16")];
+            tensor<string, []> var_1252_equation_0 = const()[name = tensor<string, []>("op_1252_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1252_cast_fp16 = einsum(equation = var_1252_equation_0, values = (var_1054_cast_fp16, var_964_cast_fp16))[name = tensor<string, []>("op_1252_cast_fp16")];
+            tensor<fp16, []> var_1253_to_fp16 = const()[name = tensor<string, []>("op_1253_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1254_cast_fp16 = mul(x = var_1252_cast_fp16, y = var_1253_to_fp16)[name = tensor<string, []>("op_1254_cast_fp16")];
+            tensor<string, []> var_1256_equation_0 = const()[name = tensor<string, []>("op_1256_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1256_cast_fp16 = einsum(equation = var_1256_equation_0, values = (var_1066_cast_fp16, var_968_cast_fp16))[name = tensor<string, []>("op_1256_cast_fp16")];
+            tensor<fp16, []> var_1257_to_fp16 = const()[name = tensor<string, []>("op_1257_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1258_cast_fp16 = mul(x = var_1256_cast_fp16, y = var_1257_to_fp16)[name = tensor<string, []>("op_1258_cast_fp16")];
+            tensor<string, []> var_1260_equation_0 = const()[name = tensor<string, []>("op_1260_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1260_cast_fp16 = einsum(equation = var_1260_equation_0, values = (var_1066_cast_fp16, var_972_cast_fp16))[name = tensor<string, []>("op_1260_cast_fp16")];
+            tensor<fp16, []> var_1261_to_fp16 = const()[name = tensor<string, []>("op_1261_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1262_cast_fp16 = mul(x = var_1260_cast_fp16, y = var_1261_to_fp16)[name = tensor<string, []>("op_1262_cast_fp16")];
+            tensor<string, []> var_1264_equation_0 = const()[name = tensor<string, []>("op_1264_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1264_cast_fp16 = einsum(equation = var_1264_equation_0, values = (var_1066_cast_fp16, var_976_cast_fp16))[name = tensor<string, []>("op_1264_cast_fp16")];
+            tensor<fp16, []> var_1265_to_fp16 = const()[name = tensor<string, []>("op_1265_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1266_cast_fp16 = mul(x = var_1264_cast_fp16, y = var_1265_to_fp16)[name = tensor<string, []>("op_1266_cast_fp16")];
+            tensor<string, []> var_1268_equation_0 = const()[name = tensor<string, []>("op_1268_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1268_cast_fp16 = einsum(equation = var_1268_equation_0, values = (var_1078_cast_fp16, var_980_cast_fp16))[name = tensor<string, []>("op_1268_cast_fp16")];
+            tensor<fp16, []> var_1269_to_fp16 = const()[name = tensor<string, []>("op_1269_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1270_cast_fp16 = mul(x = var_1268_cast_fp16, y = var_1269_to_fp16)[name = tensor<string, []>("op_1270_cast_fp16")];
+            tensor<string, []> var_1272_equation_0 = const()[name = tensor<string, []>("op_1272_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1272_cast_fp16 = einsum(equation = var_1272_equation_0, values = (var_1078_cast_fp16, var_984_cast_fp16))[name = tensor<string, []>("op_1272_cast_fp16")];
+            tensor<fp16, []> var_1273_to_fp16 = const()[name = tensor<string, []>("op_1273_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1274_cast_fp16 = mul(x = var_1272_cast_fp16, y = var_1273_to_fp16)[name = tensor<string, []>("op_1274_cast_fp16")];
+            tensor<string, []> var_1276_equation_0 = const()[name = tensor<string, []>("op_1276_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1276_cast_fp16 = einsum(equation = var_1276_equation_0, values = (var_1078_cast_fp16, var_988_cast_fp16))[name = tensor<string, []>("op_1276_cast_fp16")];
+            tensor<fp16, []> var_1277_to_fp16 = const()[name = tensor<string, []>("op_1277_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1278_cast_fp16 = mul(x = var_1276_cast_fp16, y = var_1277_to_fp16)[name = tensor<string, []>("op_1278_cast_fp16")];
+            tensor<string, []> var_1280_equation_0 = const()[name = tensor<string, []>("op_1280_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1280_cast_fp16 = einsum(equation = var_1280_equation_0, values = (var_1090_cast_fp16, var_992_cast_fp16))[name = tensor<string, []>("op_1280_cast_fp16")];
+            tensor<fp16, []> var_1281_to_fp16 = const()[name = tensor<string, []>("op_1281_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1282_cast_fp16 = mul(x = var_1280_cast_fp16, y = var_1281_to_fp16)[name = tensor<string, []>("op_1282_cast_fp16")];
+            tensor<string, []> var_1284_equation_0 = const()[name = tensor<string, []>("op_1284_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1284_cast_fp16 = einsum(equation = var_1284_equation_0, values = (var_1090_cast_fp16, var_996_cast_fp16))[name = tensor<string, []>("op_1284_cast_fp16")];
+            tensor<fp16, []> var_1285_to_fp16 = const()[name = tensor<string, []>("op_1285_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1286_cast_fp16 = mul(x = var_1284_cast_fp16, y = var_1285_to_fp16)[name = tensor<string, []>("op_1286_cast_fp16")];
+            tensor<string, []> var_1288_equation_0 = const()[name = tensor<string, []>("op_1288_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1288_cast_fp16 = einsum(equation = var_1288_equation_0, values = (var_1090_cast_fp16, var_1000_cast_fp16))[name = tensor<string, []>("op_1288_cast_fp16")];
+            tensor<fp16, []> var_1289_to_fp16 = const()[name = tensor<string, []>("op_1289_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1290_cast_fp16 = mul(x = var_1288_cast_fp16, y = var_1289_to_fp16)[name = tensor<string, []>("op_1290_cast_fp16")];
+            tensor<string, []> var_1292_equation_0 = const()[name = tensor<string, []>("op_1292_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1292_cast_fp16 = einsum(equation = var_1292_equation_0, values = (var_1102_cast_fp16, var_1004_cast_fp16))[name = tensor<string, []>("op_1292_cast_fp16")];
+            tensor<fp16, []> var_1293_to_fp16 = const()[name = tensor<string, []>("op_1293_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1294_cast_fp16 = mul(x = var_1292_cast_fp16, y = var_1293_to_fp16)[name = tensor<string, []>("op_1294_cast_fp16")];
+            tensor<string, []> var_1296_equation_0 = const()[name = tensor<string, []>("op_1296_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1296_cast_fp16 = einsum(equation = var_1296_equation_0, values = (var_1102_cast_fp16, var_1008_cast_fp16))[name = tensor<string, []>("op_1296_cast_fp16")];
+            tensor<fp16, []> var_1297_to_fp16 = const()[name = tensor<string, []>("op_1297_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1298_cast_fp16 = mul(x = var_1296_cast_fp16, y = var_1297_to_fp16)[name = tensor<string, []>("op_1298_cast_fp16")];
+            tensor<string, []> var_1300_equation_0 = const()[name = tensor<string, []>("op_1300_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1300_cast_fp16 = einsum(equation = var_1300_equation_0, values = (var_1102_cast_fp16, var_1012_cast_fp16))[name = tensor<string, []>("op_1300_cast_fp16")];
+            tensor<fp16, []> var_1301_to_fp16 = const()[name = tensor<string, []>("op_1301_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1302_cast_fp16 = mul(x = var_1300_cast_fp16, y = var_1301_to_fp16)[name = tensor<string, []>("op_1302_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_49_cast_fp16 = add(x = var_1210_cast_fp16, y = mask)[name = tensor<string, []>("aw_49_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_51_cast_fp16 = add(x = var_1214_cast_fp16, y = mask)[name = tensor<string, []>("aw_51_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_53_cast_fp16 = add(x = var_1218_cast_fp16, y = mask)[name = tensor<string, []>("aw_53_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_55_cast_fp16 = add(x = var_1222_cast_fp16, y = mask)[name = tensor<string, []>("aw_55_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_57_cast_fp16 = add(x = var_1226_cast_fp16, y = mask)[name = tensor<string, []>("aw_57_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_59_cast_fp16 = add(x = var_1230_cast_fp16, y = mask)[name = tensor<string, []>("aw_59_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_61_cast_fp16 = add(x = var_1234_cast_fp16, y = mask)[name = tensor<string, []>("aw_61_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_63_cast_fp16 = add(x = var_1238_cast_fp16, y = mask)[name = tensor<string, []>("aw_63_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_65_cast_fp16 = add(x = var_1242_cast_fp16, y = mask)[name = tensor<string, []>("aw_65_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_67_cast_fp16 = add(x = var_1246_cast_fp16, y = mask)[name = tensor<string, []>("aw_67_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_69_cast_fp16 = add(x = var_1250_cast_fp16, y = mask)[name = tensor<string, []>("aw_69_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_71_cast_fp16 = add(x = var_1254_cast_fp16, y = mask)[name = tensor<string, []>("aw_71_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_73_cast_fp16 = add(x = var_1258_cast_fp16, y = mask)[name = tensor<string, []>("aw_73_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_75_cast_fp16 = add(x = var_1262_cast_fp16, y = mask)[name = tensor<string, []>("aw_75_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_77_cast_fp16 = add(x = var_1266_cast_fp16, y = mask)[name = tensor<string, []>("aw_77_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_79_cast_fp16 = add(x = var_1270_cast_fp16, y = mask)[name = tensor<string, []>("aw_79_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_81_cast_fp16 = add(x = var_1274_cast_fp16, y = mask)[name = tensor<string, []>("aw_81_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_83_cast_fp16 = add(x = var_1278_cast_fp16, y = mask)[name = tensor<string, []>("aw_83_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_85_cast_fp16 = add(x = var_1282_cast_fp16, y = mask)[name = tensor<string, []>("aw_85_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_87_cast_fp16 = add(x = var_1286_cast_fp16, y = mask)[name = tensor<string, []>("aw_87_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_89_cast_fp16 = add(x = var_1290_cast_fp16, y = mask)[name = tensor<string, []>("aw_89_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_91_cast_fp16 = add(x = var_1294_cast_fp16, y = mask)[name = tensor<string, []>("aw_91_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_93_cast_fp16 = add(x = var_1298_cast_fp16, y = mask)[name = tensor<string, []>("aw_93_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_cast_fp16 = add(x = var_1302_cast_fp16, y = mask)[name = tensor<string, []>("aw_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1327_cast_fp16 = softmax(axis = var_779, x = aw_49_cast_fp16)[name = tensor<string, []>("op_1327_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1328_cast_fp16 = softmax(axis = var_779, x = aw_51_cast_fp16)[name = tensor<string, []>("op_1328_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1329_cast_fp16 = softmax(axis = var_779, x = aw_53_cast_fp16)[name = tensor<string, []>("op_1329_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1330_cast_fp16 = softmax(axis = var_779, x = aw_55_cast_fp16)[name = tensor<string, []>("op_1330_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1331_cast_fp16 = softmax(axis = var_779, x = aw_57_cast_fp16)[name = tensor<string, []>("op_1331_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1332_cast_fp16 = softmax(axis = var_779, x = aw_59_cast_fp16)[name = tensor<string, []>("op_1332_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1333_cast_fp16 = softmax(axis = var_779, x = aw_61_cast_fp16)[name = tensor<string, []>("op_1333_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1334_cast_fp16 = softmax(axis = var_779, x = aw_63_cast_fp16)[name = tensor<string, []>("op_1334_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1335_cast_fp16 = softmax(axis = var_779, x = aw_65_cast_fp16)[name = tensor<string, []>("op_1335_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1336_cast_fp16 = softmax(axis = var_779, x = aw_67_cast_fp16)[name = tensor<string, []>("op_1336_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1337_cast_fp16 = softmax(axis = var_779, x = aw_69_cast_fp16)[name = tensor<string, []>("op_1337_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1338_cast_fp16 = softmax(axis = var_779, x = aw_71_cast_fp16)[name = tensor<string, []>("op_1338_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1339_cast_fp16 = softmax(axis = var_779, x = aw_73_cast_fp16)[name = tensor<string, []>("op_1339_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1340_cast_fp16 = softmax(axis = var_779, x = aw_75_cast_fp16)[name = tensor<string, []>("op_1340_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1341_cast_fp16 = softmax(axis = var_779, x = aw_77_cast_fp16)[name = tensor<string, []>("op_1341_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1342_cast_fp16 = softmax(axis = var_779, x = aw_79_cast_fp16)[name = tensor<string, []>("op_1342_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1343_cast_fp16 = softmax(axis = var_779, x = aw_81_cast_fp16)[name = tensor<string, []>("op_1343_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1344_cast_fp16 = softmax(axis = var_779, x = aw_83_cast_fp16)[name = tensor<string, []>("op_1344_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1345_cast_fp16 = softmax(axis = var_779, x = aw_85_cast_fp16)[name = tensor<string, []>("op_1345_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1346_cast_fp16 = softmax(axis = var_779, x = aw_87_cast_fp16)[name = tensor<string, []>("op_1346_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1347_cast_fp16 = softmax(axis = var_779, x = aw_89_cast_fp16)[name = tensor<string, []>("op_1347_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1348_cast_fp16 = softmax(axis = var_779, x = aw_91_cast_fp16)[name = tensor<string, []>("op_1348_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1349_cast_fp16 = softmax(axis = var_779, x = aw_93_cast_fp16)[name = tensor<string, []>("op_1349_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1350_cast_fp16 = softmax(axis = var_779, x = aw_cast_fp16)[name = tensor<string, []>("op_1350_cast_fp16")];
+            tensor<string, []> var_1352_equation_0 = const()[name = tensor<string, []>("op_1352_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1352_cast_fp16 = einsum(equation = var_1352_equation_0, values = (var_1112_cast_fp16, var_1327_cast_fp16))[name = tensor<string, []>("op_1352_cast_fp16")];
+            tensor<string, []> var_1354_equation_0 = const()[name = tensor<string, []>("op_1354_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1354_cast_fp16 = einsum(equation = var_1354_equation_0, values = (var_1112_cast_fp16, var_1328_cast_fp16))[name = tensor<string, []>("op_1354_cast_fp16")];
+            tensor<string, []> var_1356_equation_0 = const()[name = tensor<string, []>("op_1356_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1356_cast_fp16 = einsum(equation = var_1356_equation_0, values = (var_1112_cast_fp16, var_1329_cast_fp16))[name = tensor<string, []>("op_1356_cast_fp16")];
+            tensor<string, []> var_1358_equation_0 = const()[name = tensor<string, []>("op_1358_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1358_cast_fp16 = einsum(equation = var_1358_equation_0, values = (var_1124_cast_fp16, var_1330_cast_fp16))[name = tensor<string, []>("op_1358_cast_fp16")];
+            tensor<string, []> var_1360_equation_0 = const()[name = tensor<string, []>("op_1360_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1360_cast_fp16 = einsum(equation = var_1360_equation_0, values = (var_1124_cast_fp16, var_1331_cast_fp16))[name = tensor<string, []>("op_1360_cast_fp16")];
+            tensor<string, []> var_1362_equation_0 = const()[name = tensor<string, []>("op_1362_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1362_cast_fp16 = einsum(equation = var_1362_equation_0, values = (var_1124_cast_fp16, var_1332_cast_fp16))[name = tensor<string, []>("op_1362_cast_fp16")];
+            tensor<string, []> var_1364_equation_0 = const()[name = tensor<string, []>("op_1364_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1364_cast_fp16 = einsum(equation = var_1364_equation_0, values = (var_1136_cast_fp16, var_1333_cast_fp16))[name = tensor<string, []>("op_1364_cast_fp16")];
+            tensor<string, []> var_1366_equation_0 = const()[name = tensor<string, []>("op_1366_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1366_cast_fp16 = einsum(equation = var_1366_equation_0, values = (var_1136_cast_fp16, var_1334_cast_fp16))[name = tensor<string, []>("op_1366_cast_fp16")];
+            tensor<string, []> var_1368_equation_0 = const()[name = tensor<string, []>("op_1368_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1368_cast_fp16 = einsum(equation = var_1368_equation_0, values = (var_1136_cast_fp16, var_1335_cast_fp16))[name = tensor<string, []>("op_1368_cast_fp16")];
+            tensor<string, []> var_1370_equation_0 = const()[name = tensor<string, []>("op_1370_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1370_cast_fp16 = einsum(equation = var_1370_equation_0, values = (var_1148_cast_fp16, var_1336_cast_fp16))[name = tensor<string, []>("op_1370_cast_fp16")];
+            tensor<string, []> var_1372_equation_0 = const()[name = tensor<string, []>("op_1372_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1372_cast_fp16 = einsum(equation = var_1372_equation_0, values = (var_1148_cast_fp16, var_1337_cast_fp16))[name = tensor<string, []>("op_1372_cast_fp16")];
+            tensor<string, []> var_1374_equation_0 = const()[name = tensor<string, []>("op_1374_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1374_cast_fp16 = einsum(equation = var_1374_equation_0, values = (var_1148_cast_fp16, var_1338_cast_fp16))[name = tensor<string, []>("op_1374_cast_fp16")];
+            tensor<string, []> var_1376_equation_0 = const()[name = tensor<string, []>("op_1376_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1376_cast_fp16 = einsum(equation = var_1376_equation_0, values = (var_1160_cast_fp16, var_1339_cast_fp16))[name = tensor<string, []>("op_1376_cast_fp16")];
+            tensor<string, []> var_1378_equation_0 = const()[name = tensor<string, []>("op_1378_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1378_cast_fp16 = einsum(equation = var_1378_equation_0, values = (var_1160_cast_fp16, var_1340_cast_fp16))[name = tensor<string, []>("op_1378_cast_fp16")];
+            tensor<string, []> var_1380_equation_0 = const()[name = tensor<string, []>("op_1380_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1380_cast_fp16 = einsum(equation = var_1380_equation_0, values = (var_1160_cast_fp16, var_1341_cast_fp16))[name = tensor<string, []>("op_1380_cast_fp16")];
+            tensor<string, []> var_1382_equation_0 = const()[name = tensor<string, []>("op_1382_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1382_cast_fp16 = einsum(equation = var_1382_equation_0, values = (var_1172_cast_fp16, var_1342_cast_fp16))[name = tensor<string, []>("op_1382_cast_fp16")];
+            tensor<string, []> var_1384_equation_0 = const()[name = tensor<string, []>("op_1384_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1384_cast_fp16 = einsum(equation = var_1384_equation_0, values = (var_1172_cast_fp16, var_1343_cast_fp16))[name = tensor<string, []>("op_1384_cast_fp16")];
+            tensor<string, []> var_1386_equation_0 = const()[name = tensor<string, []>("op_1386_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1386_cast_fp16 = einsum(equation = var_1386_equation_0, values = (var_1172_cast_fp16, var_1344_cast_fp16))[name = tensor<string, []>("op_1386_cast_fp16")];
+            tensor<string, []> var_1388_equation_0 = const()[name = tensor<string, []>("op_1388_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1388_cast_fp16 = einsum(equation = var_1388_equation_0, values = (var_1184_cast_fp16, var_1345_cast_fp16))[name = tensor<string, []>("op_1388_cast_fp16")];
+            tensor<string, []> var_1390_equation_0 = const()[name = tensor<string, []>("op_1390_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1390_cast_fp16 = einsum(equation = var_1390_equation_0, values = (var_1184_cast_fp16, var_1346_cast_fp16))[name = tensor<string, []>("op_1390_cast_fp16")];
+            tensor<string, []> var_1392_equation_0 = const()[name = tensor<string, []>("op_1392_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1392_cast_fp16 = einsum(equation = var_1392_equation_0, values = (var_1184_cast_fp16, var_1347_cast_fp16))[name = tensor<string, []>("op_1392_cast_fp16")];
+            tensor<string, []> var_1394_equation_0 = const()[name = tensor<string, []>("op_1394_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1394_cast_fp16 = einsum(equation = var_1394_equation_0, values = (var_1196_cast_fp16, var_1348_cast_fp16))[name = tensor<string, []>("op_1394_cast_fp16")];
+            tensor<string, []> var_1396_equation_0 = const()[name = tensor<string, []>("op_1396_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1396_cast_fp16 = einsum(equation = var_1396_equation_0, values = (var_1196_cast_fp16, var_1349_cast_fp16))[name = tensor<string, []>("op_1396_cast_fp16")];
+            tensor<string, []> var_1398_equation_0 = const()[name = tensor<string, []>("op_1398_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1398_cast_fp16 = einsum(equation = var_1398_equation_0, values = (var_1196_cast_fp16, var_1350_cast_fp16))[name = tensor<string, []>("op_1398_cast_fp16")];
+            tensor<bool, []> x_27_interleave_0 = const()[name = tensor<string, []>("x_27_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_27_cast_fp16 = concat(axis = var_779, interleave = x_27_interleave_0, values = (var_1352_cast_fp16, var_1354_cast_fp16, var_1356_cast_fp16, var_1358_cast_fp16, var_1360_cast_fp16, var_1362_cast_fp16, var_1364_cast_fp16, var_1366_cast_fp16, var_1368_cast_fp16, var_1370_cast_fp16, var_1372_cast_fp16, var_1374_cast_fp16, var_1376_cast_fp16, var_1378_cast_fp16, var_1380_cast_fp16, var_1382_cast_fp16, var_1384_cast_fp16, var_1386_cast_fp16, var_1388_cast_fp16, var_1390_cast_fp16, var_1392_cast_fp16, var_1394_cast_fp16, var_1396_cast_fp16, var_1398_cast_fp16))[name = tensor<string, []>("x_27_cast_fp16")];
+            tensor<int32, [4]> var_1403 = const()[name = tensor<string, []>("op_1403"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_13_cast_fp16 = reshape(shape = var_1403, x = x_27_cast_fp16)[name = tensor<string, []>("input_13_cast_fp16")];
+            tensor<int32, [2]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1408 = const()[name = tensor<string, []>("op_1408"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_pad_type_0 = const()[name = tensor<string, []>("attention_output_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_pad_0 = const()[name = tensor<string, []>("attention_output_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(232803776)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_cast_fp16 = conv(dilations = var_1408, groups = var_779, pad = attention_output_pad_0, pad_type = attention_output_pad_type_0, strides = var_1406, weight = blocks_1_attn_proj_weight_to_fp16, x = input_13_cast_fp16)[name = tensor<string, []>("attention_output_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_29_cast_fp16 = add(x = attention_output_cast_fp16, y = x_17_cast_fp16)[name = tensor<string, []>("x_29_cast_fp16")];
+            tensor<bool, []> x_eps_interleave_0 = const()[name = tensor<string, []>("x_eps_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_to_fp16 = const()[name = tensor<string, []>("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678208)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_cast_fp16 = concat(axis = var_779, interleave = x_eps_interleave_0, values = (x_29_cast_fp16, eps_chan_to_fp16))[name = tensor<string, []>("x_eps_cast_fp16")];
+            tensor<int32, [1]> norm_x_axes_0 = const()[name = tensor<string, []>("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_782, x = x_eps_cast_fp16)[name = tensor<string, []>("norm_x_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_19_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_cast_fp16)[name = tensor<string, []>("x_normed_19_cast_fp16")];
+            tensor<fp16, []> var_1434_to_fp16 = const()[name = tensor<string, []>("op_1434_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_1434_to_fp16)[name = tensor<string, []>("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678400)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_15_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = tensor<string, []>("input_15_cast_fp16")];
+            tensor<int32, [2]> var_1445 = const()[name = tensor<string, []>("op_1445"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1447 = const()[name = tensor<string, []>("op_1447"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_17_pad_type_0 = const()[name = tensor<string, []>("input_17_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_17_pad_0 = const()[name = tensor<string, []>("input_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251684608)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_17_cast_fp16 = conv(dilations = var_1447, groups = var_779, pad = input_17_pad_0, pad_type = input_17_pad_type_0, strides = var_1445, weight = blocks_1_mlp_fc_1_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("input_17_cast_fp16")];
+            tensor<int32, [2]> var_1451 = const()[name = tensor<string, []>("op_1451"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1453 = const()[name = tensor<string, []>("op_1453"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_pad_0 = const()[name = tensor<string, []>("x_fc_2_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(302016320)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_cast_fp16 = conv(dilations = var_1453, groups = var_779, pad = x_fc_2_pad_0, pad_type = x_fc_2_pad_type_0, strides = var_1451, weight = blocks_1_mlp_fc_2_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_1456_cast_fp16 = silu(x = input_17_cast_fp16)[name = tensor<string, []>("op_1456_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_cast_fp16 = mul(x = var_1456_cast_fp16, y = x_fc_2_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
+            tensor<int32, [2]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1463_pad_type_0 = const()[name = tensor<string, []>("op_1463_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1463_pad_0 = const()[name = tensor<string, []>("op_1463_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_1_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(352348032)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_1463_cast_fp16 = conv(dilations = var_1461, groups = var_779, pad = var_1463_pad_0, pad_type = var_1463_pad_type_0, strides = var_1459, weight = blocks_1_mlp_proj_weight_to_fp16, x = input_cast_fp16)[name = tensor<string, []>("op_1463_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> new_x = add(x = var_1463_cast_fp16, y = x_29_cast_fp16)[name = tensor<string, []>("op_1464_cast_fp16")];
+        } -> (new_x, new_k_cache_0, new_v_cache_0, new_k_cache_1, new_v_cache_1);
+}
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk7.mlmodelc/weights/weight.bin b/Llama-3.2-3B-Instruct_chunk7.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4e755d1ce103aa4cf05bef7c1ab7593f30ea61d7
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk7.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed0c78f79c19079e49b7985ac1ce866746a185e629e010b7c2536ed467b24564
+size 402679744
diff --git a/Llama-3.2-3B-Instruct_chunk8.mlmodelc/analytics/coremldata.bin b/Llama-3.2-3B-Instruct_chunk8.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a63af39cde8e590e41fffd270ab8aede737490d
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk8.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf21e446e7587de3fd840eae95f3e79729298df568725552f7ef5fd8f954e58c
+size 243
diff --git a/Llama-3.2-3B-Instruct_chunk8.mlmodelc/coremldata.bin b/Llama-3.2-3B-Instruct_chunk8.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3fed05170d981b8582c9421ec7550f748512caf2
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk8.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55b45f96f9ba201e16f197a78412041f41d2ac869df9ad95ef03af7662e7d940
+size 653
diff --git a/Llama-3.2-3B-Instruct_chunk8.mlmodelc/metadata.json b/Llama-3.2-3B-Instruct_chunk8.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..cf8bd9cb9c766b10c198630e7b6b8aa3a1a050a1
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk8.mlmodelc/metadata.json
@@ -0,0 +1,178 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "new_x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 7,
+    "mlProgramOperationTypeHistogram" : {
+      "Concat" : 14,
+      "Ios16.mul" : 70,
+      "SliceByIndex" : 88,
+      "Transpose" : 2,
+      "Ios16.einsum" : 96,
+      "Ios16.conv" : 14,
+      "Ios16.add" : 56,
+      "Ios16.realDiv" : 4,
+      "Ios16.softmax" : 48,
+      "Ios16.reduceL2Norm" : 4,
+      "Ios16.reshape" : 14,
+      "Ios16.silu" : 2
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "13.0",
+      "tvOS" : "16.0",
+      "visionOS" : "1.0",
+      "watchOS" : "9.0",
+      "iOS" : "16.0",
+      "macCatalyst" : "16.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.source" : "torch==2.1.0",
+      "com.github.apple.coremltools.version" : "8.0b1"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "cos",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "sin",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 512 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 512, 1, 64]",
+        "name" : "mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "Llama_3_2_3B_Instruct_2024_11_09_16_14_37_chunk8",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk8.mlmodelc/model.mil b/Llama-3.2-3B-Instruct_chunk8.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..78594b4291dc45ae43652f9a31200581b19ad3c6
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk8.mlmodelc/model.mil
@@ -0,0 +1,956 @@
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3304.5.2"}, {"coremlc-version", "3304.6.2"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
+{
+    func main<ios16>(tensor<fp16, [128, 64]> cos, tensor<fp16, [1, 448, 1, 1024]> k_cache_0, tensor<fp16, [1, 448, 1, 1024]> k_cache_1, tensor<fp16, [1, 512, 1, 64]> mask, tensor<fp16, [128, 64]> sin, tensor<fp16, [1, 1024, 1, 448]> v_cache_0, tensor<fp16, [1, 1024, 1, 448]> v_cache_1, tensor<fp16, [1, 3072, 8, 8]> x) [CoreML_InputDefaultValues = dict<tensor<string, []>, tensor<fp32, []>>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}})] {
+            tensor<int32, []> var_13 = const()[name = tensor<string, []>("op_13"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_19 = const()[name = tensor<string, []>("op_19"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_55 = const()[name = tensor<string, []>("op_55"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_1_interleave_0 = const()[name = tensor<string, []>("x_eps_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_1_to_fp16 = const()[name = tensor<string, []>("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_1_cast_fp16 = concat(axis = var_52, interleave = x_eps_1_interleave_0, values = (x, eps_chan_1_to_fp16))[name = tensor<string, []>("x_eps_1_cast_fp16")];
+            tensor<int32, [1]> norm_x_1_axes_0 = const()[name = tensor<string, []>("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_55, x = x_eps_1_cast_fp16)[name = tensor<string, []>("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = tensor<string, []>("x_normed_1_cast_fp16")];
+            tensor<fp16, []> var_79_to_fp16 = const()[name = tensor<string, []>("op_79_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_79_to_fp16)[name = tensor<string, []>("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(256)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = tensor<string, []>("x_5_cast_fp16")];
+            tensor<int32, [4]> var_100 = const()[name = tensor<string, []>("op_100"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_1_cast_fp16 = reshape(shape = var_100, x = x_5_cast_fp16)[name = tensor<string, []>("input_1_cast_fp16")];
+            tensor<int32, [2]> var_103 = const()[name = tensor<string, []>("op_103"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_105 = const()[name = tensor<string, []>("op_105"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_1_pad_type_0 = const()[name = tensor<string, []>("q_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_1_pad_0 = const()[name = tensor<string, []>("q_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6464)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_1_cast_fp16 = conv(dilations = var_105, groups = var_52, pad = q_1_pad_0, pad_type = q_1_pad_type_0, strides = var_103, weight = blocks_0_attn_q_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("q_1_cast_fp16")];
+            tensor<int32, [2]> var_109 = const()[name = tensor<string, []>("op_109"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_111 = const()[name = tensor<string, []>("op_111"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_1_pad_type_0 = const()[name = tensor<string, []>("k_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_1_pad_0 = const()[name = tensor<string, []>("k_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18880896)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_1_cast_fp16 = conv(dilations = var_111, groups = var_52, pad = k_1_pad_0, pad_type = k_1_pad_type_0, strides = var_109, weight = blocks_0_attn_k_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("k_1_cast_fp16")];
+            tensor<int32, [2]> var_115 = const()[name = tensor<string, []>("op_115"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_117 = const()[name = tensor<string, []>("op_117"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_1_pad_type_0 = const()[name = tensor<string, []>("v_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_1_pad_0 = const()[name = tensor<string, []>("v_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25172416)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_1_cast_fp16 = conv(dilations = var_117, groups = var_52, pad = v_1_pad_0, pad_type = v_1_pad_type_0, strides = var_115, weight = blocks_0_attn_v_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("v_1_cast_fp16")];
+            tensor<int32, [4]> var_120 = const()[name = tensor<string, []>("op_120"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_3_cast_fp16 = reshape(shape = var_120, x = q_1_cast_fp16)[name = tensor<string, []>("q_3_cast_fp16")];
+            tensor<int32, [4]> var_122 = const()[name = tensor<string, []>("op_122"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_3_cast_fp16 = reshape(shape = var_122, x = k_1_cast_fp16)[name = tensor<string, []>("k_3_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = tensor<string, []>("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = tensor<string, []>("op_136_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = tensor<string, []>("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = tensor<string, []>("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = tensor<string, []>("op_142_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = tensor<string, []>("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_142_cast_fp16")];
+            tensor<fp16, []> const_10_promoted_to_fp16 = const()[name = tensor<string, []>("const_10_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_10_promoted_to_fp16)[name = tensor<string, []>("op_144_cast_fp16")];
+            tensor<bool, []> rotated_1_interleave_0 = const()[name = tensor<string, []>("rotated_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_1_cast_fp16 = concat(axis = var_17, interleave = rotated_1_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = tensor<string, []>("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_147_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = tensor<string, []>("op_147_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_148_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = tensor<string, []>("op_148_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_1_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = tensor<string, []>("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_161_begin_0 = const()[name = tensor<string, []>("op_161_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_161_end_0 = const()[name = tensor<string, []>("op_161_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_161_end_mask_0 = const()[name = tensor<string, []>("op_161_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_161_cast_fp16 = slice_by_index(begin = var_161_begin_0, end = var_161_end_0, end_mask = var_161_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_161_cast_fp16")];
+            tensor<int32, [4]> var_167_begin_0 = const()[name = tensor<string, []>("op_167_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_167_end_0 = const()[name = tensor<string, []>("op_167_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_167_end_mask_0 = const()[name = tensor<string, []>("op_167_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_167_cast_fp16 = slice_by_index(begin = var_167_begin_0, end = var_167_end_0, end_mask = var_167_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_167_cast_fp16")];
+            tensor<fp16, []> const_12_promoted_to_fp16 = const()[name = tensor<string, []>("const_12_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_169_cast_fp16 = mul(x = var_167_cast_fp16, y = const_12_promoted_to_fp16)[name = tensor<string, []>("op_169_cast_fp16")];
+            tensor<bool, []> rotated_3_interleave_0 = const()[name = tensor<string, []>("rotated_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_3_cast_fp16 = concat(axis = var_17, interleave = rotated_3_interleave_0, values = (var_169_cast_fp16, var_161_cast_fp16))[name = tensor<string, []>("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_172_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = tensor<string, []>("op_172_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_173_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = tensor<string, []>("op_173_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_3_cast_fp16 = add(x = var_172_cast_fp16, y = var_173_cast_fp16)[name = tensor<string, []>("roped_3_cast_fp16")];
+            tensor<int32, [4]> var_176 = const()[name = tensor<string, []>("op_176"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_7_cast_fp16 = reshape(shape = var_176, x = roped_3_cast_fp16)[name = tensor<string, []>("k_7_cast_fp16")];
+            tensor<int32, [4]> var_178 = const()[name = tensor<string, []>("op_178"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_0 = reshape(shape = var_178, x = v_1_cast_fp16)[name = tensor<string, []>("new_v_cache_0_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_11_interleave_0 = const()[name = tensor<string, []>("k_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_0 = transpose(perm = k_9_perm_0, x = k_7_cast_fp16)[name = tensor<string, []>("transpose_1")];
+            tensor<fp16, [1, 512, 1, 1024]> k_11_cast_fp16 = concat(axis = var_19, interleave = k_11_interleave_0, values = (k_cache_0, new_k_cache_0))[name = tensor<string, []>("k_11_cast_fp16")];
+            tensor<bool, []> v_7_interleave_0 = const()[name = tensor<string, []>("v_7_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_7_cast_fp16 = concat(axis = var_13, interleave = v_7_interleave_0, values = (v_cache_0, new_v_cache_0))[name = tensor<string, []>("v_7_cast_fp16")];
+            tensor<int32, [4]> var_186 = const()[name = tensor<string, []>("op_186"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_7_cast_fp16 = reshape(shape = var_186, x = roped_1_cast_fp16)[name = tensor<string, []>("q_7_cast_fp16")];
+            tensor<int32, [4]> var_191_begin_0 = const()[name = tensor<string, []>("op_191_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_191_end_0 = const()[name = tensor<string, []>("op_191_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_191_end_mask_0 = const()[name = tensor<string, []>("op_191_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_191_cast_fp16 = slice_by_index(begin = var_191_begin_0, end = var_191_end_0, end_mask = var_191_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_191_cast_fp16")];
+            tensor<int32, [4]> var_195_begin_0 = const()[name = tensor<string, []>("op_195_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_195_end_0 = const()[name = tensor<string, []>("op_195_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_195_end_mask_0 = const()[name = tensor<string, []>("op_195_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_195_cast_fp16 = slice_by_index(begin = var_195_begin_0, end = var_195_end_0, end_mask = var_195_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_195_cast_fp16")];
+            tensor<int32, [4]> var_199_begin_0 = const()[name = tensor<string, []>("op_199_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_199_end_0 = const()[name = tensor<string, []>("op_199_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_199_end_mask_0 = const()[name = tensor<string, []>("op_199_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_199_cast_fp16 = slice_by_index(begin = var_199_begin_0, end = var_199_end_0, end_mask = var_199_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_199_cast_fp16")];
+            tensor<int32, [4]> var_203_begin_0 = const()[name = tensor<string, []>("op_203_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_203_end_0 = const()[name = tensor<string, []>("op_203_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_203_end_mask_0 = const()[name = tensor<string, []>("op_203_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_203_cast_fp16 = slice_by_index(begin = var_203_begin_0, end = var_203_end_0, end_mask = var_203_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_203_cast_fp16")];
+            tensor<int32, [4]> var_207_begin_0 = const()[name = tensor<string, []>("op_207_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_207_end_0 = const()[name = tensor<string, []>("op_207_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_207_end_mask_0 = const()[name = tensor<string, []>("op_207_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_207_cast_fp16 = slice_by_index(begin = var_207_begin_0, end = var_207_end_0, end_mask = var_207_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_207_cast_fp16")];
+            tensor<int32, [4]> var_211_begin_0 = const()[name = tensor<string, []>("op_211_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_211_end_0 = const()[name = tensor<string, []>("op_211_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_211_end_mask_0 = const()[name = tensor<string, []>("op_211_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_211_cast_fp16 = slice_by_index(begin = var_211_begin_0, end = var_211_end_0, end_mask = var_211_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_211_cast_fp16")];
+            tensor<int32, [4]> var_215_begin_0 = const()[name = tensor<string, []>("op_215_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_215_end_0 = const()[name = tensor<string, []>("op_215_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_215_end_mask_0 = const()[name = tensor<string, []>("op_215_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_215_cast_fp16 = slice_by_index(begin = var_215_begin_0, end = var_215_end_0, end_mask = var_215_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_215_cast_fp16")];
+            tensor<int32, [4]> var_219_begin_0 = const()[name = tensor<string, []>("op_219_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_219_end_0 = const()[name = tensor<string, []>("op_219_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_219_end_mask_0 = const()[name = tensor<string, []>("op_219_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_219_cast_fp16 = slice_by_index(begin = var_219_begin_0, end = var_219_end_0, end_mask = var_219_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_219_cast_fp16")];
+            tensor<int32, [4]> var_223_begin_0 = const()[name = tensor<string, []>("op_223_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_223_end_0 = const()[name = tensor<string, []>("op_223_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_223_end_mask_0 = const()[name = tensor<string, []>("op_223_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_223_cast_fp16 = slice_by_index(begin = var_223_begin_0, end = var_223_end_0, end_mask = var_223_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_223_cast_fp16")];
+            tensor<int32, [4]> var_227_begin_0 = const()[name = tensor<string, []>("op_227_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_227_end_0 = const()[name = tensor<string, []>("op_227_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_227_end_mask_0 = const()[name = tensor<string, []>("op_227_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_227_cast_fp16 = slice_by_index(begin = var_227_begin_0, end = var_227_end_0, end_mask = var_227_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_227_cast_fp16")];
+            tensor<int32, [4]> var_231_begin_0 = const()[name = tensor<string, []>("op_231_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_231_end_0 = const()[name = tensor<string, []>("op_231_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_231_end_mask_0 = const()[name = tensor<string, []>("op_231_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_231_cast_fp16 = slice_by_index(begin = var_231_begin_0, end = var_231_end_0, end_mask = var_231_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_231_cast_fp16")];
+            tensor<int32, [4]> var_235_begin_0 = const()[name = tensor<string, []>("op_235_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_235_end_0 = const()[name = tensor<string, []>("op_235_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_235_end_mask_0 = const()[name = tensor<string, []>("op_235_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_235_cast_fp16 = slice_by_index(begin = var_235_begin_0, end = var_235_end_0, end_mask = var_235_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_235_cast_fp16")];
+            tensor<int32, [4]> var_239_begin_0 = const()[name = tensor<string, []>("op_239_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_239_end_0 = const()[name = tensor<string, []>("op_239_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_239_end_mask_0 = const()[name = tensor<string, []>("op_239_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_239_cast_fp16 = slice_by_index(begin = var_239_begin_0, end = var_239_end_0, end_mask = var_239_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_239_cast_fp16")];
+            tensor<int32, [4]> var_243_begin_0 = const()[name = tensor<string, []>("op_243_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_243_end_0 = const()[name = tensor<string, []>("op_243_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_243_end_mask_0 = const()[name = tensor<string, []>("op_243_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_243_cast_fp16 = slice_by_index(begin = var_243_begin_0, end = var_243_end_0, end_mask = var_243_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_243_cast_fp16")];
+            tensor<int32, [4]> var_247_begin_0 = const()[name = tensor<string, []>("op_247_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_247_end_0 = const()[name = tensor<string, []>("op_247_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_247_end_mask_0 = const()[name = tensor<string, []>("op_247_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_247_cast_fp16 = slice_by_index(begin = var_247_begin_0, end = var_247_end_0, end_mask = var_247_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_247_cast_fp16")];
+            tensor<int32, [4]> var_251_begin_0 = const()[name = tensor<string, []>("op_251_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_251_end_0 = const()[name = tensor<string, []>("op_251_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_251_end_mask_0 = const()[name = tensor<string, []>("op_251_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_251_cast_fp16 = slice_by_index(begin = var_251_begin_0, end = var_251_end_0, end_mask = var_251_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_251_cast_fp16")];
+            tensor<int32, [4]> var_255_begin_0 = const()[name = tensor<string, []>("op_255_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_255_end_0 = const()[name = tensor<string, []>("op_255_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_255_end_mask_0 = const()[name = tensor<string, []>("op_255_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_255_cast_fp16 = slice_by_index(begin = var_255_begin_0, end = var_255_end_0, end_mask = var_255_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_255_cast_fp16")];
+            tensor<int32, [4]> var_259_begin_0 = const()[name = tensor<string, []>("op_259_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_259_end_0 = const()[name = tensor<string, []>("op_259_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_259_end_mask_0 = const()[name = tensor<string, []>("op_259_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_259_cast_fp16 = slice_by_index(begin = var_259_begin_0, end = var_259_end_0, end_mask = var_259_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_259_cast_fp16")];
+            tensor<int32, [4]> var_263_begin_0 = const()[name = tensor<string, []>("op_263_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_263_end_0 = const()[name = tensor<string, []>("op_263_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_263_end_mask_0 = const()[name = tensor<string, []>("op_263_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_263_cast_fp16 = slice_by_index(begin = var_263_begin_0, end = var_263_end_0, end_mask = var_263_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_263_cast_fp16")];
+            tensor<int32, [4]> var_267_begin_0 = const()[name = tensor<string, []>("op_267_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_267_end_0 = const()[name = tensor<string, []>("op_267_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_267_end_mask_0 = const()[name = tensor<string, []>("op_267_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_267_cast_fp16 = slice_by_index(begin = var_267_begin_0, end = var_267_end_0, end_mask = var_267_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_267_cast_fp16")];
+            tensor<int32, [4]> var_271_begin_0 = const()[name = tensor<string, []>("op_271_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_271_end_0 = const()[name = tensor<string, []>("op_271_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_271_end_mask_0 = const()[name = tensor<string, []>("op_271_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_271_cast_fp16 = slice_by_index(begin = var_271_begin_0, end = var_271_end_0, end_mask = var_271_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_271_cast_fp16")];
+            tensor<int32, [4]> var_275_begin_0 = const()[name = tensor<string, []>("op_275_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_275_end_0 = const()[name = tensor<string, []>("op_275_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_275_end_mask_0 = const()[name = tensor<string, []>("op_275_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_275_cast_fp16 = slice_by_index(begin = var_275_begin_0, end = var_275_end_0, end_mask = var_275_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_275_cast_fp16")];
+            tensor<int32, [4]> var_279_begin_0 = const()[name = tensor<string, []>("op_279_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_279_end_0 = const()[name = tensor<string, []>("op_279_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_279_end_mask_0 = const()[name = tensor<string, []>("op_279_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_279_cast_fp16 = slice_by_index(begin = var_279_begin_0, end = var_279_end_0, end_mask = var_279_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_279_cast_fp16")];
+            tensor<int32, [4]> var_283_begin_0 = const()[name = tensor<string, []>("op_283_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_283_end_0 = const()[name = tensor<string, []>("op_283_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_283_end_mask_0 = const()[name = tensor<string, []>("op_283_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_283_cast_fp16 = slice_by_index(begin = var_283_begin_0, end = var_283_end_0, end_mask = var_283_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_283_cast_fp16")];
+            tensor<int32, [4]> var_289_begin_0 = const()[name = tensor<string, []>("op_289_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_289_end_0 = const()[name = tensor<string, []>("op_289_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_289_end_mask_0 = const()[name = tensor<string, []>("op_289_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_289_cast_fp16 = slice_by_index(begin = var_289_begin_0, end = var_289_end_0, end_mask = var_289_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_289_cast_fp16")];
+            tensor<int32, [4]> var_301_begin_0 = const()[name = tensor<string, []>("op_301_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_301_end_0 = const()[name = tensor<string, []>("op_301_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_301_end_mask_0 = const()[name = tensor<string, []>("op_301_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_301_cast_fp16 = slice_by_index(begin = var_301_begin_0, end = var_301_end_0, end_mask = var_301_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_301_cast_fp16")];
+            tensor<int32, [4]> var_313_begin_0 = const()[name = tensor<string, []>("op_313_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_313_end_0 = const()[name = tensor<string, []>("op_313_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_313_end_mask_0 = const()[name = tensor<string, []>("op_313_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_313_cast_fp16 = slice_by_index(begin = var_313_begin_0, end = var_313_end_0, end_mask = var_313_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_313_cast_fp16")];
+            tensor<int32, [4]> var_325_begin_0 = const()[name = tensor<string, []>("op_325_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_325_end_0 = const()[name = tensor<string, []>("op_325_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_325_end_mask_0 = const()[name = tensor<string, []>("op_325_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_325_cast_fp16 = slice_by_index(begin = var_325_begin_0, end = var_325_end_0, end_mask = var_325_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_325_cast_fp16")];
+            tensor<int32, [4]> var_337_begin_0 = const()[name = tensor<string, []>("op_337_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_337_end_0 = const()[name = tensor<string, []>("op_337_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_337_end_mask_0 = const()[name = tensor<string, []>("op_337_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_337_cast_fp16 = slice_by_index(begin = var_337_begin_0, end = var_337_end_0, end_mask = var_337_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_337_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = tensor<string, []>("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = tensor<string, []>("op_349_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = tensor<string, []>("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_349_cast_fp16")];
+            tensor<int32, [4]> var_361_begin_0 = const()[name = tensor<string, []>("op_361_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_361_end_0 = const()[name = tensor<string, []>("op_361_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_361_end_mask_0 = const()[name = tensor<string, []>("op_361_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_361_cast_fp16 = slice_by_index(begin = var_361_begin_0, end = var_361_end_0, end_mask = var_361_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_361_cast_fp16")];
+            tensor<int32, [4]> var_373_begin_0 = const()[name = tensor<string, []>("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_373_end_0 = const()[name = tensor<string, []>("op_373_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_373_end_mask_0 = const()[name = tensor<string, []>("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_373_cast_fp16")];
+            tensor<int32, [4]> var_383_begin_0 = const()[name = tensor<string, []>("op_383_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_383_end_0 = const()[name = tensor<string, []>("op_383_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_383_end_mask_0 = const()[name = tensor<string, []>("op_383_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_383_cast_fp16 = slice_by_index(begin = var_383_begin_0, end = var_383_end_0, end_mask = var_383_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_383_cast_fp16")];
+            tensor<int32, [4]> var_395_begin_0 = const()[name = tensor<string, []>("op_395_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_395_end_0 = const()[name = tensor<string, []>("op_395_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_395_end_mask_0 = const()[name = tensor<string, []>("op_395_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_395_cast_fp16 = slice_by_index(begin = var_395_begin_0, end = var_395_end_0, end_mask = var_395_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_395_cast_fp16")];
+            tensor<int32, [4]> var_407_begin_0 = const()[name = tensor<string, []>("op_407_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_407_end_0 = const()[name = tensor<string, []>("op_407_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_407_end_mask_0 = const()[name = tensor<string, []>("op_407_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_407_cast_fp16 = slice_by_index(begin = var_407_begin_0, end = var_407_end_0, end_mask = var_407_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_407_cast_fp16")];
+            tensor<int32, [4]> var_419_begin_0 = const()[name = tensor<string, []>("op_419_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_419_end_0 = const()[name = tensor<string, []>("op_419_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_419_end_mask_0 = const()[name = tensor<string, []>("op_419_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_419_cast_fp16 = slice_by_index(begin = var_419_begin_0, end = var_419_end_0, end_mask = var_419_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_419_cast_fp16")];
+            tensor<int32, [4]> var_431_begin_0 = const()[name = tensor<string, []>("op_431_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_431_end_0 = const()[name = tensor<string, []>("op_431_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_431_end_mask_0 = const()[name = tensor<string, []>("op_431_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_431_cast_fp16 = slice_by_index(begin = var_431_begin_0, end = var_431_end_0, end_mask = var_431_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_431_cast_fp16")];
+            tensor<int32, [4]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_443_cast_fp16 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_443_cast_fp16")];
+            tensor<int32, [4]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_455_cast_fp16 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_455_cast_fp16")];
+            tensor<int32, [4]> var_467_begin_0 = const()[name = tensor<string, []>("op_467_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_467_end_0 = const()[name = tensor<string, []>("op_467_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_467_end_mask_0 = const()[name = tensor<string, []>("op_467_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_467_cast_fp16 = slice_by_index(begin = var_467_begin_0, end = var_467_end_0, end_mask = var_467_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_467_cast_fp16")];
+            tensor<string, []> var_479_equation_0 = const()[name = tensor<string, []>("op_479_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_479_cast_fp16 = einsum(equation = var_479_equation_0, values = (var_289_cast_fp16, var_191_cast_fp16))[name = tensor<string, []>("op_479_cast_fp16")];
+            tensor<fp16, []> var_480_to_fp16 = const()[name = tensor<string, []>("op_480_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_481_cast_fp16 = mul(x = var_479_cast_fp16, y = var_480_to_fp16)[name = tensor<string, []>("op_481_cast_fp16")];
+            tensor<string, []> var_483_equation_0 = const()[name = tensor<string, []>("op_483_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_483_cast_fp16 = einsum(equation = var_483_equation_0, values = (var_289_cast_fp16, var_195_cast_fp16))[name = tensor<string, []>("op_483_cast_fp16")];
+            tensor<fp16, []> var_484_to_fp16 = const()[name = tensor<string, []>("op_484_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_485_cast_fp16 = mul(x = var_483_cast_fp16, y = var_484_to_fp16)[name = tensor<string, []>("op_485_cast_fp16")];
+            tensor<string, []> var_487_equation_0 = const()[name = tensor<string, []>("op_487_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_487_cast_fp16 = einsum(equation = var_487_equation_0, values = (var_289_cast_fp16, var_199_cast_fp16))[name = tensor<string, []>("op_487_cast_fp16")];
+            tensor<fp16, []> var_488_to_fp16 = const()[name = tensor<string, []>("op_488_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_489_cast_fp16 = mul(x = var_487_cast_fp16, y = var_488_to_fp16)[name = tensor<string, []>("op_489_cast_fp16")];
+            tensor<string, []> var_491_equation_0 = const()[name = tensor<string, []>("op_491_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_491_cast_fp16 = einsum(equation = var_491_equation_0, values = (var_301_cast_fp16, var_203_cast_fp16))[name = tensor<string, []>("op_491_cast_fp16")];
+            tensor<fp16, []> var_492_to_fp16 = const()[name = tensor<string, []>("op_492_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_493_cast_fp16 = mul(x = var_491_cast_fp16, y = var_492_to_fp16)[name = tensor<string, []>("op_493_cast_fp16")];
+            tensor<string, []> var_495_equation_0 = const()[name = tensor<string, []>("op_495_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_495_cast_fp16 = einsum(equation = var_495_equation_0, values = (var_301_cast_fp16, var_207_cast_fp16))[name = tensor<string, []>("op_495_cast_fp16")];
+            tensor<fp16, []> var_496_to_fp16 = const()[name = tensor<string, []>("op_496_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_497_cast_fp16 = mul(x = var_495_cast_fp16, y = var_496_to_fp16)[name = tensor<string, []>("op_497_cast_fp16")];
+            tensor<string, []> var_499_equation_0 = const()[name = tensor<string, []>("op_499_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_499_cast_fp16 = einsum(equation = var_499_equation_0, values = (var_301_cast_fp16, var_211_cast_fp16))[name = tensor<string, []>("op_499_cast_fp16")];
+            tensor<fp16, []> var_500_to_fp16 = const()[name = tensor<string, []>("op_500_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_501_cast_fp16 = mul(x = var_499_cast_fp16, y = var_500_to_fp16)[name = tensor<string, []>("op_501_cast_fp16")];
+            tensor<string, []> var_503_equation_0 = const()[name = tensor<string, []>("op_503_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_503_cast_fp16 = einsum(equation = var_503_equation_0, values = (var_313_cast_fp16, var_215_cast_fp16))[name = tensor<string, []>("op_503_cast_fp16")];
+            tensor<fp16, []> var_504_to_fp16 = const()[name = tensor<string, []>("op_504_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_505_cast_fp16 = mul(x = var_503_cast_fp16, y = var_504_to_fp16)[name = tensor<string, []>("op_505_cast_fp16")];
+            tensor<string, []> var_507_equation_0 = const()[name = tensor<string, []>("op_507_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_507_cast_fp16 = einsum(equation = var_507_equation_0, values = (var_313_cast_fp16, var_219_cast_fp16))[name = tensor<string, []>("op_507_cast_fp16")];
+            tensor<fp16, []> var_508_to_fp16 = const()[name = tensor<string, []>("op_508_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_509_cast_fp16 = mul(x = var_507_cast_fp16, y = var_508_to_fp16)[name = tensor<string, []>("op_509_cast_fp16")];
+            tensor<string, []> var_511_equation_0 = const()[name = tensor<string, []>("op_511_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_511_cast_fp16 = einsum(equation = var_511_equation_0, values = (var_313_cast_fp16, var_223_cast_fp16))[name = tensor<string, []>("op_511_cast_fp16")];
+            tensor<fp16, []> var_512_to_fp16 = const()[name = tensor<string, []>("op_512_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_513_cast_fp16 = mul(x = var_511_cast_fp16, y = var_512_to_fp16)[name = tensor<string, []>("op_513_cast_fp16")];
+            tensor<string, []> var_515_equation_0 = const()[name = tensor<string, []>("op_515_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_515_cast_fp16 = einsum(equation = var_515_equation_0, values = (var_325_cast_fp16, var_227_cast_fp16))[name = tensor<string, []>("op_515_cast_fp16")];
+            tensor<fp16, []> var_516_to_fp16 = const()[name = tensor<string, []>("op_516_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_517_cast_fp16 = mul(x = var_515_cast_fp16, y = var_516_to_fp16)[name = tensor<string, []>("op_517_cast_fp16")];
+            tensor<string, []> var_519_equation_0 = const()[name = tensor<string, []>("op_519_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_519_cast_fp16 = einsum(equation = var_519_equation_0, values = (var_325_cast_fp16, var_231_cast_fp16))[name = tensor<string, []>("op_519_cast_fp16")];
+            tensor<fp16, []> var_520_to_fp16 = const()[name = tensor<string, []>("op_520_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_521_cast_fp16 = mul(x = var_519_cast_fp16, y = var_520_to_fp16)[name = tensor<string, []>("op_521_cast_fp16")];
+            tensor<string, []> var_523_equation_0 = const()[name = tensor<string, []>("op_523_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_523_cast_fp16 = einsum(equation = var_523_equation_0, values = (var_325_cast_fp16, var_235_cast_fp16))[name = tensor<string, []>("op_523_cast_fp16")];
+            tensor<fp16, []> var_524_to_fp16 = const()[name = tensor<string, []>("op_524_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_525_cast_fp16 = mul(x = var_523_cast_fp16, y = var_524_to_fp16)[name = tensor<string, []>("op_525_cast_fp16")];
+            tensor<string, []> var_527_equation_0 = const()[name = tensor<string, []>("op_527_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_527_cast_fp16 = einsum(equation = var_527_equation_0, values = (var_337_cast_fp16, var_239_cast_fp16))[name = tensor<string, []>("op_527_cast_fp16")];
+            tensor<fp16, []> var_528_to_fp16 = const()[name = tensor<string, []>("op_528_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_529_cast_fp16 = mul(x = var_527_cast_fp16, y = var_528_to_fp16)[name = tensor<string, []>("op_529_cast_fp16")];
+            tensor<string, []> var_531_equation_0 = const()[name = tensor<string, []>("op_531_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_531_cast_fp16 = einsum(equation = var_531_equation_0, values = (var_337_cast_fp16, var_243_cast_fp16))[name = tensor<string, []>("op_531_cast_fp16")];
+            tensor<fp16, []> var_532_to_fp16 = const()[name = tensor<string, []>("op_532_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_533_cast_fp16 = mul(x = var_531_cast_fp16, y = var_532_to_fp16)[name = tensor<string, []>("op_533_cast_fp16")];
+            tensor<string, []> var_535_equation_0 = const()[name = tensor<string, []>("op_535_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_535_cast_fp16 = einsum(equation = var_535_equation_0, values = (var_337_cast_fp16, var_247_cast_fp16))[name = tensor<string, []>("op_535_cast_fp16")];
+            tensor<fp16, []> var_536_to_fp16 = const()[name = tensor<string, []>("op_536_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_537_cast_fp16 = mul(x = var_535_cast_fp16, y = var_536_to_fp16)[name = tensor<string, []>("op_537_cast_fp16")];
+            tensor<string, []> var_539_equation_0 = const()[name = tensor<string, []>("op_539_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_539_cast_fp16 = einsum(equation = var_539_equation_0, values = (var_349_cast_fp16, var_251_cast_fp16))[name = tensor<string, []>("op_539_cast_fp16")];
+            tensor<fp16, []> var_540_to_fp16 = const()[name = tensor<string, []>("op_540_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_541_cast_fp16 = mul(x = var_539_cast_fp16, y = var_540_to_fp16)[name = tensor<string, []>("op_541_cast_fp16")];
+            tensor<string, []> var_543_equation_0 = const()[name = tensor<string, []>("op_543_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_543_cast_fp16 = einsum(equation = var_543_equation_0, values = (var_349_cast_fp16, var_255_cast_fp16))[name = tensor<string, []>("op_543_cast_fp16")];
+            tensor<fp16, []> var_544_to_fp16 = const()[name = tensor<string, []>("op_544_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_545_cast_fp16 = mul(x = var_543_cast_fp16, y = var_544_to_fp16)[name = tensor<string, []>("op_545_cast_fp16")];
+            tensor<string, []> var_547_equation_0 = const()[name = tensor<string, []>("op_547_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_547_cast_fp16 = einsum(equation = var_547_equation_0, values = (var_349_cast_fp16, var_259_cast_fp16))[name = tensor<string, []>("op_547_cast_fp16")];
+            tensor<fp16, []> var_548_to_fp16 = const()[name = tensor<string, []>("op_548_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_549_cast_fp16 = mul(x = var_547_cast_fp16, y = var_548_to_fp16)[name = tensor<string, []>("op_549_cast_fp16")];
+            tensor<string, []> var_551_equation_0 = const()[name = tensor<string, []>("op_551_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_551_cast_fp16 = einsum(equation = var_551_equation_0, values = (var_361_cast_fp16, var_263_cast_fp16))[name = tensor<string, []>("op_551_cast_fp16")];
+            tensor<fp16, []> var_552_to_fp16 = const()[name = tensor<string, []>("op_552_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_553_cast_fp16 = mul(x = var_551_cast_fp16, y = var_552_to_fp16)[name = tensor<string, []>("op_553_cast_fp16")];
+            tensor<string, []> var_555_equation_0 = const()[name = tensor<string, []>("op_555_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_555_cast_fp16 = einsum(equation = var_555_equation_0, values = (var_361_cast_fp16, var_267_cast_fp16))[name = tensor<string, []>("op_555_cast_fp16")];
+            tensor<fp16, []> var_556_to_fp16 = const()[name = tensor<string, []>("op_556_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_557_cast_fp16 = mul(x = var_555_cast_fp16, y = var_556_to_fp16)[name = tensor<string, []>("op_557_cast_fp16")];
+            tensor<string, []> var_559_equation_0 = const()[name = tensor<string, []>("op_559_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_559_cast_fp16 = einsum(equation = var_559_equation_0, values = (var_361_cast_fp16, var_271_cast_fp16))[name = tensor<string, []>("op_559_cast_fp16")];
+            tensor<fp16, []> var_560_to_fp16 = const()[name = tensor<string, []>("op_560_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_561_cast_fp16 = mul(x = var_559_cast_fp16, y = var_560_to_fp16)[name = tensor<string, []>("op_561_cast_fp16")];
+            tensor<string, []> var_563_equation_0 = const()[name = tensor<string, []>("op_563_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_563_cast_fp16 = einsum(equation = var_563_equation_0, values = (var_373_cast_fp16, var_275_cast_fp16))[name = tensor<string, []>("op_563_cast_fp16")];
+            tensor<fp16, []> var_564_to_fp16 = const()[name = tensor<string, []>("op_564_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_565_cast_fp16 = mul(x = var_563_cast_fp16, y = var_564_to_fp16)[name = tensor<string, []>("op_565_cast_fp16")];
+            tensor<string, []> var_567_equation_0 = const()[name = tensor<string, []>("op_567_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_567_cast_fp16 = einsum(equation = var_567_equation_0, values = (var_373_cast_fp16, var_279_cast_fp16))[name = tensor<string, []>("op_567_cast_fp16")];
+            tensor<fp16, []> var_568_to_fp16 = const()[name = tensor<string, []>("op_568_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_569_cast_fp16 = mul(x = var_567_cast_fp16, y = var_568_to_fp16)[name = tensor<string, []>("op_569_cast_fp16")];
+            tensor<string, []> var_571_equation_0 = const()[name = tensor<string, []>("op_571_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_571_cast_fp16 = einsum(equation = var_571_equation_0, values = (var_373_cast_fp16, var_283_cast_fp16))[name = tensor<string, []>("op_571_cast_fp16")];
+            tensor<fp16, []> var_572_to_fp16 = const()[name = tensor<string, []>("op_572_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_573_cast_fp16 = mul(x = var_571_cast_fp16, y = var_572_to_fp16)[name = tensor<string, []>("op_573_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_1_cast_fp16 = add(x = var_481_cast_fp16, y = mask)[name = tensor<string, []>("aw_1_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_3_cast_fp16 = add(x = var_485_cast_fp16, y = mask)[name = tensor<string, []>("aw_3_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_5_cast_fp16 = add(x = var_489_cast_fp16, y = mask)[name = tensor<string, []>("aw_5_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_7_cast_fp16 = add(x = var_493_cast_fp16, y = mask)[name = tensor<string, []>("aw_7_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_9_cast_fp16 = add(x = var_497_cast_fp16, y = mask)[name = tensor<string, []>("aw_9_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_11_cast_fp16 = add(x = var_501_cast_fp16, y = mask)[name = tensor<string, []>("aw_11_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_13_cast_fp16 = add(x = var_505_cast_fp16, y = mask)[name = tensor<string, []>("aw_13_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_15_cast_fp16 = add(x = var_509_cast_fp16, y = mask)[name = tensor<string, []>("aw_15_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_17_cast_fp16 = add(x = var_513_cast_fp16, y = mask)[name = tensor<string, []>("aw_17_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_19_cast_fp16 = add(x = var_517_cast_fp16, y = mask)[name = tensor<string, []>("aw_19_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_21_cast_fp16 = add(x = var_521_cast_fp16, y = mask)[name = tensor<string, []>("aw_21_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_23_cast_fp16 = add(x = var_525_cast_fp16, y = mask)[name = tensor<string, []>("aw_23_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_25_cast_fp16 = add(x = var_529_cast_fp16, y = mask)[name = tensor<string, []>("aw_25_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_27_cast_fp16 = add(x = var_533_cast_fp16, y = mask)[name = tensor<string, []>("aw_27_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_29_cast_fp16 = add(x = var_537_cast_fp16, y = mask)[name = tensor<string, []>("aw_29_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_31_cast_fp16 = add(x = var_541_cast_fp16, y = mask)[name = tensor<string, []>("aw_31_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_33_cast_fp16 = add(x = var_545_cast_fp16, y = mask)[name = tensor<string, []>("aw_33_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_35_cast_fp16 = add(x = var_549_cast_fp16, y = mask)[name = tensor<string, []>("aw_35_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_37_cast_fp16 = add(x = var_553_cast_fp16, y = mask)[name = tensor<string, []>("aw_37_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_39_cast_fp16 = add(x = var_557_cast_fp16, y = mask)[name = tensor<string, []>("aw_39_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_41_cast_fp16 = add(x = var_561_cast_fp16, y = mask)[name = tensor<string, []>("aw_41_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_43_cast_fp16 = add(x = var_565_cast_fp16, y = mask)[name = tensor<string, []>("aw_43_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_45_cast_fp16 = add(x = var_569_cast_fp16, y = mask)[name = tensor<string, []>("aw_45_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_47_cast_fp16 = add(x = var_573_cast_fp16, y = mask)[name = tensor<string, []>("aw_47_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_598_cast_fp16 = softmax(axis = var_52, x = aw_1_cast_fp16)[name = tensor<string, []>("op_598_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_599_cast_fp16 = softmax(axis = var_52, x = aw_3_cast_fp16)[name = tensor<string, []>("op_599_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_600_cast_fp16 = softmax(axis = var_52, x = aw_5_cast_fp16)[name = tensor<string, []>("op_600_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_601_cast_fp16 = softmax(axis = var_52, x = aw_7_cast_fp16)[name = tensor<string, []>("op_601_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_602_cast_fp16 = softmax(axis = var_52, x = aw_9_cast_fp16)[name = tensor<string, []>("op_602_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_603_cast_fp16 = softmax(axis = var_52, x = aw_11_cast_fp16)[name = tensor<string, []>("op_603_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_604_cast_fp16 = softmax(axis = var_52, x = aw_13_cast_fp16)[name = tensor<string, []>("op_604_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_605_cast_fp16 = softmax(axis = var_52, x = aw_15_cast_fp16)[name = tensor<string, []>("op_605_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_606_cast_fp16 = softmax(axis = var_52, x = aw_17_cast_fp16)[name = tensor<string, []>("op_606_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_607_cast_fp16 = softmax(axis = var_52, x = aw_19_cast_fp16)[name = tensor<string, []>("op_607_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_608_cast_fp16 = softmax(axis = var_52, x = aw_21_cast_fp16)[name = tensor<string, []>("op_608_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_609_cast_fp16 = softmax(axis = var_52, x = aw_23_cast_fp16)[name = tensor<string, []>("op_609_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_610_cast_fp16 = softmax(axis = var_52, x = aw_25_cast_fp16)[name = tensor<string, []>("op_610_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_611_cast_fp16 = softmax(axis = var_52, x = aw_27_cast_fp16)[name = tensor<string, []>("op_611_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_612_cast_fp16 = softmax(axis = var_52, x = aw_29_cast_fp16)[name = tensor<string, []>("op_612_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_613_cast_fp16 = softmax(axis = var_52, x = aw_31_cast_fp16)[name = tensor<string, []>("op_613_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_614_cast_fp16 = softmax(axis = var_52, x = aw_33_cast_fp16)[name = tensor<string, []>("op_614_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_615_cast_fp16 = softmax(axis = var_52, x = aw_35_cast_fp16)[name = tensor<string, []>("op_615_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_616_cast_fp16 = softmax(axis = var_52, x = aw_37_cast_fp16)[name = tensor<string, []>("op_616_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_617_cast_fp16 = softmax(axis = var_52, x = aw_39_cast_fp16)[name = tensor<string, []>("op_617_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_618_cast_fp16 = softmax(axis = var_52, x = aw_41_cast_fp16)[name = tensor<string, []>("op_618_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_619_cast_fp16 = softmax(axis = var_52, x = aw_43_cast_fp16)[name = tensor<string, []>("op_619_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_620_cast_fp16 = softmax(axis = var_52, x = aw_45_cast_fp16)[name = tensor<string, []>("op_620_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_621_cast_fp16 = softmax(axis = var_52, x = aw_47_cast_fp16)[name = tensor<string, []>("op_621_cast_fp16")];
+            tensor<string, []> var_623_equation_0 = const()[name = tensor<string, []>("op_623_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_623_cast_fp16 = einsum(equation = var_623_equation_0, values = (var_383_cast_fp16, var_598_cast_fp16))[name = tensor<string, []>("op_623_cast_fp16")];
+            tensor<string, []> var_625_equation_0 = const()[name = tensor<string, []>("op_625_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_625_cast_fp16 = einsum(equation = var_625_equation_0, values = (var_383_cast_fp16, var_599_cast_fp16))[name = tensor<string, []>("op_625_cast_fp16")];
+            tensor<string, []> var_627_equation_0 = const()[name = tensor<string, []>("op_627_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_627_cast_fp16 = einsum(equation = var_627_equation_0, values = (var_383_cast_fp16, var_600_cast_fp16))[name = tensor<string, []>("op_627_cast_fp16")];
+            tensor<string, []> var_629_equation_0 = const()[name = tensor<string, []>("op_629_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_629_cast_fp16 = einsum(equation = var_629_equation_0, values = (var_395_cast_fp16, var_601_cast_fp16))[name = tensor<string, []>("op_629_cast_fp16")];
+            tensor<string, []> var_631_equation_0 = const()[name = tensor<string, []>("op_631_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_631_cast_fp16 = einsum(equation = var_631_equation_0, values = (var_395_cast_fp16, var_602_cast_fp16))[name = tensor<string, []>("op_631_cast_fp16")];
+            tensor<string, []> var_633_equation_0 = const()[name = tensor<string, []>("op_633_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_633_cast_fp16 = einsum(equation = var_633_equation_0, values = (var_395_cast_fp16, var_603_cast_fp16))[name = tensor<string, []>("op_633_cast_fp16")];
+            tensor<string, []> var_635_equation_0 = const()[name = tensor<string, []>("op_635_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_635_cast_fp16 = einsum(equation = var_635_equation_0, values = (var_407_cast_fp16, var_604_cast_fp16))[name = tensor<string, []>("op_635_cast_fp16")];
+            tensor<string, []> var_637_equation_0 = const()[name = tensor<string, []>("op_637_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_637_cast_fp16 = einsum(equation = var_637_equation_0, values = (var_407_cast_fp16, var_605_cast_fp16))[name = tensor<string, []>("op_637_cast_fp16")];
+            tensor<string, []> var_639_equation_0 = const()[name = tensor<string, []>("op_639_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_639_cast_fp16 = einsum(equation = var_639_equation_0, values = (var_407_cast_fp16, var_606_cast_fp16))[name = tensor<string, []>("op_639_cast_fp16")];
+            tensor<string, []> var_641_equation_0 = const()[name = tensor<string, []>("op_641_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_641_cast_fp16 = einsum(equation = var_641_equation_0, values = (var_419_cast_fp16, var_607_cast_fp16))[name = tensor<string, []>("op_641_cast_fp16")];
+            tensor<string, []> var_643_equation_0 = const()[name = tensor<string, []>("op_643_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_643_cast_fp16 = einsum(equation = var_643_equation_0, values = (var_419_cast_fp16, var_608_cast_fp16))[name = tensor<string, []>("op_643_cast_fp16")];
+            tensor<string, []> var_645_equation_0 = const()[name = tensor<string, []>("op_645_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_645_cast_fp16 = einsum(equation = var_645_equation_0, values = (var_419_cast_fp16, var_609_cast_fp16))[name = tensor<string, []>("op_645_cast_fp16")];
+            tensor<string, []> var_647_equation_0 = const()[name = tensor<string, []>("op_647_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_647_cast_fp16 = einsum(equation = var_647_equation_0, values = (var_431_cast_fp16, var_610_cast_fp16))[name = tensor<string, []>("op_647_cast_fp16")];
+            tensor<string, []> var_649_equation_0 = const()[name = tensor<string, []>("op_649_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_649_cast_fp16 = einsum(equation = var_649_equation_0, values = (var_431_cast_fp16, var_611_cast_fp16))[name = tensor<string, []>("op_649_cast_fp16")];
+            tensor<string, []> var_651_equation_0 = const()[name = tensor<string, []>("op_651_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_651_cast_fp16 = einsum(equation = var_651_equation_0, values = (var_431_cast_fp16, var_612_cast_fp16))[name = tensor<string, []>("op_651_cast_fp16")];
+            tensor<string, []> var_653_equation_0 = const()[name = tensor<string, []>("op_653_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_653_cast_fp16 = einsum(equation = var_653_equation_0, values = (var_443_cast_fp16, var_613_cast_fp16))[name = tensor<string, []>("op_653_cast_fp16")];
+            tensor<string, []> var_655_equation_0 = const()[name = tensor<string, []>("op_655_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_655_cast_fp16 = einsum(equation = var_655_equation_0, values = (var_443_cast_fp16, var_614_cast_fp16))[name = tensor<string, []>("op_655_cast_fp16")];
+            tensor<string, []> var_657_equation_0 = const()[name = tensor<string, []>("op_657_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_657_cast_fp16 = einsum(equation = var_657_equation_0, values = (var_443_cast_fp16, var_615_cast_fp16))[name = tensor<string, []>("op_657_cast_fp16")];
+            tensor<string, []> var_659_equation_0 = const()[name = tensor<string, []>("op_659_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_659_cast_fp16 = einsum(equation = var_659_equation_0, values = (var_455_cast_fp16, var_616_cast_fp16))[name = tensor<string, []>("op_659_cast_fp16")];
+            tensor<string, []> var_661_equation_0 = const()[name = tensor<string, []>("op_661_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_661_cast_fp16 = einsum(equation = var_661_equation_0, values = (var_455_cast_fp16, var_617_cast_fp16))[name = tensor<string, []>("op_661_cast_fp16")];
+            tensor<string, []> var_663_equation_0 = const()[name = tensor<string, []>("op_663_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_663_cast_fp16 = einsum(equation = var_663_equation_0, values = (var_455_cast_fp16, var_618_cast_fp16))[name = tensor<string, []>("op_663_cast_fp16")];
+            tensor<string, []> var_665_equation_0 = const()[name = tensor<string, []>("op_665_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_665_cast_fp16 = einsum(equation = var_665_equation_0, values = (var_467_cast_fp16, var_619_cast_fp16))[name = tensor<string, []>("op_665_cast_fp16")];
+            tensor<string, []> var_667_equation_0 = const()[name = tensor<string, []>("op_667_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_667_cast_fp16 = einsum(equation = var_667_equation_0, values = (var_467_cast_fp16, var_620_cast_fp16))[name = tensor<string, []>("op_667_cast_fp16")];
+            tensor<string, []> var_669_equation_0 = const()[name = tensor<string, []>("op_669_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_669_cast_fp16 = einsum(equation = var_669_equation_0, values = (var_467_cast_fp16, var_621_cast_fp16))[name = tensor<string, []>("op_669_cast_fp16")];
+            tensor<bool, []> x_11_interleave_0 = const()[name = tensor<string, []>("x_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_11_cast_fp16 = concat(axis = var_52, interleave = x_11_interleave_0, values = (var_623_cast_fp16, var_625_cast_fp16, var_627_cast_fp16, var_629_cast_fp16, var_631_cast_fp16, var_633_cast_fp16, var_635_cast_fp16, var_637_cast_fp16, var_639_cast_fp16, var_641_cast_fp16, var_643_cast_fp16, var_645_cast_fp16, var_647_cast_fp16, var_649_cast_fp16, var_651_cast_fp16, var_653_cast_fp16, var_655_cast_fp16, var_657_cast_fp16, var_659_cast_fp16, var_661_cast_fp16, var_663_cast_fp16, var_665_cast_fp16, var_667_cast_fp16, var_669_cast_fp16))[name = tensor<string, []>("x_11_cast_fp16")];
+            tensor<int32, [4]> var_674 = const()[name = tensor<string, []>("op_674"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_3_cast_fp16 = reshape(shape = var_674, x = x_11_cast_fp16)[name = tensor<string, []>("input_3_cast_fp16")];
+            tensor<int32, [2]> var_677 = const()[name = tensor<string, []>("op_677"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_679 = const()[name = tensor<string, []>("op_679"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_1_pad_type_0 = const()[name = tensor<string, []>("attention_output_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_1_pad_0 = const()[name = tensor<string, []>("attention_output_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31463936)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_1_cast_fp16 = conv(dilations = var_679, groups = var_52, pad = attention_output_1_pad_0, pad_type = attention_output_1_pad_type_0, strides = var_677, weight = blocks_0_attn_proj_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_13_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = tensor<string, []>("x_13_cast_fp16")];
+            tensor<bool, []> x_eps_3_interleave_0 = const()[name = tensor<string, []>("x_eps_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_3_to_fp16 = const()[name = tensor<string, []>("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338368)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_3_cast_fp16 = concat(axis = var_52, interleave = x_eps_3_interleave_0, values = (x_13_cast_fp16, eps_chan_3_to_fp16))[name = tensor<string, []>("x_eps_3_cast_fp16")];
+            tensor<int32, [1]> norm_x_3_axes_0 = const()[name = tensor<string, []>("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_55, x = x_eps_3_cast_fp16)[name = tensor<string, []>("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_7_cast_fp16 = real_div(x = x_13_cast_fp16, y = norm_x_3_cast_fp16)[name = tensor<string, []>("x_normed_7_cast_fp16")];
+            tensor<fp16, []> var_705_to_fp16 = const()[name = tensor<string, []>("op_705_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_705_to_fp16)[name = tensor<string, []>("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338560)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_5_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
+            tensor<int32, [2]> var_716 = const()[name = tensor<string, []>("op_716"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_718 = const()[name = tensor<string, []>("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_7_pad_type_0 = const()[name = tensor<string, []>("input_7_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_7_pad_0 = const()[name = tensor<string, []>("input_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50344768)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_7_cast_fp16 = conv(dilations = var_718, groups = var_52, pad = input_7_pad_0, pad_type = input_7_pad_type_0, strides = var_716, weight = blocks_0_mlp_fc_1_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
+            tensor<int32, [2]> var_722 = const()[name = tensor<string, []>("op_722"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_724 = const()[name = tensor<string, []>("op_724"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_1_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_1_pad_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(100676480)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_1_cast_fp16 = conv(dilations = var_724, groups = var_52, pad = x_fc_2_1_pad_0, pad_type = x_fc_2_1_pad_type_0, strides = var_722, weight = blocks_0_mlp_fc_2_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_727_cast_fp16 = silu(x = input_7_cast_fp16)[name = tensor<string, []>("op_727_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_9_cast_fp16 = mul(x = var_727_cast_fp16, y = x_fc_2_1_cast_fp16)[name = tensor<string, []>("input_9_cast_fp16")];
+            tensor<int32, [2]> var_730 = const()[name = tensor<string, []>("op_730"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_732 = const()[name = tensor<string, []>("op_732"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_734_pad_type_0 = const()[name = tensor<string, []>("op_734_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_734_pad_0 = const()[name = tensor<string, []>("op_734_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_0_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151008192)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_734_cast_fp16 = conv(dilations = var_732, groups = var_52, pad = var_734_pad_0, pad_type = var_734_pad_type_0, strides = var_730, weight = blocks_0_mlp_proj_weight_to_fp16, x = input_9_cast_fp16)[name = tensor<string, []>("op_734_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_17_cast_fp16 = add(x = var_734_cast_fp16, y = x_13_cast_fp16)[name = tensor<string, []>("x_17_cast_fp16")];
+            tensor<int32, []> var_740 = const()[name = tensor<string, []>("op_740"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_744 = const()[name = tensor<string, []>("op_744"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_746 = const()[name = tensor<string, []>("op_746"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_779 = const()[name = tensor<string, []>("op_779"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_782 = const()[name = tensor<string, []>("op_782"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_5_interleave_0 = const()[name = tensor<string, []>("x_eps_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_5_to_fp16 = const()[name = tensor<string, []>("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201339904)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_5_cast_fp16 = concat(axis = var_779, interleave = x_eps_5_interleave_0, values = (x_17_cast_fp16, eps_chan_5_to_fp16))[name = tensor<string, []>("x_eps_5_cast_fp16")];
+            tensor<int32, [1]> norm_x_5_axes_0 = const()[name = tensor<string, []>("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_782, x = x_eps_5_cast_fp16)[name = tensor<string, []>("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_13_cast_fp16 = real_div(x = x_17_cast_fp16, y = norm_x_5_cast_fp16)[name = tensor<string, []>("x_normed_13_cast_fp16")];
+            tensor<fp16, []> var_805_to_fp16 = const()[name = tensor<string, []>("op_805_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_805_to_fp16)[name = tensor<string, []>("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201340096)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_21_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = tensor<string, []>("x_21_cast_fp16")];
+            tensor<int32, [4]> var_829 = const()[name = tensor<string, []>("op_829"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_11_cast_fp16 = reshape(shape = var_829, x = x_21_cast_fp16)[name = tensor<string, []>("input_11_cast_fp16")];
+            tensor<int32, [2]> var_832 = const()[name = tensor<string, []>("op_832"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_834 = const()[name = tensor<string, []>("op_834"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_9_pad_type_0 = const()[name = tensor<string, []>("q_9_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_9_pad_0 = const()[name = tensor<string, []>("q_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201346304)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_9_cast_fp16 = conv(dilations = var_834, groups = var_779, pad = q_9_pad_0, pad_type = q_9_pad_type_0, strides = var_832, weight = blocks_1_attn_q_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("q_9_cast_fp16")];
+            tensor<int32, [2]> var_838 = const()[name = tensor<string, []>("op_838"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_840 = const()[name = tensor<string, []>("op_840"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_13_pad_type_0 = const()[name = tensor<string, []>("k_13_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_13_pad_0 = const()[name = tensor<string, []>("k_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(220220736)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_13_cast_fp16 = conv(dilations = var_840, groups = var_779, pad = k_13_pad_0, pad_type = k_13_pad_type_0, strides = var_838, weight = blocks_1_attn_k_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("k_13_cast_fp16")];
+            tensor<int32, [2]> var_844 = const()[name = tensor<string, []>("op_844"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_846 = const()[name = tensor<string, []>("op_846"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_11_pad_type_0 = const()[name = tensor<string, []>("v_11_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_11_pad_0 = const()[name = tensor<string, []>("v_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226512256)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_11_cast_fp16 = conv(dilations = var_846, groups = var_779, pad = v_11_pad_0, pad_type = v_11_pad_type_0, strides = var_844, weight = blocks_1_attn_v_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("v_11_cast_fp16")];
+            tensor<int32, [4]> var_849 = const()[name = tensor<string, []>("op_849"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_11_cast_fp16 = reshape(shape = var_849, x = q_9_cast_fp16)[name = tensor<string, []>("q_11_cast_fp16")];
+            tensor<int32, [4]> var_851 = const()[name = tensor<string, []>("op_851"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_15_cast_fp16 = reshape(shape = var_851, x = k_13_cast_fp16)[name = tensor<string, []>("k_15_cast_fp16")];
+            tensor<int32, [4]> var_865_begin_0 = const()[name = tensor<string, []>("op_865_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_865_end_0 = const()[name = tensor<string, []>("op_865_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_865_end_mask_0 = const()[name = tensor<string, []>("op_865_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_865_cast_fp16 = slice_by_index(begin = var_865_begin_0, end = var_865_end_0, end_mask = var_865_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_865_cast_fp16")];
+            tensor<int32, [4]> var_871_begin_0 = const()[name = tensor<string, []>("op_871_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_871_end_0 = const()[name = tensor<string, []>("op_871_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_871_end_mask_0 = const()[name = tensor<string, []>("op_871_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_871_cast_fp16 = slice_by_index(begin = var_871_begin_0, end = var_871_end_0, end_mask = var_871_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_871_cast_fp16")];
+            tensor<fp16, []> const_30_promoted_to_fp16 = const()[name = tensor<string, []>("const_30_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_873_cast_fp16 = mul(x = var_871_cast_fp16, y = const_30_promoted_to_fp16)[name = tensor<string, []>("op_873_cast_fp16")];
+            tensor<bool, []> rotated_5_interleave_0 = const()[name = tensor<string, []>("rotated_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_5_cast_fp16 = concat(axis = var_744, interleave = rotated_5_interleave_0, values = (var_873_cast_fp16, var_865_cast_fp16))[name = tensor<string, []>("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_876_cast_fp16 = mul(x = q_11_cast_fp16, y = cos)[name = tensor<string, []>("op_876_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_877_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = tensor<string, []>("op_877_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_5_cast_fp16 = add(x = var_876_cast_fp16, y = var_877_cast_fp16)[name = tensor<string, []>("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_890_begin_0 = const()[name = tensor<string, []>("op_890_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_890_end_0 = const()[name = tensor<string, []>("op_890_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_890_end_mask_0 = const()[name = tensor<string, []>("op_890_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_890_cast_fp16 = slice_by_index(begin = var_890_begin_0, end = var_890_end_0, end_mask = var_890_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_890_cast_fp16")];
+            tensor<int32, [4]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_896_cast_fp16 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_896_cast_fp16")];
+            tensor<fp16, []> const_32_promoted_to_fp16 = const()[name = tensor<string, []>("const_32_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_898_cast_fp16 = mul(x = var_896_cast_fp16, y = const_32_promoted_to_fp16)[name = tensor<string, []>("op_898_cast_fp16")];
+            tensor<bool, []> rotated_interleave_0 = const()[name = tensor<string, []>("rotated_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_cast_fp16 = concat(axis = var_744, interleave = rotated_interleave_0, values = (var_898_cast_fp16, var_890_cast_fp16))[name = tensor<string, []>("rotated_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_901_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = tensor<string, []>("op_901_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_902_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = tensor<string, []>("op_902_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_cast_fp16 = add(x = var_901_cast_fp16, y = var_902_cast_fp16)[name = tensor<string, []>("roped_cast_fp16")];
+            tensor<int32, [4]> var_905 = const()[name = tensor<string, []>("op_905"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_19_cast_fp16 = reshape(shape = var_905, x = roped_cast_fp16)[name = tensor<string, []>("k_19_cast_fp16")];
+            tensor<int32, [4]> var_907 = const()[name = tensor<string, []>("op_907"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_1 = reshape(shape = var_907, x = v_11_cast_fp16)[name = tensor<string, []>("new_v_cache_1_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_interleave_0 = const()[name = tensor<string, []>("k_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_1 = transpose(perm = k_21_perm_0, x = k_19_cast_fp16)[name = tensor<string, []>("transpose_0")];
+            tensor<fp16, [1, 512, 1, 1024]> k_cast_fp16 = concat(axis = var_746, interleave = k_interleave_0, values = (k_cache_1, new_k_cache_1))[name = tensor<string, []>("k_cast_fp16")];
+            tensor<bool, []> v_17_interleave_0 = const()[name = tensor<string, []>("v_17_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_17_cast_fp16 = concat(axis = var_740, interleave = v_17_interleave_0, values = (v_cache_1, new_v_cache_1))[name = tensor<string, []>("v_17_cast_fp16")];
+            tensor<int32, [4]> var_915 = const()[name = tensor<string, []>("op_915"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_cast_fp16 = reshape(shape = var_915, x = roped_5_cast_fp16)[name = tensor<string, []>("q_cast_fp16")];
+            tensor<int32, [4]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_920_cast_fp16 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_920_cast_fp16")];
+            tensor<int32, [4]> var_924_begin_0 = const()[name = tensor<string, []>("op_924_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_924_end_0 = const()[name = tensor<string, []>("op_924_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_924_end_mask_0 = const()[name = tensor<string, []>("op_924_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_924_cast_fp16 = slice_by_index(begin = var_924_begin_0, end = var_924_end_0, end_mask = var_924_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_924_cast_fp16")];
+            tensor<int32, [4]> var_928_begin_0 = const()[name = tensor<string, []>("op_928_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_928_end_0 = const()[name = tensor<string, []>("op_928_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_928_end_mask_0 = const()[name = tensor<string, []>("op_928_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_928_cast_fp16 = slice_by_index(begin = var_928_begin_0, end = var_928_end_0, end_mask = var_928_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_928_cast_fp16")];
+            tensor<int32, [4]> var_932_begin_0 = const()[name = tensor<string, []>("op_932_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_932_end_0 = const()[name = tensor<string, []>("op_932_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_932_end_mask_0 = const()[name = tensor<string, []>("op_932_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_932_cast_fp16 = slice_by_index(begin = var_932_begin_0, end = var_932_end_0, end_mask = var_932_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_932_cast_fp16")];
+            tensor<int32, [4]> var_936_begin_0 = const()[name = tensor<string, []>("op_936_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_936_end_0 = const()[name = tensor<string, []>("op_936_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_936_end_mask_0 = const()[name = tensor<string, []>("op_936_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_936_cast_fp16 = slice_by_index(begin = var_936_begin_0, end = var_936_end_0, end_mask = var_936_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_936_cast_fp16")];
+            tensor<int32, [4]> var_940_begin_0 = const()[name = tensor<string, []>("op_940_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_940_end_0 = const()[name = tensor<string, []>("op_940_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_940_end_mask_0 = const()[name = tensor<string, []>("op_940_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_940_cast_fp16 = slice_by_index(begin = var_940_begin_0, end = var_940_end_0, end_mask = var_940_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_940_cast_fp16")];
+            tensor<int32, [4]> var_944_begin_0 = const()[name = tensor<string, []>("op_944_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_944_end_0 = const()[name = tensor<string, []>("op_944_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_944_end_mask_0 = const()[name = tensor<string, []>("op_944_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_944_cast_fp16 = slice_by_index(begin = var_944_begin_0, end = var_944_end_0, end_mask = var_944_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_944_cast_fp16")];
+            tensor<int32, [4]> var_948_begin_0 = const()[name = tensor<string, []>("op_948_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_948_end_0 = const()[name = tensor<string, []>("op_948_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_948_end_mask_0 = const()[name = tensor<string, []>("op_948_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_948_cast_fp16 = slice_by_index(begin = var_948_begin_0, end = var_948_end_0, end_mask = var_948_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_948_cast_fp16")];
+            tensor<int32, [4]> var_952_begin_0 = const()[name = tensor<string, []>("op_952_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_952_end_0 = const()[name = tensor<string, []>("op_952_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_952_end_mask_0 = const()[name = tensor<string, []>("op_952_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_952_cast_fp16 = slice_by_index(begin = var_952_begin_0, end = var_952_end_0, end_mask = var_952_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_952_cast_fp16")];
+            tensor<int32, [4]> var_956_begin_0 = const()[name = tensor<string, []>("op_956_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_956_end_0 = const()[name = tensor<string, []>("op_956_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_956_end_mask_0 = const()[name = tensor<string, []>("op_956_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_956_cast_fp16 = slice_by_index(begin = var_956_begin_0, end = var_956_end_0, end_mask = var_956_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_956_cast_fp16")];
+            tensor<int32, [4]> var_960_begin_0 = const()[name = tensor<string, []>("op_960_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_960_end_0 = const()[name = tensor<string, []>("op_960_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_960_end_mask_0 = const()[name = tensor<string, []>("op_960_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_960_cast_fp16 = slice_by_index(begin = var_960_begin_0, end = var_960_end_0, end_mask = var_960_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_960_cast_fp16")];
+            tensor<int32, [4]> var_964_begin_0 = const()[name = tensor<string, []>("op_964_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_964_end_0 = const()[name = tensor<string, []>("op_964_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_964_end_mask_0 = const()[name = tensor<string, []>("op_964_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_964_cast_fp16 = slice_by_index(begin = var_964_begin_0, end = var_964_end_0, end_mask = var_964_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_964_cast_fp16")];
+            tensor<int32, [4]> var_968_begin_0 = const()[name = tensor<string, []>("op_968_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_968_end_0 = const()[name = tensor<string, []>("op_968_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_968_end_mask_0 = const()[name = tensor<string, []>("op_968_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_968_cast_fp16 = slice_by_index(begin = var_968_begin_0, end = var_968_end_0, end_mask = var_968_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_968_cast_fp16")];
+            tensor<int32, [4]> var_972_begin_0 = const()[name = tensor<string, []>("op_972_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_972_end_0 = const()[name = tensor<string, []>("op_972_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_972_end_mask_0 = const()[name = tensor<string, []>("op_972_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_972_cast_fp16 = slice_by_index(begin = var_972_begin_0, end = var_972_end_0, end_mask = var_972_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_972_cast_fp16")];
+            tensor<int32, [4]> var_976_begin_0 = const()[name = tensor<string, []>("op_976_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_976_end_0 = const()[name = tensor<string, []>("op_976_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_976_end_mask_0 = const()[name = tensor<string, []>("op_976_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_976_cast_fp16 = slice_by_index(begin = var_976_begin_0, end = var_976_end_0, end_mask = var_976_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_976_cast_fp16")];
+            tensor<int32, [4]> var_980_begin_0 = const()[name = tensor<string, []>("op_980_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_980_end_0 = const()[name = tensor<string, []>("op_980_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_980_end_mask_0 = const()[name = tensor<string, []>("op_980_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_980_cast_fp16 = slice_by_index(begin = var_980_begin_0, end = var_980_end_0, end_mask = var_980_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_980_cast_fp16")];
+            tensor<int32, [4]> var_984_begin_0 = const()[name = tensor<string, []>("op_984_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_984_end_0 = const()[name = tensor<string, []>("op_984_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_984_end_mask_0 = const()[name = tensor<string, []>("op_984_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_984_cast_fp16 = slice_by_index(begin = var_984_begin_0, end = var_984_end_0, end_mask = var_984_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_984_cast_fp16")];
+            tensor<int32, [4]> var_988_begin_0 = const()[name = tensor<string, []>("op_988_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_988_end_0 = const()[name = tensor<string, []>("op_988_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_988_end_mask_0 = const()[name = tensor<string, []>("op_988_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_988_cast_fp16 = slice_by_index(begin = var_988_begin_0, end = var_988_end_0, end_mask = var_988_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_988_cast_fp16")];
+            tensor<int32, [4]> var_992_begin_0 = const()[name = tensor<string, []>("op_992_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_992_end_0 = const()[name = tensor<string, []>("op_992_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_992_end_mask_0 = const()[name = tensor<string, []>("op_992_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_992_cast_fp16 = slice_by_index(begin = var_992_begin_0, end = var_992_end_0, end_mask = var_992_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_992_cast_fp16")];
+            tensor<int32, [4]> var_996_begin_0 = const()[name = tensor<string, []>("op_996_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_996_end_0 = const()[name = tensor<string, []>("op_996_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_996_end_mask_0 = const()[name = tensor<string, []>("op_996_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_996_cast_fp16 = slice_by_index(begin = var_996_begin_0, end = var_996_end_0, end_mask = var_996_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_996_cast_fp16")];
+            tensor<int32, [4]> var_1000_begin_0 = const()[name = tensor<string, []>("op_1000_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_1000_end_0 = const()[name = tensor<string, []>("op_1000_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_1000_end_mask_0 = const()[name = tensor<string, []>("op_1000_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1000_cast_fp16 = slice_by_index(begin = var_1000_begin_0, end = var_1000_end_0, end_mask = var_1000_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1000_cast_fp16")];
+            tensor<int32, [4]> var_1004_begin_0 = const()[name = tensor<string, []>("op_1004_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_1004_end_0 = const()[name = tensor<string, []>("op_1004_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_1004_end_mask_0 = const()[name = tensor<string, []>("op_1004_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1004_cast_fp16 = slice_by_index(begin = var_1004_begin_0, end = var_1004_end_0, end_mask = var_1004_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1004_cast_fp16")];
+            tensor<int32, [4]> var_1008_begin_0 = const()[name = tensor<string, []>("op_1008_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_1008_end_0 = const()[name = tensor<string, []>("op_1008_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_1008_end_mask_0 = const()[name = tensor<string, []>("op_1008_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1008_cast_fp16 = slice_by_index(begin = var_1008_begin_0, end = var_1008_end_0, end_mask = var_1008_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1008_cast_fp16")];
+            tensor<int32, [4]> var_1012_begin_0 = const()[name = tensor<string, []>("op_1012_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_1012_end_0 = const()[name = tensor<string, []>("op_1012_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_1012_end_mask_0 = const()[name = tensor<string, []>("op_1012_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1012_cast_fp16 = slice_by_index(begin = var_1012_begin_0, end = var_1012_end_0, end_mask = var_1012_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1012_cast_fp16")];
+            tensor<int32, [4]> var_1018_begin_0 = const()[name = tensor<string, []>("op_1018_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1018_end_0 = const()[name = tensor<string, []>("op_1018_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_1018_end_mask_0 = const()[name = tensor<string, []>("op_1018_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1018_cast_fp16 = slice_by_index(begin = var_1018_begin_0, end = var_1018_end_0, end_mask = var_1018_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1018_cast_fp16")];
+            tensor<int32, [4]> var_1030_begin_0 = const()[name = tensor<string, []>("op_1030_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_1030_end_0 = const()[name = tensor<string, []>("op_1030_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_1030_end_mask_0 = const()[name = tensor<string, []>("op_1030_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1030_cast_fp16 = slice_by_index(begin = var_1030_begin_0, end = var_1030_end_0, end_mask = var_1030_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1030_cast_fp16")];
+            tensor<int32, [4]> var_1042_begin_0 = const()[name = tensor<string, []>("op_1042_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_1042_end_0 = const()[name = tensor<string, []>("op_1042_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_1042_end_mask_0 = const()[name = tensor<string, []>("op_1042_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1042_cast_fp16 = slice_by_index(begin = var_1042_begin_0, end = var_1042_end_0, end_mask = var_1042_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1042_cast_fp16")];
+            tensor<int32, [4]> var_1054_begin_0 = const()[name = tensor<string, []>("op_1054_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_1054_end_0 = const()[name = tensor<string, []>("op_1054_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1054_end_mask_0 = const()[name = tensor<string, []>("op_1054_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1054_cast_fp16 = slice_by_index(begin = var_1054_begin_0, end = var_1054_end_0, end_mask = var_1054_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1054_cast_fp16")];
+            tensor<int32, [4]> var_1066_begin_0 = const()[name = tensor<string, []>("op_1066_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_1066_end_0 = const()[name = tensor<string, []>("op_1066_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_1066_end_mask_0 = const()[name = tensor<string, []>("op_1066_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1066_cast_fp16 = slice_by_index(begin = var_1066_begin_0, end = var_1066_end_0, end_mask = var_1066_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1066_cast_fp16")];
+            tensor<int32, [4]> var_1078_begin_0 = const()[name = tensor<string, []>("op_1078_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_1078_end_0 = const()[name = tensor<string, []>("op_1078_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_1078_end_mask_0 = const()[name = tensor<string, []>("op_1078_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1078_cast_fp16 = slice_by_index(begin = var_1078_begin_0, end = var_1078_end_0, end_mask = var_1078_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1078_cast_fp16")];
+            tensor<int32, [4]> var_1090_begin_0 = const()[name = tensor<string, []>("op_1090_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_1090_end_0 = const()[name = tensor<string, []>("op_1090_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_1090_end_mask_0 = const()[name = tensor<string, []>("op_1090_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1090_cast_fp16 = slice_by_index(begin = var_1090_begin_0, end = var_1090_end_0, end_mask = var_1090_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1090_cast_fp16")];
+            tensor<int32, [4]> var_1102_begin_0 = const()[name = tensor<string, []>("op_1102_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_1102_end_0 = const()[name = tensor<string, []>("op_1102_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_1102_end_mask_0 = const()[name = tensor<string, []>("op_1102_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1102_cast_fp16 = slice_by_index(begin = var_1102_begin_0, end = var_1102_end_0, end_mask = var_1102_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1102_cast_fp16")];
+            tensor<int32, [4]> var_1112_begin_0 = const()[name = tensor<string, []>("op_1112_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1112_end_0 = const()[name = tensor<string, []>("op_1112_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_1112_end_mask_0 = const()[name = tensor<string, []>("op_1112_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1112_cast_fp16 = slice_by_index(begin = var_1112_begin_0, end = var_1112_end_0, end_mask = var_1112_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1112_cast_fp16")];
+            tensor<int32, [4]> var_1124_begin_0 = const()[name = tensor<string, []>("op_1124_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_1124_end_0 = const()[name = tensor<string, []>("op_1124_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_1124_end_mask_0 = const()[name = tensor<string, []>("op_1124_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1124_cast_fp16 = slice_by_index(begin = var_1124_begin_0, end = var_1124_end_0, end_mask = var_1124_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1124_cast_fp16")];
+            tensor<int32, [4]> var_1136_begin_0 = const()[name = tensor<string, []>("op_1136_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_1136_end_0 = const()[name = tensor<string, []>("op_1136_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_1136_end_mask_0 = const()[name = tensor<string, []>("op_1136_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1136_cast_fp16 = slice_by_index(begin = var_1136_begin_0, end = var_1136_end_0, end_mask = var_1136_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1136_cast_fp16")];
+            tensor<int32, [4]> var_1148_begin_0 = const()[name = tensor<string, []>("op_1148_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_1148_end_0 = const()[name = tensor<string, []>("op_1148_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1148_end_mask_0 = const()[name = tensor<string, []>("op_1148_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1148_cast_fp16 = slice_by_index(begin = var_1148_begin_0, end = var_1148_end_0, end_mask = var_1148_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1148_cast_fp16")];
+            tensor<int32, [4]> var_1160_begin_0 = const()[name = tensor<string, []>("op_1160_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_1160_end_0 = const()[name = tensor<string, []>("op_1160_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_1160_end_mask_0 = const()[name = tensor<string, []>("op_1160_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1160_cast_fp16 = slice_by_index(begin = var_1160_begin_0, end = var_1160_end_0, end_mask = var_1160_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1160_cast_fp16")];
+            tensor<int32, [4]> var_1172_begin_0 = const()[name = tensor<string, []>("op_1172_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_1172_end_0 = const()[name = tensor<string, []>("op_1172_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_1172_end_mask_0 = const()[name = tensor<string, []>("op_1172_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1172_cast_fp16 = slice_by_index(begin = var_1172_begin_0, end = var_1172_end_0, end_mask = var_1172_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1172_cast_fp16")];
+            tensor<int32, [4]> var_1184_begin_0 = const()[name = tensor<string, []>("op_1184_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_1184_end_0 = const()[name = tensor<string, []>("op_1184_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_1184_end_mask_0 = const()[name = tensor<string, []>("op_1184_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1184_cast_fp16 = slice_by_index(begin = var_1184_begin_0, end = var_1184_end_0, end_mask = var_1184_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1184_cast_fp16")];
+            tensor<int32, [4]> var_1196_begin_0 = const()[name = tensor<string, []>("op_1196_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_1196_end_0 = const()[name = tensor<string, []>("op_1196_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_1196_end_mask_0 = const()[name = tensor<string, []>("op_1196_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1196_cast_fp16 = slice_by_index(begin = var_1196_begin_0, end = var_1196_end_0, end_mask = var_1196_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1196_cast_fp16")];
+            tensor<string, []> var_1208_equation_0 = const()[name = tensor<string, []>("op_1208_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1208_cast_fp16 = einsum(equation = var_1208_equation_0, values = (var_1018_cast_fp16, var_920_cast_fp16))[name = tensor<string, []>("op_1208_cast_fp16")];
+            tensor<fp16, []> var_1209_to_fp16 = const()[name = tensor<string, []>("op_1209_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1210_cast_fp16 = mul(x = var_1208_cast_fp16, y = var_1209_to_fp16)[name = tensor<string, []>("op_1210_cast_fp16")];
+            tensor<string, []> var_1212_equation_0 = const()[name = tensor<string, []>("op_1212_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1212_cast_fp16 = einsum(equation = var_1212_equation_0, values = (var_1018_cast_fp16, var_924_cast_fp16))[name = tensor<string, []>("op_1212_cast_fp16")];
+            tensor<fp16, []> var_1213_to_fp16 = const()[name = tensor<string, []>("op_1213_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1214_cast_fp16 = mul(x = var_1212_cast_fp16, y = var_1213_to_fp16)[name = tensor<string, []>("op_1214_cast_fp16")];
+            tensor<string, []> var_1216_equation_0 = const()[name = tensor<string, []>("op_1216_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1216_cast_fp16 = einsum(equation = var_1216_equation_0, values = (var_1018_cast_fp16, var_928_cast_fp16))[name = tensor<string, []>("op_1216_cast_fp16")];
+            tensor<fp16, []> var_1217_to_fp16 = const()[name = tensor<string, []>("op_1217_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1218_cast_fp16 = mul(x = var_1216_cast_fp16, y = var_1217_to_fp16)[name = tensor<string, []>("op_1218_cast_fp16")];
+            tensor<string, []> var_1220_equation_0 = const()[name = tensor<string, []>("op_1220_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1220_cast_fp16 = einsum(equation = var_1220_equation_0, values = (var_1030_cast_fp16, var_932_cast_fp16))[name = tensor<string, []>("op_1220_cast_fp16")];
+            tensor<fp16, []> var_1221_to_fp16 = const()[name = tensor<string, []>("op_1221_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1222_cast_fp16 = mul(x = var_1220_cast_fp16, y = var_1221_to_fp16)[name = tensor<string, []>("op_1222_cast_fp16")];
+            tensor<string, []> var_1224_equation_0 = const()[name = tensor<string, []>("op_1224_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1224_cast_fp16 = einsum(equation = var_1224_equation_0, values = (var_1030_cast_fp16, var_936_cast_fp16))[name = tensor<string, []>("op_1224_cast_fp16")];
+            tensor<fp16, []> var_1225_to_fp16 = const()[name = tensor<string, []>("op_1225_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1226_cast_fp16 = mul(x = var_1224_cast_fp16, y = var_1225_to_fp16)[name = tensor<string, []>("op_1226_cast_fp16")];
+            tensor<string, []> var_1228_equation_0 = const()[name = tensor<string, []>("op_1228_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1228_cast_fp16 = einsum(equation = var_1228_equation_0, values = (var_1030_cast_fp16, var_940_cast_fp16))[name = tensor<string, []>("op_1228_cast_fp16")];
+            tensor<fp16, []> var_1229_to_fp16 = const()[name = tensor<string, []>("op_1229_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1230_cast_fp16 = mul(x = var_1228_cast_fp16, y = var_1229_to_fp16)[name = tensor<string, []>("op_1230_cast_fp16")];
+            tensor<string, []> var_1232_equation_0 = const()[name = tensor<string, []>("op_1232_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1232_cast_fp16 = einsum(equation = var_1232_equation_0, values = (var_1042_cast_fp16, var_944_cast_fp16))[name = tensor<string, []>("op_1232_cast_fp16")];
+            tensor<fp16, []> var_1233_to_fp16 = const()[name = tensor<string, []>("op_1233_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1234_cast_fp16 = mul(x = var_1232_cast_fp16, y = var_1233_to_fp16)[name = tensor<string, []>("op_1234_cast_fp16")];
+            tensor<string, []> var_1236_equation_0 = const()[name = tensor<string, []>("op_1236_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1236_cast_fp16 = einsum(equation = var_1236_equation_0, values = (var_1042_cast_fp16, var_948_cast_fp16))[name = tensor<string, []>("op_1236_cast_fp16")];
+            tensor<fp16, []> var_1237_to_fp16 = const()[name = tensor<string, []>("op_1237_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1238_cast_fp16 = mul(x = var_1236_cast_fp16, y = var_1237_to_fp16)[name = tensor<string, []>("op_1238_cast_fp16")];
+            tensor<string, []> var_1240_equation_0 = const()[name = tensor<string, []>("op_1240_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1240_cast_fp16 = einsum(equation = var_1240_equation_0, values = (var_1042_cast_fp16, var_952_cast_fp16))[name = tensor<string, []>("op_1240_cast_fp16")];
+            tensor<fp16, []> var_1241_to_fp16 = const()[name = tensor<string, []>("op_1241_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1242_cast_fp16 = mul(x = var_1240_cast_fp16, y = var_1241_to_fp16)[name = tensor<string, []>("op_1242_cast_fp16")];
+            tensor<string, []> var_1244_equation_0 = const()[name = tensor<string, []>("op_1244_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1244_cast_fp16 = einsum(equation = var_1244_equation_0, values = (var_1054_cast_fp16, var_956_cast_fp16))[name = tensor<string, []>("op_1244_cast_fp16")];
+            tensor<fp16, []> var_1245_to_fp16 = const()[name = tensor<string, []>("op_1245_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1246_cast_fp16 = mul(x = var_1244_cast_fp16, y = var_1245_to_fp16)[name = tensor<string, []>("op_1246_cast_fp16")];
+            tensor<string, []> var_1248_equation_0 = const()[name = tensor<string, []>("op_1248_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1248_cast_fp16 = einsum(equation = var_1248_equation_0, values = (var_1054_cast_fp16, var_960_cast_fp16))[name = tensor<string, []>("op_1248_cast_fp16")];
+            tensor<fp16, []> var_1249_to_fp16 = const()[name = tensor<string, []>("op_1249_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1250_cast_fp16 = mul(x = var_1248_cast_fp16, y = var_1249_to_fp16)[name = tensor<string, []>("op_1250_cast_fp16")];
+            tensor<string, []> var_1252_equation_0 = const()[name = tensor<string, []>("op_1252_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1252_cast_fp16 = einsum(equation = var_1252_equation_0, values = (var_1054_cast_fp16, var_964_cast_fp16))[name = tensor<string, []>("op_1252_cast_fp16")];
+            tensor<fp16, []> var_1253_to_fp16 = const()[name = tensor<string, []>("op_1253_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1254_cast_fp16 = mul(x = var_1252_cast_fp16, y = var_1253_to_fp16)[name = tensor<string, []>("op_1254_cast_fp16")];
+            tensor<string, []> var_1256_equation_0 = const()[name = tensor<string, []>("op_1256_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1256_cast_fp16 = einsum(equation = var_1256_equation_0, values = (var_1066_cast_fp16, var_968_cast_fp16))[name = tensor<string, []>("op_1256_cast_fp16")];
+            tensor<fp16, []> var_1257_to_fp16 = const()[name = tensor<string, []>("op_1257_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1258_cast_fp16 = mul(x = var_1256_cast_fp16, y = var_1257_to_fp16)[name = tensor<string, []>("op_1258_cast_fp16")];
+            tensor<string, []> var_1260_equation_0 = const()[name = tensor<string, []>("op_1260_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1260_cast_fp16 = einsum(equation = var_1260_equation_0, values = (var_1066_cast_fp16, var_972_cast_fp16))[name = tensor<string, []>("op_1260_cast_fp16")];
+            tensor<fp16, []> var_1261_to_fp16 = const()[name = tensor<string, []>("op_1261_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1262_cast_fp16 = mul(x = var_1260_cast_fp16, y = var_1261_to_fp16)[name = tensor<string, []>("op_1262_cast_fp16")];
+            tensor<string, []> var_1264_equation_0 = const()[name = tensor<string, []>("op_1264_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1264_cast_fp16 = einsum(equation = var_1264_equation_0, values = (var_1066_cast_fp16, var_976_cast_fp16))[name = tensor<string, []>("op_1264_cast_fp16")];
+            tensor<fp16, []> var_1265_to_fp16 = const()[name = tensor<string, []>("op_1265_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1266_cast_fp16 = mul(x = var_1264_cast_fp16, y = var_1265_to_fp16)[name = tensor<string, []>("op_1266_cast_fp16")];
+            tensor<string, []> var_1268_equation_0 = const()[name = tensor<string, []>("op_1268_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1268_cast_fp16 = einsum(equation = var_1268_equation_0, values = (var_1078_cast_fp16, var_980_cast_fp16))[name = tensor<string, []>("op_1268_cast_fp16")];
+            tensor<fp16, []> var_1269_to_fp16 = const()[name = tensor<string, []>("op_1269_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1270_cast_fp16 = mul(x = var_1268_cast_fp16, y = var_1269_to_fp16)[name = tensor<string, []>("op_1270_cast_fp16")];
+            tensor<string, []> var_1272_equation_0 = const()[name = tensor<string, []>("op_1272_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1272_cast_fp16 = einsum(equation = var_1272_equation_0, values = (var_1078_cast_fp16, var_984_cast_fp16))[name = tensor<string, []>("op_1272_cast_fp16")];
+            tensor<fp16, []> var_1273_to_fp16 = const()[name = tensor<string, []>("op_1273_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1274_cast_fp16 = mul(x = var_1272_cast_fp16, y = var_1273_to_fp16)[name = tensor<string, []>("op_1274_cast_fp16")];
+            tensor<string, []> var_1276_equation_0 = const()[name = tensor<string, []>("op_1276_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1276_cast_fp16 = einsum(equation = var_1276_equation_0, values = (var_1078_cast_fp16, var_988_cast_fp16))[name = tensor<string, []>("op_1276_cast_fp16")];
+            tensor<fp16, []> var_1277_to_fp16 = const()[name = tensor<string, []>("op_1277_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1278_cast_fp16 = mul(x = var_1276_cast_fp16, y = var_1277_to_fp16)[name = tensor<string, []>("op_1278_cast_fp16")];
+            tensor<string, []> var_1280_equation_0 = const()[name = tensor<string, []>("op_1280_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1280_cast_fp16 = einsum(equation = var_1280_equation_0, values = (var_1090_cast_fp16, var_992_cast_fp16))[name = tensor<string, []>("op_1280_cast_fp16")];
+            tensor<fp16, []> var_1281_to_fp16 = const()[name = tensor<string, []>("op_1281_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1282_cast_fp16 = mul(x = var_1280_cast_fp16, y = var_1281_to_fp16)[name = tensor<string, []>("op_1282_cast_fp16")];
+            tensor<string, []> var_1284_equation_0 = const()[name = tensor<string, []>("op_1284_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1284_cast_fp16 = einsum(equation = var_1284_equation_0, values = (var_1090_cast_fp16, var_996_cast_fp16))[name = tensor<string, []>("op_1284_cast_fp16")];
+            tensor<fp16, []> var_1285_to_fp16 = const()[name = tensor<string, []>("op_1285_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1286_cast_fp16 = mul(x = var_1284_cast_fp16, y = var_1285_to_fp16)[name = tensor<string, []>("op_1286_cast_fp16")];
+            tensor<string, []> var_1288_equation_0 = const()[name = tensor<string, []>("op_1288_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1288_cast_fp16 = einsum(equation = var_1288_equation_0, values = (var_1090_cast_fp16, var_1000_cast_fp16))[name = tensor<string, []>("op_1288_cast_fp16")];
+            tensor<fp16, []> var_1289_to_fp16 = const()[name = tensor<string, []>("op_1289_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1290_cast_fp16 = mul(x = var_1288_cast_fp16, y = var_1289_to_fp16)[name = tensor<string, []>("op_1290_cast_fp16")];
+            tensor<string, []> var_1292_equation_0 = const()[name = tensor<string, []>("op_1292_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1292_cast_fp16 = einsum(equation = var_1292_equation_0, values = (var_1102_cast_fp16, var_1004_cast_fp16))[name = tensor<string, []>("op_1292_cast_fp16")];
+            tensor<fp16, []> var_1293_to_fp16 = const()[name = tensor<string, []>("op_1293_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1294_cast_fp16 = mul(x = var_1292_cast_fp16, y = var_1293_to_fp16)[name = tensor<string, []>("op_1294_cast_fp16")];
+            tensor<string, []> var_1296_equation_0 = const()[name = tensor<string, []>("op_1296_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1296_cast_fp16 = einsum(equation = var_1296_equation_0, values = (var_1102_cast_fp16, var_1008_cast_fp16))[name = tensor<string, []>("op_1296_cast_fp16")];
+            tensor<fp16, []> var_1297_to_fp16 = const()[name = tensor<string, []>("op_1297_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1298_cast_fp16 = mul(x = var_1296_cast_fp16, y = var_1297_to_fp16)[name = tensor<string, []>("op_1298_cast_fp16")];
+            tensor<string, []> var_1300_equation_0 = const()[name = tensor<string, []>("op_1300_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1300_cast_fp16 = einsum(equation = var_1300_equation_0, values = (var_1102_cast_fp16, var_1012_cast_fp16))[name = tensor<string, []>("op_1300_cast_fp16")];
+            tensor<fp16, []> var_1301_to_fp16 = const()[name = tensor<string, []>("op_1301_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1302_cast_fp16 = mul(x = var_1300_cast_fp16, y = var_1301_to_fp16)[name = tensor<string, []>("op_1302_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_49_cast_fp16 = add(x = var_1210_cast_fp16, y = mask)[name = tensor<string, []>("aw_49_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_51_cast_fp16 = add(x = var_1214_cast_fp16, y = mask)[name = tensor<string, []>("aw_51_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_53_cast_fp16 = add(x = var_1218_cast_fp16, y = mask)[name = tensor<string, []>("aw_53_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_55_cast_fp16 = add(x = var_1222_cast_fp16, y = mask)[name = tensor<string, []>("aw_55_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_57_cast_fp16 = add(x = var_1226_cast_fp16, y = mask)[name = tensor<string, []>("aw_57_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_59_cast_fp16 = add(x = var_1230_cast_fp16, y = mask)[name = tensor<string, []>("aw_59_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_61_cast_fp16 = add(x = var_1234_cast_fp16, y = mask)[name = tensor<string, []>("aw_61_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_63_cast_fp16 = add(x = var_1238_cast_fp16, y = mask)[name = tensor<string, []>("aw_63_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_65_cast_fp16 = add(x = var_1242_cast_fp16, y = mask)[name = tensor<string, []>("aw_65_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_67_cast_fp16 = add(x = var_1246_cast_fp16, y = mask)[name = tensor<string, []>("aw_67_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_69_cast_fp16 = add(x = var_1250_cast_fp16, y = mask)[name = tensor<string, []>("aw_69_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_71_cast_fp16 = add(x = var_1254_cast_fp16, y = mask)[name = tensor<string, []>("aw_71_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_73_cast_fp16 = add(x = var_1258_cast_fp16, y = mask)[name = tensor<string, []>("aw_73_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_75_cast_fp16 = add(x = var_1262_cast_fp16, y = mask)[name = tensor<string, []>("aw_75_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_77_cast_fp16 = add(x = var_1266_cast_fp16, y = mask)[name = tensor<string, []>("aw_77_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_79_cast_fp16 = add(x = var_1270_cast_fp16, y = mask)[name = tensor<string, []>("aw_79_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_81_cast_fp16 = add(x = var_1274_cast_fp16, y = mask)[name = tensor<string, []>("aw_81_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_83_cast_fp16 = add(x = var_1278_cast_fp16, y = mask)[name = tensor<string, []>("aw_83_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_85_cast_fp16 = add(x = var_1282_cast_fp16, y = mask)[name = tensor<string, []>("aw_85_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_87_cast_fp16 = add(x = var_1286_cast_fp16, y = mask)[name = tensor<string, []>("aw_87_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_89_cast_fp16 = add(x = var_1290_cast_fp16, y = mask)[name = tensor<string, []>("aw_89_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_91_cast_fp16 = add(x = var_1294_cast_fp16, y = mask)[name = tensor<string, []>("aw_91_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_93_cast_fp16 = add(x = var_1298_cast_fp16, y = mask)[name = tensor<string, []>("aw_93_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_cast_fp16 = add(x = var_1302_cast_fp16, y = mask)[name = tensor<string, []>("aw_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1327_cast_fp16 = softmax(axis = var_779, x = aw_49_cast_fp16)[name = tensor<string, []>("op_1327_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1328_cast_fp16 = softmax(axis = var_779, x = aw_51_cast_fp16)[name = tensor<string, []>("op_1328_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1329_cast_fp16 = softmax(axis = var_779, x = aw_53_cast_fp16)[name = tensor<string, []>("op_1329_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1330_cast_fp16 = softmax(axis = var_779, x = aw_55_cast_fp16)[name = tensor<string, []>("op_1330_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1331_cast_fp16 = softmax(axis = var_779, x = aw_57_cast_fp16)[name = tensor<string, []>("op_1331_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1332_cast_fp16 = softmax(axis = var_779, x = aw_59_cast_fp16)[name = tensor<string, []>("op_1332_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1333_cast_fp16 = softmax(axis = var_779, x = aw_61_cast_fp16)[name = tensor<string, []>("op_1333_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1334_cast_fp16 = softmax(axis = var_779, x = aw_63_cast_fp16)[name = tensor<string, []>("op_1334_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1335_cast_fp16 = softmax(axis = var_779, x = aw_65_cast_fp16)[name = tensor<string, []>("op_1335_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1336_cast_fp16 = softmax(axis = var_779, x = aw_67_cast_fp16)[name = tensor<string, []>("op_1336_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1337_cast_fp16 = softmax(axis = var_779, x = aw_69_cast_fp16)[name = tensor<string, []>("op_1337_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1338_cast_fp16 = softmax(axis = var_779, x = aw_71_cast_fp16)[name = tensor<string, []>("op_1338_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1339_cast_fp16 = softmax(axis = var_779, x = aw_73_cast_fp16)[name = tensor<string, []>("op_1339_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1340_cast_fp16 = softmax(axis = var_779, x = aw_75_cast_fp16)[name = tensor<string, []>("op_1340_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1341_cast_fp16 = softmax(axis = var_779, x = aw_77_cast_fp16)[name = tensor<string, []>("op_1341_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1342_cast_fp16 = softmax(axis = var_779, x = aw_79_cast_fp16)[name = tensor<string, []>("op_1342_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1343_cast_fp16 = softmax(axis = var_779, x = aw_81_cast_fp16)[name = tensor<string, []>("op_1343_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1344_cast_fp16 = softmax(axis = var_779, x = aw_83_cast_fp16)[name = tensor<string, []>("op_1344_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1345_cast_fp16 = softmax(axis = var_779, x = aw_85_cast_fp16)[name = tensor<string, []>("op_1345_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1346_cast_fp16 = softmax(axis = var_779, x = aw_87_cast_fp16)[name = tensor<string, []>("op_1346_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1347_cast_fp16 = softmax(axis = var_779, x = aw_89_cast_fp16)[name = tensor<string, []>("op_1347_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1348_cast_fp16 = softmax(axis = var_779, x = aw_91_cast_fp16)[name = tensor<string, []>("op_1348_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1349_cast_fp16 = softmax(axis = var_779, x = aw_93_cast_fp16)[name = tensor<string, []>("op_1349_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1350_cast_fp16 = softmax(axis = var_779, x = aw_cast_fp16)[name = tensor<string, []>("op_1350_cast_fp16")];
+            tensor<string, []> var_1352_equation_0 = const()[name = tensor<string, []>("op_1352_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1352_cast_fp16 = einsum(equation = var_1352_equation_0, values = (var_1112_cast_fp16, var_1327_cast_fp16))[name = tensor<string, []>("op_1352_cast_fp16")];
+            tensor<string, []> var_1354_equation_0 = const()[name = tensor<string, []>("op_1354_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1354_cast_fp16 = einsum(equation = var_1354_equation_0, values = (var_1112_cast_fp16, var_1328_cast_fp16))[name = tensor<string, []>("op_1354_cast_fp16")];
+            tensor<string, []> var_1356_equation_0 = const()[name = tensor<string, []>("op_1356_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1356_cast_fp16 = einsum(equation = var_1356_equation_0, values = (var_1112_cast_fp16, var_1329_cast_fp16))[name = tensor<string, []>("op_1356_cast_fp16")];
+            tensor<string, []> var_1358_equation_0 = const()[name = tensor<string, []>("op_1358_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1358_cast_fp16 = einsum(equation = var_1358_equation_0, values = (var_1124_cast_fp16, var_1330_cast_fp16))[name = tensor<string, []>("op_1358_cast_fp16")];
+            tensor<string, []> var_1360_equation_0 = const()[name = tensor<string, []>("op_1360_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1360_cast_fp16 = einsum(equation = var_1360_equation_0, values = (var_1124_cast_fp16, var_1331_cast_fp16))[name = tensor<string, []>("op_1360_cast_fp16")];
+            tensor<string, []> var_1362_equation_0 = const()[name = tensor<string, []>("op_1362_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1362_cast_fp16 = einsum(equation = var_1362_equation_0, values = (var_1124_cast_fp16, var_1332_cast_fp16))[name = tensor<string, []>("op_1362_cast_fp16")];
+            tensor<string, []> var_1364_equation_0 = const()[name = tensor<string, []>("op_1364_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1364_cast_fp16 = einsum(equation = var_1364_equation_0, values = (var_1136_cast_fp16, var_1333_cast_fp16))[name = tensor<string, []>("op_1364_cast_fp16")];
+            tensor<string, []> var_1366_equation_0 = const()[name = tensor<string, []>("op_1366_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1366_cast_fp16 = einsum(equation = var_1366_equation_0, values = (var_1136_cast_fp16, var_1334_cast_fp16))[name = tensor<string, []>("op_1366_cast_fp16")];
+            tensor<string, []> var_1368_equation_0 = const()[name = tensor<string, []>("op_1368_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1368_cast_fp16 = einsum(equation = var_1368_equation_0, values = (var_1136_cast_fp16, var_1335_cast_fp16))[name = tensor<string, []>("op_1368_cast_fp16")];
+            tensor<string, []> var_1370_equation_0 = const()[name = tensor<string, []>("op_1370_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1370_cast_fp16 = einsum(equation = var_1370_equation_0, values = (var_1148_cast_fp16, var_1336_cast_fp16))[name = tensor<string, []>("op_1370_cast_fp16")];
+            tensor<string, []> var_1372_equation_0 = const()[name = tensor<string, []>("op_1372_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1372_cast_fp16 = einsum(equation = var_1372_equation_0, values = (var_1148_cast_fp16, var_1337_cast_fp16))[name = tensor<string, []>("op_1372_cast_fp16")];
+            tensor<string, []> var_1374_equation_0 = const()[name = tensor<string, []>("op_1374_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1374_cast_fp16 = einsum(equation = var_1374_equation_0, values = (var_1148_cast_fp16, var_1338_cast_fp16))[name = tensor<string, []>("op_1374_cast_fp16")];
+            tensor<string, []> var_1376_equation_0 = const()[name = tensor<string, []>("op_1376_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1376_cast_fp16 = einsum(equation = var_1376_equation_0, values = (var_1160_cast_fp16, var_1339_cast_fp16))[name = tensor<string, []>("op_1376_cast_fp16")];
+            tensor<string, []> var_1378_equation_0 = const()[name = tensor<string, []>("op_1378_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1378_cast_fp16 = einsum(equation = var_1378_equation_0, values = (var_1160_cast_fp16, var_1340_cast_fp16))[name = tensor<string, []>("op_1378_cast_fp16")];
+            tensor<string, []> var_1380_equation_0 = const()[name = tensor<string, []>("op_1380_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1380_cast_fp16 = einsum(equation = var_1380_equation_0, values = (var_1160_cast_fp16, var_1341_cast_fp16))[name = tensor<string, []>("op_1380_cast_fp16")];
+            tensor<string, []> var_1382_equation_0 = const()[name = tensor<string, []>("op_1382_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1382_cast_fp16 = einsum(equation = var_1382_equation_0, values = (var_1172_cast_fp16, var_1342_cast_fp16))[name = tensor<string, []>("op_1382_cast_fp16")];
+            tensor<string, []> var_1384_equation_0 = const()[name = tensor<string, []>("op_1384_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1384_cast_fp16 = einsum(equation = var_1384_equation_0, values = (var_1172_cast_fp16, var_1343_cast_fp16))[name = tensor<string, []>("op_1384_cast_fp16")];
+            tensor<string, []> var_1386_equation_0 = const()[name = tensor<string, []>("op_1386_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1386_cast_fp16 = einsum(equation = var_1386_equation_0, values = (var_1172_cast_fp16, var_1344_cast_fp16))[name = tensor<string, []>("op_1386_cast_fp16")];
+            tensor<string, []> var_1388_equation_0 = const()[name = tensor<string, []>("op_1388_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1388_cast_fp16 = einsum(equation = var_1388_equation_0, values = (var_1184_cast_fp16, var_1345_cast_fp16))[name = tensor<string, []>("op_1388_cast_fp16")];
+            tensor<string, []> var_1390_equation_0 = const()[name = tensor<string, []>("op_1390_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1390_cast_fp16 = einsum(equation = var_1390_equation_0, values = (var_1184_cast_fp16, var_1346_cast_fp16))[name = tensor<string, []>("op_1390_cast_fp16")];
+            tensor<string, []> var_1392_equation_0 = const()[name = tensor<string, []>("op_1392_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1392_cast_fp16 = einsum(equation = var_1392_equation_0, values = (var_1184_cast_fp16, var_1347_cast_fp16))[name = tensor<string, []>("op_1392_cast_fp16")];
+            tensor<string, []> var_1394_equation_0 = const()[name = tensor<string, []>("op_1394_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1394_cast_fp16 = einsum(equation = var_1394_equation_0, values = (var_1196_cast_fp16, var_1348_cast_fp16))[name = tensor<string, []>("op_1394_cast_fp16")];
+            tensor<string, []> var_1396_equation_0 = const()[name = tensor<string, []>("op_1396_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1396_cast_fp16 = einsum(equation = var_1396_equation_0, values = (var_1196_cast_fp16, var_1349_cast_fp16))[name = tensor<string, []>("op_1396_cast_fp16")];
+            tensor<string, []> var_1398_equation_0 = const()[name = tensor<string, []>("op_1398_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1398_cast_fp16 = einsum(equation = var_1398_equation_0, values = (var_1196_cast_fp16, var_1350_cast_fp16))[name = tensor<string, []>("op_1398_cast_fp16")];
+            tensor<bool, []> x_27_interleave_0 = const()[name = tensor<string, []>("x_27_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_27_cast_fp16 = concat(axis = var_779, interleave = x_27_interleave_0, values = (var_1352_cast_fp16, var_1354_cast_fp16, var_1356_cast_fp16, var_1358_cast_fp16, var_1360_cast_fp16, var_1362_cast_fp16, var_1364_cast_fp16, var_1366_cast_fp16, var_1368_cast_fp16, var_1370_cast_fp16, var_1372_cast_fp16, var_1374_cast_fp16, var_1376_cast_fp16, var_1378_cast_fp16, var_1380_cast_fp16, var_1382_cast_fp16, var_1384_cast_fp16, var_1386_cast_fp16, var_1388_cast_fp16, var_1390_cast_fp16, var_1392_cast_fp16, var_1394_cast_fp16, var_1396_cast_fp16, var_1398_cast_fp16))[name = tensor<string, []>("x_27_cast_fp16")];
+            tensor<int32, [4]> var_1403 = const()[name = tensor<string, []>("op_1403"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_13_cast_fp16 = reshape(shape = var_1403, x = x_27_cast_fp16)[name = tensor<string, []>("input_13_cast_fp16")];
+            tensor<int32, [2]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1408 = const()[name = tensor<string, []>("op_1408"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_pad_type_0 = const()[name = tensor<string, []>("attention_output_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_pad_0 = const()[name = tensor<string, []>("attention_output_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(232803776)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_cast_fp16 = conv(dilations = var_1408, groups = var_779, pad = attention_output_pad_0, pad_type = attention_output_pad_type_0, strides = var_1406, weight = blocks_1_attn_proj_weight_to_fp16, x = input_13_cast_fp16)[name = tensor<string, []>("attention_output_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_29_cast_fp16 = add(x = attention_output_cast_fp16, y = x_17_cast_fp16)[name = tensor<string, []>("x_29_cast_fp16")];
+            tensor<bool, []> x_eps_interleave_0 = const()[name = tensor<string, []>("x_eps_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_to_fp16 = const()[name = tensor<string, []>("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678208)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_cast_fp16 = concat(axis = var_779, interleave = x_eps_interleave_0, values = (x_29_cast_fp16, eps_chan_to_fp16))[name = tensor<string, []>("x_eps_cast_fp16")];
+            tensor<int32, [1]> norm_x_axes_0 = const()[name = tensor<string, []>("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_782, x = x_eps_cast_fp16)[name = tensor<string, []>("norm_x_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_19_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_cast_fp16)[name = tensor<string, []>("x_normed_19_cast_fp16")];
+            tensor<fp16, []> var_1434_to_fp16 = const()[name = tensor<string, []>("op_1434_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_1434_to_fp16)[name = tensor<string, []>("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678400)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_15_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = tensor<string, []>("input_15_cast_fp16")];
+            tensor<int32, [2]> var_1445 = const()[name = tensor<string, []>("op_1445"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1447 = const()[name = tensor<string, []>("op_1447"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_17_pad_type_0 = const()[name = tensor<string, []>("input_17_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_17_pad_0 = const()[name = tensor<string, []>("input_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251684608)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_17_cast_fp16 = conv(dilations = var_1447, groups = var_779, pad = input_17_pad_0, pad_type = input_17_pad_type_0, strides = var_1445, weight = blocks_1_mlp_fc_1_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("input_17_cast_fp16")];
+            tensor<int32, [2]> var_1451 = const()[name = tensor<string, []>("op_1451"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1453 = const()[name = tensor<string, []>("op_1453"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_pad_0 = const()[name = tensor<string, []>("x_fc_2_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(302016320)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_cast_fp16 = conv(dilations = var_1453, groups = var_779, pad = x_fc_2_pad_0, pad_type = x_fc_2_pad_type_0, strides = var_1451, weight = blocks_1_mlp_fc_2_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_1456_cast_fp16 = silu(x = input_17_cast_fp16)[name = tensor<string, []>("op_1456_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_cast_fp16 = mul(x = var_1456_cast_fp16, y = x_fc_2_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
+            tensor<int32, [2]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1463_pad_type_0 = const()[name = tensor<string, []>("op_1463_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1463_pad_0 = const()[name = tensor<string, []>("op_1463_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_1_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(352348032)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_1463_cast_fp16 = conv(dilations = var_1461, groups = var_779, pad = var_1463_pad_0, pad_type = var_1463_pad_type_0, strides = var_1459, weight = blocks_1_mlp_proj_weight_to_fp16, x = input_cast_fp16)[name = tensor<string, []>("op_1463_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> new_x = add(x = var_1463_cast_fp16, y = x_29_cast_fp16)[name = tensor<string, []>("op_1464_cast_fp16")];
+        } -> (new_x, new_k_cache_0, new_v_cache_0, new_k_cache_1, new_v_cache_1);
+}
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk8.mlmodelc/weights/weight.bin b/Llama-3.2-3B-Instruct_chunk8.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e676f21da72dcaa7df5a68e36aafc518dd39b25c
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk8.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22a969fcfd92d31cf91e96a0621eb9737df6c6dfe11175f1ab9b7138f88a88ac
+size 402679744
diff --git a/Llama-3.2-3B-Instruct_chunk9.mlmodelc/analytics/coremldata.bin b/Llama-3.2-3B-Instruct_chunk9.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6a63af39cde8e590e41fffd270ab8aede737490d
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk9.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf21e446e7587de3fd840eae95f3e79729298df568725552f7ef5fd8f954e58c
+size 243
diff --git a/Llama-3.2-3B-Instruct_chunk9.mlmodelc/coremldata.bin b/Llama-3.2-3B-Instruct_chunk9.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ef844658693d8a7fc2951abf2761f8f5f9bc62c3
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk9.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8129d684aa1ea8b76708a186fe44f7ffc4aa08b4854907105fe41c0825e71875
+size 653
diff --git a/Llama-3.2-3B-Instruct_chunk9.mlmodelc/metadata.json b/Llama-3.2-3B-Instruct_chunk9.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..d8c72890b21d6a9ba891839a3cf16aff57935ee8
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk9.mlmodelc/metadata.json
@@ -0,0 +1,178 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "new_x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 7,
+    "mlProgramOperationTypeHistogram" : {
+      "Concat" : 14,
+      "Ios16.mul" : 70,
+      "SliceByIndex" : 88,
+      "Transpose" : 2,
+      "Ios16.einsum" : 96,
+      "Ios16.conv" : 14,
+      "Ios16.add" : 56,
+      "Ios16.realDiv" : 4,
+      "Ios16.softmax" : 48,
+      "Ios16.reduceL2Norm" : 4,
+      "Ios16.reshape" : 14,
+      "Ios16.silu" : 2
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "13.0",
+      "tvOS" : "16.0",
+      "visionOS" : "1.0",
+      "watchOS" : "9.0",
+      "iOS" : "16.0",
+      "macCatalyst" : "16.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.source" : "torch==2.1.0",
+      "com.github.apple.coremltools.version" : "8.0b1"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 3072 × 8 × 8)",
+        "shortDescription" : "",
+        "shape" : "[1, 3072, 8, 8]",
+        "name" : "x",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "cos",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 128 × 64)",
+        "shortDescription" : "",
+        "shape" : "[128, 64]",
+        "name" : "sin",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 512 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 512, 1, 64]",
+        "name" : "mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)?",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "k_cache_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "1",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)?",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "v_cache_1",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "Llama_3_2_3B_Instruct_2024_11_09_16_14_37_chunk9",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk9.mlmodelc/model.mil b/Llama-3.2-3B-Instruct_chunk9.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..78594b4291dc45ae43652f9a31200581b19ad3c6
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk9.mlmodelc/model.mil
@@ -0,0 +1,956 @@
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3304.5.2"}, {"coremlc-version", "3304.6.2"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
+{
+    func main<ios16>(tensor<fp16, [128, 64]> cos, tensor<fp16, [1, 448, 1, 1024]> k_cache_0, tensor<fp16, [1, 448, 1, 1024]> k_cache_1, tensor<fp16, [1, 512, 1, 64]> mask, tensor<fp16, [128, 64]> sin, tensor<fp16, [1, 1024, 1, 448]> v_cache_0, tensor<fp16, [1, 1024, 1, 448]> v_cache_1, tensor<fp16, [1, 3072, 8, 8]> x) [CoreML_InputDefaultValues = dict<tensor<string, []>, tensor<fp32, []>>({{"k_cache_0", 0}, {"k_cache_1", 0}, {"v_cache_0", 0}, {"v_cache_1", 0}})] {
+            tensor<int32, []> var_13 = const()[name = tensor<string, []>("op_13"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_17 = const()[name = tensor<string, []>("op_17"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_19 = const()[name = tensor<string, []>("op_19"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_55 = const()[name = tensor<string, []>("op_55"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_1_interleave_0 = const()[name = tensor<string, []>("x_eps_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_1_to_fp16 = const()[name = tensor<string, []>("eps_chan_1_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_1_cast_fp16 = concat(axis = var_52, interleave = x_eps_1_interleave_0, values = (x, eps_chan_1_to_fp16))[name = tensor<string, []>("x_eps_1_cast_fp16")];
+            tensor<int32, [1]> norm_x_1_axes_0 = const()[name = tensor<string, []>("norm_x_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_1_cast_fp16 = reduce_l2_norm(axes = norm_x_1_axes_0, keep_dims = var_55, x = x_eps_1_cast_fp16)[name = tensor<string, []>("norm_x_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_1_cast_fp16 = real_div(x = x, y = norm_x_1_cast_fp16)[name = tensor<string, []>("x_normed_1_cast_fp16")];
+            tensor<fp16, []> var_79_to_fp16 = const()[name = tensor<string, []>("op_79_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_3_cast_fp16 = mul(x = x_normed_1_cast_fp16, y = var_79_to_fp16)[name = tensor<string, []>("x_normed_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(256)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_5_cast_fp16 = mul(x = x_normed_3_cast_fp16, y = blocks_0_norm_1_weight_to_fp16)[name = tensor<string, []>("x_5_cast_fp16")];
+            tensor<int32, [4]> var_100 = const()[name = tensor<string, []>("op_100"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_1_cast_fp16 = reshape(shape = var_100, x = x_5_cast_fp16)[name = tensor<string, []>("input_1_cast_fp16")];
+            tensor<int32, [2]> var_103 = const()[name = tensor<string, []>("op_103"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_105 = const()[name = tensor<string, []>("op_105"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_1_pad_type_0 = const()[name = tensor<string, []>("q_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_1_pad_0 = const()[name = tensor<string, []>("q_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6464)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_1_cast_fp16 = conv(dilations = var_105, groups = var_52, pad = q_1_pad_0, pad_type = q_1_pad_type_0, strides = var_103, weight = blocks_0_attn_q_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("q_1_cast_fp16")];
+            tensor<int32, [2]> var_109 = const()[name = tensor<string, []>("op_109"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_111 = const()[name = tensor<string, []>("op_111"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_1_pad_type_0 = const()[name = tensor<string, []>("k_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_1_pad_0 = const()[name = tensor<string, []>("k_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18880896)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_1_cast_fp16 = conv(dilations = var_111, groups = var_52, pad = k_1_pad_0, pad_type = k_1_pad_type_0, strides = var_109, weight = blocks_0_attn_k_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("k_1_cast_fp16")];
+            tensor<int32, [2]> var_115 = const()[name = tensor<string, []>("op_115"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_117 = const()[name = tensor<string, []>("op_117"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_1_pad_type_0 = const()[name = tensor<string, []>("v_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_1_pad_0 = const()[name = tensor<string, []>("v_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_0_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25172416)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_1_cast_fp16 = conv(dilations = var_117, groups = var_52, pad = v_1_pad_0, pad_type = v_1_pad_type_0, strides = var_115, weight = blocks_0_attn_v_proj_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("v_1_cast_fp16")];
+            tensor<int32, [4]> var_120 = const()[name = tensor<string, []>("op_120"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_3_cast_fp16 = reshape(shape = var_120, x = q_1_cast_fp16)[name = tensor<string, []>("q_3_cast_fp16")];
+            tensor<int32, [4]> var_122 = const()[name = tensor<string, []>("op_122"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_3_cast_fp16 = reshape(shape = var_122, x = k_1_cast_fp16)[name = tensor<string, []>("k_3_cast_fp16")];
+            tensor<int32, [4]> var_136_begin_0 = const()[name = tensor<string, []>("op_136_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_136_end_0 = const()[name = tensor<string, []>("op_136_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_136_end_mask_0 = const()[name = tensor<string, []>("op_136_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_136_cast_fp16 = slice_by_index(begin = var_136_begin_0, end = var_136_end_0, end_mask = var_136_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_136_cast_fp16")];
+            tensor<int32, [4]> var_142_begin_0 = const()[name = tensor<string, []>("op_142_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_142_end_0 = const()[name = tensor<string, []>("op_142_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_142_end_mask_0 = const()[name = tensor<string, []>("op_142_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_142_cast_fp16 = slice_by_index(begin = var_142_begin_0, end = var_142_end_0, end_mask = var_142_end_mask_0, x = q_3_cast_fp16)[name = tensor<string, []>("op_142_cast_fp16")];
+            tensor<fp16, []> const_10_promoted_to_fp16 = const()[name = tensor<string, []>("const_10_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_144_cast_fp16 = mul(x = var_142_cast_fp16, y = const_10_promoted_to_fp16)[name = tensor<string, []>("op_144_cast_fp16")];
+            tensor<bool, []> rotated_1_interleave_0 = const()[name = tensor<string, []>("rotated_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_1_cast_fp16 = concat(axis = var_17, interleave = rotated_1_interleave_0, values = (var_144_cast_fp16, var_136_cast_fp16))[name = tensor<string, []>("rotated_1_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_147_cast_fp16 = mul(x = q_3_cast_fp16, y = cos)[name = tensor<string, []>("op_147_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_148_cast_fp16 = mul(x = rotated_1_cast_fp16, y = sin)[name = tensor<string, []>("op_148_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_1_cast_fp16 = add(x = var_147_cast_fp16, y = var_148_cast_fp16)[name = tensor<string, []>("roped_1_cast_fp16")];
+            tensor<int32, [4]> var_161_begin_0 = const()[name = tensor<string, []>("op_161_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_161_end_0 = const()[name = tensor<string, []>("op_161_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_161_end_mask_0 = const()[name = tensor<string, []>("op_161_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_161_cast_fp16 = slice_by_index(begin = var_161_begin_0, end = var_161_end_0, end_mask = var_161_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_161_cast_fp16")];
+            tensor<int32, [4]> var_167_begin_0 = const()[name = tensor<string, []>("op_167_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_167_end_0 = const()[name = tensor<string, []>("op_167_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_167_end_mask_0 = const()[name = tensor<string, []>("op_167_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_167_cast_fp16 = slice_by_index(begin = var_167_begin_0, end = var_167_end_0, end_mask = var_167_end_mask_0, x = k_3_cast_fp16)[name = tensor<string, []>("op_167_cast_fp16")];
+            tensor<fp16, []> const_12_promoted_to_fp16 = const()[name = tensor<string, []>("const_12_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_169_cast_fp16 = mul(x = var_167_cast_fp16, y = const_12_promoted_to_fp16)[name = tensor<string, []>("op_169_cast_fp16")];
+            tensor<bool, []> rotated_3_interleave_0 = const()[name = tensor<string, []>("rotated_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_3_cast_fp16 = concat(axis = var_17, interleave = rotated_3_interleave_0, values = (var_169_cast_fp16, var_161_cast_fp16))[name = tensor<string, []>("rotated_3_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_172_cast_fp16 = mul(x = k_3_cast_fp16, y = cos)[name = tensor<string, []>("op_172_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_173_cast_fp16 = mul(x = rotated_3_cast_fp16, y = sin)[name = tensor<string, []>("op_173_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_3_cast_fp16 = add(x = var_172_cast_fp16, y = var_173_cast_fp16)[name = tensor<string, []>("roped_3_cast_fp16")];
+            tensor<int32, [4]> var_176 = const()[name = tensor<string, []>("op_176"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_7_cast_fp16 = reshape(shape = var_176, x = roped_3_cast_fp16)[name = tensor<string, []>("k_7_cast_fp16")];
+            tensor<int32, [4]> var_178 = const()[name = tensor<string, []>("op_178"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_0 = reshape(shape = var_178, x = v_1_cast_fp16)[name = tensor<string, []>("new_v_cache_0_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_9_perm_0 = const()[name = tensor<string, []>("k_9_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_11_interleave_0 = const()[name = tensor<string, []>("k_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_0 = transpose(perm = k_9_perm_0, x = k_7_cast_fp16)[name = tensor<string, []>("transpose_1")];
+            tensor<fp16, [1, 512, 1, 1024]> k_11_cast_fp16 = concat(axis = var_19, interleave = k_11_interleave_0, values = (k_cache_0, new_k_cache_0))[name = tensor<string, []>("k_11_cast_fp16")];
+            tensor<bool, []> v_7_interleave_0 = const()[name = tensor<string, []>("v_7_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_7_cast_fp16 = concat(axis = var_13, interleave = v_7_interleave_0, values = (v_cache_0, new_v_cache_0))[name = tensor<string, []>("v_7_cast_fp16")];
+            tensor<int32, [4]> var_186 = const()[name = tensor<string, []>("op_186"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_7_cast_fp16 = reshape(shape = var_186, x = roped_1_cast_fp16)[name = tensor<string, []>("q_7_cast_fp16")];
+            tensor<int32, [4]> var_191_begin_0 = const()[name = tensor<string, []>("op_191_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_191_end_0 = const()[name = tensor<string, []>("op_191_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_191_end_mask_0 = const()[name = tensor<string, []>("op_191_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_191_cast_fp16 = slice_by_index(begin = var_191_begin_0, end = var_191_end_0, end_mask = var_191_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_191_cast_fp16")];
+            tensor<int32, [4]> var_195_begin_0 = const()[name = tensor<string, []>("op_195_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_195_end_0 = const()[name = tensor<string, []>("op_195_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_195_end_mask_0 = const()[name = tensor<string, []>("op_195_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_195_cast_fp16 = slice_by_index(begin = var_195_begin_0, end = var_195_end_0, end_mask = var_195_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_195_cast_fp16")];
+            tensor<int32, [4]> var_199_begin_0 = const()[name = tensor<string, []>("op_199_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_199_end_0 = const()[name = tensor<string, []>("op_199_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_199_end_mask_0 = const()[name = tensor<string, []>("op_199_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_199_cast_fp16 = slice_by_index(begin = var_199_begin_0, end = var_199_end_0, end_mask = var_199_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_199_cast_fp16")];
+            tensor<int32, [4]> var_203_begin_0 = const()[name = tensor<string, []>("op_203_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_203_end_0 = const()[name = tensor<string, []>("op_203_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_203_end_mask_0 = const()[name = tensor<string, []>("op_203_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_203_cast_fp16 = slice_by_index(begin = var_203_begin_0, end = var_203_end_0, end_mask = var_203_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_203_cast_fp16")];
+            tensor<int32, [4]> var_207_begin_0 = const()[name = tensor<string, []>("op_207_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_207_end_0 = const()[name = tensor<string, []>("op_207_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_207_end_mask_0 = const()[name = tensor<string, []>("op_207_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_207_cast_fp16 = slice_by_index(begin = var_207_begin_0, end = var_207_end_0, end_mask = var_207_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_207_cast_fp16")];
+            tensor<int32, [4]> var_211_begin_0 = const()[name = tensor<string, []>("op_211_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_211_end_0 = const()[name = tensor<string, []>("op_211_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_211_end_mask_0 = const()[name = tensor<string, []>("op_211_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_211_cast_fp16 = slice_by_index(begin = var_211_begin_0, end = var_211_end_0, end_mask = var_211_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_211_cast_fp16")];
+            tensor<int32, [4]> var_215_begin_0 = const()[name = tensor<string, []>("op_215_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_215_end_0 = const()[name = tensor<string, []>("op_215_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_215_end_mask_0 = const()[name = tensor<string, []>("op_215_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_215_cast_fp16 = slice_by_index(begin = var_215_begin_0, end = var_215_end_0, end_mask = var_215_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_215_cast_fp16")];
+            tensor<int32, [4]> var_219_begin_0 = const()[name = tensor<string, []>("op_219_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_219_end_0 = const()[name = tensor<string, []>("op_219_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_219_end_mask_0 = const()[name = tensor<string, []>("op_219_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_219_cast_fp16 = slice_by_index(begin = var_219_begin_0, end = var_219_end_0, end_mask = var_219_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_219_cast_fp16")];
+            tensor<int32, [4]> var_223_begin_0 = const()[name = tensor<string, []>("op_223_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_223_end_0 = const()[name = tensor<string, []>("op_223_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_223_end_mask_0 = const()[name = tensor<string, []>("op_223_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_223_cast_fp16 = slice_by_index(begin = var_223_begin_0, end = var_223_end_0, end_mask = var_223_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_223_cast_fp16")];
+            tensor<int32, [4]> var_227_begin_0 = const()[name = tensor<string, []>("op_227_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_227_end_0 = const()[name = tensor<string, []>("op_227_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_227_end_mask_0 = const()[name = tensor<string, []>("op_227_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_227_cast_fp16 = slice_by_index(begin = var_227_begin_0, end = var_227_end_0, end_mask = var_227_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_227_cast_fp16")];
+            tensor<int32, [4]> var_231_begin_0 = const()[name = tensor<string, []>("op_231_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_231_end_0 = const()[name = tensor<string, []>("op_231_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_231_end_mask_0 = const()[name = tensor<string, []>("op_231_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_231_cast_fp16 = slice_by_index(begin = var_231_begin_0, end = var_231_end_0, end_mask = var_231_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_231_cast_fp16")];
+            tensor<int32, [4]> var_235_begin_0 = const()[name = tensor<string, []>("op_235_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_235_end_0 = const()[name = tensor<string, []>("op_235_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_235_end_mask_0 = const()[name = tensor<string, []>("op_235_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_235_cast_fp16 = slice_by_index(begin = var_235_begin_0, end = var_235_end_0, end_mask = var_235_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_235_cast_fp16")];
+            tensor<int32, [4]> var_239_begin_0 = const()[name = tensor<string, []>("op_239_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_239_end_0 = const()[name = tensor<string, []>("op_239_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_239_end_mask_0 = const()[name = tensor<string, []>("op_239_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_239_cast_fp16 = slice_by_index(begin = var_239_begin_0, end = var_239_end_0, end_mask = var_239_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_239_cast_fp16")];
+            tensor<int32, [4]> var_243_begin_0 = const()[name = tensor<string, []>("op_243_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_243_end_0 = const()[name = tensor<string, []>("op_243_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_243_end_mask_0 = const()[name = tensor<string, []>("op_243_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_243_cast_fp16 = slice_by_index(begin = var_243_begin_0, end = var_243_end_0, end_mask = var_243_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_243_cast_fp16")];
+            tensor<int32, [4]> var_247_begin_0 = const()[name = tensor<string, []>("op_247_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_247_end_0 = const()[name = tensor<string, []>("op_247_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_247_end_mask_0 = const()[name = tensor<string, []>("op_247_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_247_cast_fp16 = slice_by_index(begin = var_247_begin_0, end = var_247_end_0, end_mask = var_247_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_247_cast_fp16")];
+            tensor<int32, [4]> var_251_begin_0 = const()[name = tensor<string, []>("op_251_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_251_end_0 = const()[name = tensor<string, []>("op_251_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_251_end_mask_0 = const()[name = tensor<string, []>("op_251_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_251_cast_fp16 = slice_by_index(begin = var_251_begin_0, end = var_251_end_0, end_mask = var_251_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_251_cast_fp16")];
+            tensor<int32, [4]> var_255_begin_0 = const()[name = tensor<string, []>("op_255_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_255_end_0 = const()[name = tensor<string, []>("op_255_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_255_end_mask_0 = const()[name = tensor<string, []>("op_255_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_255_cast_fp16 = slice_by_index(begin = var_255_begin_0, end = var_255_end_0, end_mask = var_255_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_255_cast_fp16")];
+            tensor<int32, [4]> var_259_begin_0 = const()[name = tensor<string, []>("op_259_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_259_end_0 = const()[name = tensor<string, []>("op_259_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_259_end_mask_0 = const()[name = tensor<string, []>("op_259_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_259_cast_fp16 = slice_by_index(begin = var_259_begin_0, end = var_259_end_0, end_mask = var_259_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_259_cast_fp16")];
+            tensor<int32, [4]> var_263_begin_0 = const()[name = tensor<string, []>("op_263_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_263_end_0 = const()[name = tensor<string, []>("op_263_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_263_end_mask_0 = const()[name = tensor<string, []>("op_263_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_263_cast_fp16 = slice_by_index(begin = var_263_begin_0, end = var_263_end_0, end_mask = var_263_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_263_cast_fp16")];
+            tensor<int32, [4]> var_267_begin_0 = const()[name = tensor<string, []>("op_267_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_267_end_0 = const()[name = tensor<string, []>("op_267_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_267_end_mask_0 = const()[name = tensor<string, []>("op_267_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_267_cast_fp16 = slice_by_index(begin = var_267_begin_0, end = var_267_end_0, end_mask = var_267_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_267_cast_fp16")];
+            tensor<int32, [4]> var_271_begin_0 = const()[name = tensor<string, []>("op_271_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_271_end_0 = const()[name = tensor<string, []>("op_271_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_271_end_mask_0 = const()[name = tensor<string, []>("op_271_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_271_cast_fp16 = slice_by_index(begin = var_271_begin_0, end = var_271_end_0, end_mask = var_271_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_271_cast_fp16")];
+            tensor<int32, [4]> var_275_begin_0 = const()[name = tensor<string, []>("op_275_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_275_end_0 = const()[name = tensor<string, []>("op_275_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_275_end_mask_0 = const()[name = tensor<string, []>("op_275_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_275_cast_fp16 = slice_by_index(begin = var_275_begin_0, end = var_275_end_0, end_mask = var_275_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_275_cast_fp16")];
+            tensor<int32, [4]> var_279_begin_0 = const()[name = tensor<string, []>("op_279_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_279_end_0 = const()[name = tensor<string, []>("op_279_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_279_end_mask_0 = const()[name = tensor<string, []>("op_279_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_279_cast_fp16 = slice_by_index(begin = var_279_begin_0, end = var_279_end_0, end_mask = var_279_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_279_cast_fp16")];
+            tensor<int32, [4]> var_283_begin_0 = const()[name = tensor<string, []>("op_283_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_283_end_0 = const()[name = tensor<string, []>("op_283_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_283_end_mask_0 = const()[name = tensor<string, []>("op_283_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_283_cast_fp16 = slice_by_index(begin = var_283_begin_0, end = var_283_end_0, end_mask = var_283_end_mask_0, x = q_7_cast_fp16)[name = tensor<string, []>("op_283_cast_fp16")];
+            tensor<int32, [4]> var_289_begin_0 = const()[name = tensor<string, []>("op_289_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_289_end_0 = const()[name = tensor<string, []>("op_289_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_289_end_mask_0 = const()[name = tensor<string, []>("op_289_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_289_cast_fp16 = slice_by_index(begin = var_289_begin_0, end = var_289_end_0, end_mask = var_289_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_289_cast_fp16")];
+            tensor<int32, [4]> var_301_begin_0 = const()[name = tensor<string, []>("op_301_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_301_end_0 = const()[name = tensor<string, []>("op_301_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_301_end_mask_0 = const()[name = tensor<string, []>("op_301_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_301_cast_fp16 = slice_by_index(begin = var_301_begin_0, end = var_301_end_0, end_mask = var_301_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_301_cast_fp16")];
+            tensor<int32, [4]> var_313_begin_0 = const()[name = tensor<string, []>("op_313_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_313_end_0 = const()[name = tensor<string, []>("op_313_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_313_end_mask_0 = const()[name = tensor<string, []>("op_313_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_313_cast_fp16 = slice_by_index(begin = var_313_begin_0, end = var_313_end_0, end_mask = var_313_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_313_cast_fp16")];
+            tensor<int32, [4]> var_325_begin_0 = const()[name = tensor<string, []>("op_325_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_325_end_0 = const()[name = tensor<string, []>("op_325_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_325_end_mask_0 = const()[name = tensor<string, []>("op_325_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_325_cast_fp16 = slice_by_index(begin = var_325_begin_0, end = var_325_end_0, end_mask = var_325_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_325_cast_fp16")];
+            tensor<int32, [4]> var_337_begin_0 = const()[name = tensor<string, []>("op_337_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_337_end_0 = const()[name = tensor<string, []>("op_337_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_337_end_mask_0 = const()[name = tensor<string, []>("op_337_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_337_cast_fp16 = slice_by_index(begin = var_337_begin_0, end = var_337_end_0, end_mask = var_337_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_337_cast_fp16")];
+            tensor<int32, [4]> var_349_begin_0 = const()[name = tensor<string, []>("op_349_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_349_end_0 = const()[name = tensor<string, []>("op_349_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_349_end_mask_0 = const()[name = tensor<string, []>("op_349_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_349_cast_fp16 = slice_by_index(begin = var_349_begin_0, end = var_349_end_0, end_mask = var_349_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_349_cast_fp16")];
+            tensor<int32, [4]> var_361_begin_0 = const()[name = tensor<string, []>("op_361_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_361_end_0 = const()[name = tensor<string, []>("op_361_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_361_end_mask_0 = const()[name = tensor<string, []>("op_361_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_361_cast_fp16 = slice_by_index(begin = var_361_begin_0, end = var_361_end_0, end_mask = var_361_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_361_cast_fp16")];
+            tensor<int32, [4]> var_373_begin_0 = const()[name = tensor<string, []>("op_373_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_373_end_0 = const()[name = tensor<string, []>("op_373_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_373_end_mask_0 = const()[name = tensor<string, []>("op_373_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_373_cast_fp16 = slice_by_index(begin = var_373_begin_0, end = var_373_end_0, end_mask = var_373_end_mask_0, x = k_11_cast_fp16)[name = tensor<string, []>("op_373_cast_fp16")];
+            tensor<int32, [4]> var_383_begin_0 = const()[name = tensor<string, []>("op_383_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_383_end_0 = const()[name = tensor<string, []>("op_383_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_383_end_mask_0 = const()[name = tensor<string, []>("op_383_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_383_cast_fp16 = slice_by_index(begin = var_383_begin_0, end = var_383_end_0, end_mask = var_383_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_383_cast_fp16")];
+            tensor<int32, [4]> var_395_begin_0 = const()[name = tensor<string, []>("op_395_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_395_end_0 = const()[name = tensor<string, []>("op_395_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_395_end_mask_0 = const()[name = tensor<string, []>("op_395_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_395_cast_fp16 = slice_by_index(begin = var_395_begin_0, end = var_395_end_0, end_mask = var_395_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_395_cast_fp16")];
+            tensor<int32, [4]> var_407_begin_0 = const()[name = tensor<string, []>("op_407_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_407_end_0 = const()[name = tensor<string, []>("op_407_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_407_end_mask_0 = const()[name = tensor<string, []>("op_407_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_407_cast_fp16 = slice_by_index(begin = var_407_begin_0, end = var_407_end_0, end_mask = var_407_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_407_cast_fp16")];
+            tensor<int32, [4]> var_419_begin_0 = const()[name = tensor<string, []>("op_419_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_419_end_0 = const()[name = tensor<string, []>("op_419_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_419_end_mask_0 = const()[name = tensor<string, []>("op_419_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_419_cast_fp16 = slice_by_index(begin = var_419_begin_0, end = var_419_end_0, end_mask = var_419_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_419_cast_fp16")];
+            tensor<int32, [4]> var_431_begin_0 = const()[name = tensor<string, []>("op_431_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_431_end_0 = const()[name = tensor<string, []>("op_431_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_431_end_mask_0 = const()[name = tensor<string, []>("op_431_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_431_cast_fp16 = slice_by_index(begin = var_431_begin_0, end = var_431_end_0, end_mask = var_431_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_431_cast_fp16")];
+            tensor<int32, [4]> var_443_begin_0 = const()[name = tensor<string, []>("op_443_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_443_end_0 = const()[name = tensor<string, []>("op_443_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_443_end_mask_0 = const()[name = tensor<string, []>("op_443_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_443_cast_fp16 = slice_by_index(begin = var_443_begin_0, end = var_443_end_0, end_mask = var_443_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_443_cast_fp16")];
+            tensor<int32, [4]> var_455_begin_0 = const()[name = tensor<string, []>("op_455_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_455_end_0 = const()[name = tensor<string, []>("op_455_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_455_end_mask_0 = const()[name = tensor<string, []>("op_455_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_455_cast_fp16 = slice_by_index(begin = var_455_begin_0, end = var_455_end_0, end_mask = var_455_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_455_cast_fp16")];
+            tensor<int32, [4]> var_467_begin_0 = const()[name = tensor<string, []>("op_467_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_467_end_0 = const()[name = tensor<string, []>("op_467_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_467_end_mask_0 = const()[name = tensor<string, []>("op_467_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_467_cast_fp16 = slice_by_index(begin = var_467_begin_0, end = var_467_end_0, end_mask = var_467_end_mask_0, x = v_7_cast_fp16)[name = tensor<string, []>("op_467_cast_fp16")];
+            tensor<string, []> var_479_equation_0 = const()[name = tensor<string, []>("op_479_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_479_cast_fp16 = einsum(equation = var_479_equation_0, values = (var_289_cast_fp16, var_191_cast_fp16))[name = tensor<string, []>("op_479_cast_fp16")];
+            tensor<fp16, []> var_480_to_fp16 = const()[name = tensor<string, []>("op_480_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_481_cast_fp16 = mul(x = var_479_cast_fp16, y = var_480_to_fp16)[name = tensor<string, []>("op_481_cast_fp16")];
+            tensor<string, []> var_483_equation_0 = const()[name = tensor<string, []>("op_483_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_483_cast_fp16 = einsum(equation = var_483_equation_0, values = (var_289_cast_fp16, var_195_cast_fp16))[name = tensor<string, []>("op_483_cast_fp16")];
+            tensor<fp16, []> var_484_to_fp16 = const()[name = tensor<string, []>("op_484_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_485_cast_fp16 = mul(x = var_483_cast_fp16, y = var_484_to_fp16)[name = tensor<string, []>("op_485_cast_fp16")];
+            tensor<string, []> var_487_equation_0 = const()[name = tensor<string, []>("op_487_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_487_cast_fp16 = einsum(equation = var_487_equation_0, values = (var_289_cast_fp16, var_199_cast_fp16))[name = tensor<string, []>("op_487_cast_fp16")];
+            tensor<fp16, []> var_488_to_fp16 = const()[name = tensor<string, []>("op_488_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_489_cast_fp16 = mul(x = var_487_cast_fp16, y = var_488_to_fp16)[name = tensor<string, []>("op_489_cast_fp16")];
+            tensor<string, []> var_491_equation_0 = const()[name = tensor<string, []>("op_491_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_491_cast_fp16 = einsum(equation = var_491_equation_0, values = (var_301_cast_fp16, var_203_cast_fp16))[name = tensor<string, []>("op_491_cast_fp16")];
+            tensor<fp16, []> var_492_to_fp16 = const()[name = tensor<string, []>("op_492_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_493_cast_fp16 = mul(x = var_491_cast_fp16, y = var_492_to_fp16)[name = tensor<string, []>("op_493_cast_fp16")];
+            tensor<string, []> var_495_equation_0 = const()[name = tensor<string, []>("op_495_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_495_cast_fp16 = einsum(equation = var_495_equation_0, values = (var_301_cast_fp16, var_207_cast_fp16))[name = tensor<string, []>("op_495_cast_fp16")];
+            tensor<fp16, []> var_496_to_fp16 = const()[name = tensor<string, []>("op_496_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_497_cast_fp16 = mul(x = var_495_cast_fp16, y = var_496_to_fp16)[name = tensor<string, []>("op_497_cast_fp16")];
+            tensor<string, []> var_499_equation_0 = const()[name = tensor<string, []>("op_499_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_499_cast_fp16 = einsum(equation = var_499_equation_0, values = (var_301_cast_fp16, var_211_cast_fp16))[name = tensor<string, []>("op_499_cast_fp16")];
+            tensor<fp16, []> var_500_to_fp16 = const()[name = tensor<string, []>("op_500_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_501_cast_fp16 = mul(x = var_499_cast_fp16, y = var_500_to_fp16)[name = tensor<string, []>("op_501_cast_fp16")];
+            tensor<string, []> var_503_equation_0 = const()[name = tensor<string, []>("op_503_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_503_cast_fp16 = einsum(equation = var_503_equation_0, values = (var_313_cast_fp16, var_215_cast_fp16))[name = tensor<string, []>("op_503_cast_fp16")];
+            tensor<fp16, []> var_504_to_fp16 = const()[name = tensor<string, []>("op_504_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_505_cast_fp16 = mul(x = var_503_cast_fp16, y = var_504_to_fp16)[name = tensor<string, []>("op_505_cast_fp16")];
+            tensor<string, []> var_507_equation_0 = const()[name = tensor<string, []>("op_507_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_507_cast_fp16 = einsum(equation = var_507_equation_0, values = (var_313_cast_fp16, var_219_cast_fp16))[name = tensor<string, []>("op_507_cast_fp16")];
+            tensor<fp16, []> var_508_to_fp16 = const()[name = tensor<string, []>("op_508_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_509_cast_fp16 = mul(x = var_507_cast_fp16, y = var_508_to_fp16)[name = tensor<string, []>("op_509_cast_fp16")];
+            tensor<string, []> var_511_equation_0 = const()[name = tensor<string, []>("op_511_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_511_cast_fp16 = einsum(equation = var_511_equation_0, values = (var_313_cast_fp16, var_223_cast_fp16))[name = tensor<string, []>("op_511_cast_fp16")];
+            tensor<fp16, []> var_512_to_fp16 = const()[name = tensor<string, []>("op_512_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_513_cast_fp16 = mul(x = var_511_cast_fp16, y = var_512_to_fp16)[name = tensor<string, []>("op_513_cast_fp16")];
+            tensor<string, []> var_515_equation_0 = const()[name = tensor<string, []>("op_515_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_515_cast_fp16 = einsum(equation = var_515_equation_0, values = (var_325_cast_fp16, var_227_cast_fp16))[name = tensor<string, []>("op_515_cast_fp16")];
+            tensor<fp16, []> var_516_to_fp16 = const()[name = tensor<string, []>("op_516_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_517_cast_fp16 = mul(x = var_515_cast_fp16, y = var_516_to_fp16)[name = tensor<string, []>("op_517_cast_fp16")];
+            tensor<string, []> var_519_equation_0 = const()[name = tensor<string, []>("op_519_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_519_cast_fp16 = einsum(equation = var_519_equation_0, values = (var_325_cast_fp16, var_231_cast_fp16))[name = tensor<string, []>("op_519_cast_fp16")];
+            tensor<fp16, []> var_520_to_fp16 = const()[name = tensor<string, []>("op_520_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_521_cast_fp16 = mul(x = var_519_cast_fp16, y = var_520_to_fp16)[name = tensor<string, []>("op_521_cast_fp16")];
+            tensor<string, []> var_523_equation_0 = const()[name = tensor<string, []>("op_523_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_523_cast_fp16 = einsum(equation = var_523_equation_0, values = (var_325_cast_fp16, var_235_cast_fp16))[name = tensor<string, []>("op_523_cast_fp16")];
+            tensor<fp16, []> var_524_to_fp16 = const()[name = tensor<string, []>("op_524_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_525_cast_fp16 = mul(x = var_523_cast_fp16, y = var_524_to_fp16)[name = tensor<string, []>("op_525_cast_fp16")];
+            tensor<string, []> var_527_equation_0 = const()[name = tensor<string, []>("op_527_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_527_cast_fp16 = einsum(equation = var_527_equation_0, values = (var_337_cast_fp16, var_239_cast_fp16))[name = tensor<string, []>("op_527_cast_fp16")];
+            tensor<fp16, []> var_528_to_fp16 = const()[name = tensor<string, []>("op_528_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_529_cast_fp16 = mul(x = var_527_cast_fp16, y = var_528_to_fp16)[name = tensor<string, []>("op_529_cast_fp16")];
+            tensor<string, []> var_531_equation_0 = const()[name = tensor<string, []>("op_531_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_531_cast_fp16 = einsum(equation = var_531_equation_0, values = (var_337_cast_fp16, var_243_cast_fp16))[name = tensor<string, []>("op_531_cast_fp16")];
+            tensor<fp16, []> var_532_to_fp16 = const()[name = tensor<string, []>("op_532_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_533_cast_fp16 = mul(x = var_531_cast_fp16, y = var_532_to_fp16)[name = tensor<string, []>("op_533_cast_fp16")];
+            tensor<string, []> var_535_equation_0 = const()[name = tensor<string, []>("op_535_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_535_cast_fp16 = einsum(equation = var_535_equation_0, values = (var_337_cast_fp16, var_247_cast_fp16))[name = tensor<string, []>("op_535_cast_fp16")];
+            tensor<fp16, []> var_536_to_fp16 = const()[name = tensor<string, []>("op_536_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_537_cast_fp16 = mul(x = var_535_cast_fp16, y = var_536_to_fp16)[name = tensor<string, []>("op_537_cast_fp16")];
+            tensor<string, []> var_539_equation_0 = const()[name = tensor<string, []>("op_539_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_539_cast_fp16 = einsum(equation = var_539_equation_0, values = (var_349_cast_fp16, var_251_cast_fp16))[name = tensor<string, []>("op_539_cast_fp16")];
+            tensor<fp16, []> var_540_to_fp16 = const()[name = tensor<string, []>("op_540_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_541_cast_fp16 = mul(x = var_539_cast_fp16, y = var_540_to_fp16)[name = tensor<string, []>("op_541_cast_fp16")];
+            tensor<string, []> var_543_equation_0 = const()[name = tensor<string, []>("op_543_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_543_cast_fp16 = einsum(equation = var_543_equation_0, values = (var_349_cast_fp16, var_255_cast_fp16))[name = tensor<string, []>("op_543_cast_fp16")];
+            tensor<fp16, []> var_544_to_fp16 = const()[name = tensor<string, []>("op_544_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_545_cast_fp16 = mul(x = var_543_cast_fp16, y = var_544_to_fp16)[name = tensor<string, []>("op_545_cast_fp16")];
+            tensor<string, []> var_547_equation_0 = const()[name = tensor<string, []>("op_547_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_547_cast_fp16 = einsum(equation = var_547_equation_0, values = (var_349_cast_fp16, var_259_cast_fp16))[name = tensor<string, []>("op_547_cast_fp16")];
+            tensor<fp16, []> var_548_to_fp16 = const()[name = tensor<string, []>("op_548_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_549_cast_fp16 = mul(x = var_547_cast_fp16, y = var_548_to_fp16)[name = tensor<string, []>("op_549_cast_fp16")];
+            tensor<string, []> var_551_equation_0 = const()[name = tensor<string, []>("op_551_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_551_cast_fp16 = einsum(equation = var_551_equation_0, values = (var_361_cast_fp16, var_263_cast_fp16))[name = tensor<string, []>("op_551_cast_fp16")];
+            tensor<fp16, []> var_552_to_fp16 = const()[name = tensor<string, []>("op_552_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_553_cast_fp16 = mul(x = var_551_cast_fp16, y = var_552_to_fp16)[name = tensor<string, []>("op_553_cast_fp16")];
+            tensor<string, []> var_555_equation_0 = const()[name = tensor<string, []>("op_555_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_555_cast_fp16 = einsum(equation = var_555_equation_0, values = (var_361_cast_fp16, var_267_cast_fp16))[name = tensor<string, []>("op_555_cast_fp16")];
+            tensor<fp16, []> var_556_to_fp16 = const()[name = tensor<string, []>("op_556_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_557_cast_fp16 = mul(x = var_555_cast_fp16, y = var_556_to_fp16)[name = tensor<string, []>("op_557_cast_fp16")];
+            tensor<string, []> var_559_equation_0 = const()[name = tensor<string, []>("op_559_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_559_cast_fp16 = einsum(equation = var_559_equation_0, values = (var_361_cast_fp16, var_271_cast_fp16))[name = tensor<string, []>("op_559_cast_fp16")];
+            tensor<fp16, []> var_560_to_fp16 = const()[name = tensor<string, []>("op_560_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_561_cast_fp16 = mul(x = var_559_cast_fp16, y = var_560_to_fp16)[name = tensor<string, []>("op_561_cast_fp16")];
+            tensor<string, []> var_563_equation_0 = const()[name = tensor<string, []>("op_563_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_563_cast_fp16 = einsum(equation = var_563_equation_0, values = (var_373_cast_fp16, var_275_cast_fp16))[name = tensor<string, []>("op_563_cast_fp16")];
+            tensor<fp16, []> var_564_to_fp16 = const()[name = tensor<string, []>("op_564_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_565_cast_fp16 = mul(x = var_563_cast_fp16, y = var_564_to_fp16)[name = tensor<string, []>("op_565_cast_fp16")];
+            tensor<string, []> var_567_equation_0 = const()[name = tensor<string, []>("op_567_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_567_cast_fp16 = einsum(equation = var_567_equation_0, values = (var_373_cast_fp16, var_279_cast_fp16))[name = tensor<string, []>("op_567_cast_fp16")];
+            tensor<fp16, []> var_568_to_fp16 = const()[name = tensor<string, []>("op_568_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_569_cast_fp16 = mul(x = var_567_cast_fp16, y = var_568_to_fp16)[name = tensor<string, []>("op_569_cast_fp16")];
+            tensor<string, []> var_571_equation_0 = const()[name = tensor<string, []>("op_571_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_571_cast_fp16 = einsum(equation = var_571_equation_0, values = (var_373_cast_fp16, var_283_cast_fp16))[name = tensor<string, []>("op_571_cast_fp16")];
+            tensor<fp16, []> var_572_to_fp16 = const()[name = tensor<string, []>("op_572_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_573_cast_fp16 = mul(x = var_571_cast_fp16, y = var_572_to_fp16)[name = tensor<string, []>("op_573_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_1_cast_fp16 = add(x = var_481_cast_fp16, y = mask)[name = tensor<string, []>("aw_1_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_3_cast_fp16 = add(x = var_485_cast_fp16, y = mask)[name = tensor<string, []>("aw_3_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_5_cast_fp16 = add(x = var_489_cast_fp16, y = mask)[name = tensor<string, []>("aw_5_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_7_cast_fp16 = add(x = var_493_cast_fp16, y = mask)[name = tensor<string, []>("aw_7_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_9_cast_fp16 = add(x = var_497_cast_fp16, y = mask)[name = tensor<string, []>("aw_9_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_11_cast_fp16 = add(x = var_501_cast_fp16, y = mask)[name = tensor<string, []>("aw_11_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_13_cast_fp16 = add(x = var_505_cast_fp16, y = mask)[name = tensor<string, []>("aw_13_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_15_cast_fp16 = add(x = var_509_cast_fp16, y = mask)[name = tensor<string, []>("aw_15_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_17_cast_fp16 = add(x = var_513_cast_fp16, y = mask)[name = tensor<string, []>("aw_17_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_19_cast_fp16 = add(x = var_517_cast_fp16, y = mask)[name = tensor<string, []>("aw_19_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_21_cast_fp16 = add(x = var_521_cast_fp16, y = mask)[name = tensor<string, []>("aw_21_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_23_cast_fp16 = add(x = var_525_cast_fp16, y = mask)[name = tensor<string, []>("aw_23_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_25_cast_fp16 = add(x = var_529_cast_fp16, y = mask)[name = tensor<string, []>("aw_25_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_27_cast_fp16 = add(x = var_533_cast_fp16, y = mask)[name = tensor<string, []>("aw_27_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_29_cast_fp16 = add(x = var_537_cast_fp16, y = mask)[name = tensor<string, []>("aw_29_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_31_cast_fp16 = add(x = var_541_cast_fp16, y = mask)[name = tensor<string, []>("aw_31_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_33_cast_fp16 = add(x = var_545_cast_fp16, y = mask)[name = tensor<string, []>("aw_33_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_35_cast_fp16 = add(x = var_549_cast_fp16, y = mask)[name = tensor<string, []>("aw_35_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_37_cast_fp16 = add(x = var_553_cast_fp16, y = mask)[name = tensor<string, []>("aw_37_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_39_cast_fp16 = add(x = var_557_cast_fp16, y = mask)[name = tensor<string, []>("aw_39_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_41_cast_fp16 = add(x = var_561_cast_fp16, y = mask)[name = tensor<string, []>("aw_41_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_43_cast_fp16 = add(x = var_565_cast_fp16, y = mask)[name = tensor<string, []>("aw_43_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_45_cast_fp16 = add(x = var_569_cast_fp16, y = mask)[name = tensor<string, []>("aw_45_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_47_cast_fp16 = add(x = var_573_cast_fp16, y = mask)[name = tensor<string, []>("aw_47_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_598_cast_fp16 = softmax(axis = var_52, x = aw_1_cast_fp16)[name = tensor<string, []>("op_598_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_599_cast_fp16 = softmax(axis = var_52, x = aw_3_cast_fp16)[name = tensor<string, []>("op_599_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_600_cast_fp16 = softmax(axis = var_52, x = aw_5_cast_fp16)[name = tensor<string, []>("op_600_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_601_cast_fp16 = softmax(axis = var_52, x = aw_7_cast_fp16)[name = tensor<string, []>("op_601_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_602_cast_fp16 = softmax(axis = var_52, x = aw_9_cast_fp16)[name = tensor<string, []>("op_602_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_603_cast_fp16 = softmax(axis = var_52, x = aw_11_cast_fp16)[name = tensor<string, []>("op_603_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_604_cast_fp16 = softmax(axis = var_52, x = aw_13_cast_fp16)[name = tensor<string, []>("op_604_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_605_cast_fp16 = softmax(axis = var_52, x = aw_15_cast_fp16)[name = tensor<string, []>("op_605_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_606_cast_fp16 = softmax(axis = var_52, x = aw_17_cast_fp16)[name = tensor<string, []>("op_606_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_607_cast_fp16 = softmax(axis = var_52, x = aw_19_cast_fp16)[name = tensor<string, []>("op_607_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_608_cast_fp16 = softmax(axis = var_52, x = aw_21_cast_fp16)[name = tensor<string, []>("op_608_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_609_cast_fp16 = softmax(axis = var_52, x = aw_23_cast_fp16)[name = tensor<string, []>("op_609_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_610_cast_fp16 = softmax(axis = var_52, x = aw_25_cast_fp16)[name = tensor<string, []>("op_610_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_611_cast_fp16 = softmax(axis = var_52, x = aw_27_cast_fp16)[name = tensor<string, []>("op_611_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_612_cast_fp16 = softmax(axis = var_52, x = aw_29_cast_fp16)[name = tensor<string, []>("op_612_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_613_cast_fp16 = softmax(axis = var_52, x = aw_31_cast_fp16)[name = tensor<string, []>("op_613_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_614_cast_fp16 = softmax(axis = var_52, x = aw_33_cast_fp16)[name = tensor<string, []>("op_614_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_615_cast_fp16 = softmax(axis = var_52, x = aw_35_cast_fp16)[name = tensor<string, []>("op_615_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_616_cast_fp16 = softmax(axis = var_52, x = aw_37_cast_fp16)[name = tensor<string, []>("op_616_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_617_cast_fp16 = softmax(axis = var_52, x = aw_39_cast_fp16)[name = tensor<string, []>("op_617_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_618_cast_fp16 = softmax(axis = var_52, x = aw_41_cast_fp16)[name = tensor<string, []>("op_618_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_619_cast_fp16 = softmax(axis = var_52, x = aw_43_cast_fp16)[name = tensor<string, []>("op_619_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_620_cast_fp16 = softmax(axis = var_52, x = aw_45_cast_fp16)[name = tensor<string, []>("op_620_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_621_cast_fp16 = softmax(axis = var_52, x = aw_47_cast_fp16)[name = tensor<string, []>("op_621_cast_fp16")];
+            tensor<string, []> var_623_equation_0 = const()[name = tensor<string, []>("op_623_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_623_cast_fp16 = einsum(equation = var_623_equation_0, values = (var_383_cast_fp16, var_598_cast_fp16))[name = tensor<string, []>("op_623_cast_fp16")];
+            tensor<string, []> var_625_equation_0 = const()[name = tensor<string, []>("op_625_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_625_cast_fp16 = einsum(equation = var_625_equation_0, values = (var_383_cast_fp16, var_599_cast_fp16))[name = tensor<string, []>("op_625_cast_fp16")];
+            tensor<string, []> var_627_equation_0 = const()[name = tensor<string, []>("op_627_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_627_cast_fp16 = einsum(equation = var_627_equation_0, values = (var_383_cast_fp16, var_600_cast_fp16))[name = tensor<string, []>("op_627_cast_fp16")];
+            tensor<string, []> var_629_equation_0 = const()[name = tensor<string, []>("op_629_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_629_cast_fp16 = einsum(equation = var_629_equation_0, values = (var_395_cast_fp16, var_601_cast_fp16))[name = tensor<string, []>("op_629_cast_fp16")];
+            tensor<string, []> var_631_equation_0 = const()[name = tensor<string, []>("op_631_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_631_cast_fp16 = einsum(equation = var_631_equation_0, values = (var_395_cast_fp16, var_602_cast_fp16))[name = tensor<string, []>("op_631_cast_fp16")];
+            tensor<string, []> var_633_equation_0 = const()[name = tensor<string, []>("op_633_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_633_cast_fp16 = einsum(equation = var_633_equation_0, values = (var_395_cast_fp16, var_603_cast_fp16))[name = tensor<string, []>("op_633_cast_fp16")];
+            tensor<string, []> var_635_equation_0 = const()[name = tensor<string, []>("op_635_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_635_cast_fp16 = einsum(equation = var_635_equation_0, values = (var_407_cast_fp16, var_604_cast_fp16))[name = tensor<string, []>("op_635_cast_fp16")];
+            tensor<string, []> var_637_equation_0 = const()[name = tensor<string, []>("op_637_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_637_cast_fp16 = einsum(equation = var_637_equation_0, values = (var_407_cast_fp16, var_605_cast_fp16))[name = tensor<string, []>("op_637_cast_fp16")];
+            tensor<string, []> var_639_equation_0 = const()[name = tensor<string, []>("op_639_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_639_cast_fp16 = einsum(equation = var_639_equation_0, values = (var_407_cast_fp16, var_606_cast_fp16))[name = tensor<string, []>("op_639_cast_fp16")];
+            tensor<string, []> var_641_equation_0 = const()[name = tensor<string, []>("op_641_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_641_cast_fp16 = einsum(equation = var_641_equation_0, values = (var_419_cast_fp16, var_607_cast_fp16))[name = tensor<string, []>("op_641_cast_fp16")];
+            tensor<string, []> var_643_equation_0 = const()[name = tensor<string, []>("op_643_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_643_cast_fp16 = einsum(equation = var_643_equation_0, values = (var_419_cast_fp16, var_608_cast_fp16))[name = tensor<string, []>("op_643_cast_fp16")];
+            tensor<string, []> var_645_equation_0 = const()[name = tensor<string, []>("op_645_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_645_cast_fp16 = einsum(equation = var_645_equation_0, values = (var_419_cast_fp16, var_609_cast_fp16))[name = tensor<string, []>("op_645_cast_fp16")];
+            tensor<string, []> var_647_equation_0 = const()[name = tensor<string, []>("op_647_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_647_cast_fp16 = einsum(equation = var_647_equation_0, values = (var_431_cast_fp16, var_610_cast_fp16))[name = tensor<string, []>("op_647_cast_fp16")];
+            tensor<string, []> var_649_equation_0 = const()[name = tensor<string, []>("op_649_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_649_cast_fp16 = einsum(equation = var_649_equation_0, values = (var_431_cast_fp16, var_611_cast_fp16))[name = tensor<string, []>("op_649_cast_fp16")];
+            tensor<string, []> var_651_equation_0 = const()[name = tensor<string, []>("op_651_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_651_cast_fp16 = einsum(equation = var_651_equation_0, values = (var_431_cast_fp16, var_612_cast_fp16))[name = tensor<string, []>("op_651_cast_fp16")];
+            tensor<string, []> var_653_equation_0 = const()[name = tensor<string, []>("op_653_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_653_cast_fp16 = einsum(equation = var_653_equation_0, values = (var_443_cast_fp16, var_613_cast_fp16))[name = tensor<string, []>("op_653_cast_fp16")];
+            tensor<string, []> var_655_equation_0 = const()[name = tensor<string, []>("op_655_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_655_cast_fp16 = einsum(equation = var_655_equation_0, values = (var_443_cast_fp16, var_614_cast_fp16))[name = tensor<string, []>("op_655_cast_fp16")];
+            tensor<string, []> var_657_equation_0 = const()[name = tensor<string, []>("op_657_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_657_cast_fp16 = einsum(equation = var_657_equation_0, values = (var_443_cast_fp16, var_615_cast_fp16))[name = tensor<string, []>("op_657_cast_fp16")];
+            tensor<string, []> var_659_equation_0 = const()[name = tensor<string, []>("op_659_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_659_cast_fp16 = einsum(equation = var_659_equation_0, values = (var_455_cast_fp16, var_616_cast_fp16))[name = tensor<string, []>("op_659_cast_fp16")];
+            tensor<string, []> var_661_equation_0 = const()[name = tensor<string, []>("op_661_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_661_cast_fp16 = einsum(equation = var_661_equation_0, values = (var_455_cast_fp16, var_617_cast_fp16))[name = tensor<string, []>("op_661_cast_fp16")];
+            tensor<string, []> var_663_equation_0 = const()[name = tensor<string, []>("op_663_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_663_cast_fp16 = einsum(equation = var_663_equation_0, values = (var_455_cast_fp16, var_618_cast_fp16))[name = tensor<string, []>("op_663_cast_fp16")];
+            tensor<string, []> var_665_equation_0 = const()[name = tensor<string, []>("op_665_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_665_cast_fp16 = einsum(equation = var_665_equation_0, values = (var_467_cast_fp16, var_619_cast_fp16))[name = tensor<string, []>("op_665_cast_fp16")];
+            tensor<string, []> var_667_equation_0 = const()[name = tensor<string, []>("op_667_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_667_cast_fp16 = einsum(equation = var_667_equation_0, values = (var_467_cast_fp16, var_620_cast_fp16))[name = tensor<string, []>("op_667_cast_fp16")];
+            tensor<string, []> var_669_equation_0 = const()[name = tensor<string, []>("op_669_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_669_cast_fp16 = einsum(equation = var_669_equation_0, values = (var_467_cast_fp16, var_621_cast_fp16))[name = tensor<string, []>("op_669_cast_fp16")];
+            tensor<bool, []> x_11_interleave_0 = const()[name = tensor<string, []>("x_11_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_11_cast_fp16 = concat(axis = var_52, interleave = x_11_interleave_0, values = (var_623_cast_fp16, var_625_cast_fp16, var_627_cast_fp16, var_629_cast_fp16, var_631_cast_fp16, var_633_cast_fp16, var_635_cast_fp16, var_637_cast_fp16, var_639_cast_fp16, var_641_cast_fp16, var_643_cast_fp16, var_645_cast_fp16, var_647_cast_fp16, var_649_cast_fp16, var_651_cast_fp16, var_653_cast_fp16, var_655_cast_fp16, var_657_cast_fp16, var_659_cast_fp16, var_661_cast_fp16, var_663_cast_fp16, var_665_cast_fp16, var_667_cast_fp16, var_669_cast_fp16))[name = tensor<string, []>("x_11_cast_fp16")];
+            tensor<int32, [4]> var_674 = const()[name = tensor<string, []>("op_674"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_3_cast_fp16 = reshape(shape = var_674, x = x_11_cast_fp16)[name = tensor<string, []>("input_3_cast_fp16")];
+            tensor<int32, [2]> var_677 = const()[name = tensor<string, []>("op_677"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_679 = const()[name = tensor<string, []>("op_679"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_1_pad_type_0 = const()[name = tensor<string, []>("attention_output_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_1_pad_0 = const()[name = tensor<string, []>("attention_output_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_0_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31463936)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_1_cast_fp16 = conv(dilations = var_679, groups = var_52, pad = attention_output_1_pad_0, pad_type = attention_output_1_pad_type_0, strides = var_677, weight = blocks_0_attn_proj_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("attention_output_1_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_13_cast_fp16 = add(x = attention_output_1_cast_fp16, y = x)[name = tensor<string, []>("x_13_cast_fp16")];
+            tensor<bool, []> x_eps_3_interleave_0 = const()[name = tensor<string, []>("x_eps_3_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_3_to_fp16 = const()[name = tensor<string, []>("eps_chan_3_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338368)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_3_cast_fp16 = concat(axis = var_52, interleave = x_eps_3_interleave_0, values = (x_13_cast_fp16, eps_chan_3_to_fp16))[name = tensor<string, []>("x_eps_3_cast_fp16")];
+            tensor<int32, [1]> norm_x_3_axes_0 = const()[name = tensor<string, []>("norm_x_3_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_3_cast_fp16 = reduce_l2_norm(axes = norm_x_3_axes_0, keep_dims = var_55, x = x_eps_3_cast_fp16)[name = tensor<string, []>("norm_x_3_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_7_cast_fp16 = real_div(x = x_13_cast_fp16, y = norm_x_3_cast_fp16)[name = tensor<string, []>("x_normed_7_cast_fp16")];
+            tensor<fp16, []> var_705_to_fp16 = const()[name = tensor<string, []>("op_705_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_9_cast_fp16 = mul(x = x_normed_7_cast_fp16, y = var_705_to_fp16)[name = tensor<string, []>("x_normed_9_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_0_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50338560)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_5_cast_fp16 = mul(x = x_normed_9_cast_fp16, y = blocks_0_norm_2_weight_to_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
+            tensor<int32, [2]> var_716 = const()[name = tensor<string, []>("op_716"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_718 = const()[name = tensor<string, []>("op_718"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_7_pad_type_0 = const()[name = tensor<string, []>("input_7_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_7_pad_0 = const()[name = tensor<string, []>("input_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50344768)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_7_cast_fp16 = conv(dilations = var_718, groups = var_52, pad = input_7_pad_0, pad_type = input_7_pad_type_0, strides = var_716, weight = blocks_0_mlp_fc_1_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
+            tensor<int32, [2]> var_722 = const()[name = tensor<string, []>("op_722"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_724 = const()[name = tensor<string, []>("op_724"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_1_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_1_pad_0 = const()[name = tensor<string, []>("x_fc_2_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_0_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(100676480)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_1_cast_fp16 = conv(dilations = var_724, groups = var_52, pad = x_fc_2_1_pad_0, pad_type = x_fc_2_1_pad_type_0, strides = var_722, weight = blocks_0_mlp_fc_2_weight_to_fp16, x = input_5_cast_fp16)[name = tensor<string, []>("x_fc_2_1_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_727_cast_fp16 = silu(x = input_7_cast_fp16)[name = tensor<string, []>("op_727_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_9_cast_fp16 = mul(x = var_727_cast_fp16, y = x_fc_2_1_cast_fp16)[name = tensor<string, []>("input_9_cast_fp16")];
+            tensor<int32, [2]> var_730 = const()[name = tensor<string, []>("op_730"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_732 = const()[name = tensor<string, []>("op_732"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_734_pad_type_0 = const()[name = tensor<string, []>("op_734_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_734_pad_0 = const()[name = tensor<string, []>("op_734_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_0_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_0_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151008192)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_734_cast_fp16 = conv(dilations = var_732, groups = var_52, pad = var_734_pad_0, pad_type = var_734_pad_type_0, strides = var_730, weight = blocks_0_mlp_proj_weight_to_fp16, x = input_9_cast_fp16)[name = tensor<string, []>("op_734_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_17_cast_fp16 = add(x = var_734_cast_fp16, y = x_13_cast_fp16)[name = tensor<string, []>("x_17_cast_fp16")];
+            tensor<int32, []> var_740 = const()[name = tensor<string, []>("op_740"), val = tensor<int32, []>(-1)];
+            tensor<int32, []> var_744 = const()[name = tensor<string, []>("op_744"), val = tensor<int32, []>(-2)];
+            tensor<int32, []> var_746 = const()[name = tensor<string, []>("op_746"), val = tensor<int32, []>(-3)];
+            tensor<int32, []> var_779 = const()[name = tensor<string, []>("op_779"), val = tensor<int32, []>(1)];
+            tensor<bool, []> var_782 = const()[name = tensor<string, []>("op_782"), val = tensor<bool, []>(true)];
+            tensor<bool, []> x_eps_5_interleave_0 = const()[name = tensor<string, []>("x_eps_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_5_to_fp16 = const()[name = tensor<string, []>("eps_chan_5_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201339904)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_5_cast_fp16 = concat(axis = var_779, interleave = x_eps_5_interleave_0, values = (x_17_cast_fp16, eps_chan_5_to_fp16))[name = tensor<string, []>("x_eps_5_cast_fp16")];
+            tensor<int32, [1]> norm_x_5_axes_0 = const()[name = tensor<string, []>("norm_x_5_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_5_cast_fp16 = reduce_l2_norm(axes = norm_x_5_axes_0, keep_dims = var_782, x = x_eps_5_cast_fp16)[name = tensor<string, []>("norm_x_5_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_13_cast_fp16 = real_div(x = x_17_cast_fp16, y = norm_x_5_cast_fp16)[name = tensor<string, []>("x_normed_13_cast_fp16")];
+            tensor<fp16, []> var_805_to_fp16 = const()[name = tensor<string, []>("op_805_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_15_cast_fp16 = mul(x = x_normed_13_cast_fp16, y = var_805_to_fp16)[name = tensor<string, []>("x_normed_15_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_1_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201340096)))];
+            tensor<fp16, [1, 3072, 8, 8]> x_21_cast_fp16 = mul(x = x_normed_15_cast_fp16, y = blocks_1_norm_1_weight_to_fp16)[name = tensor<string, []>("x_21_cast_fp16")];
+            tensor<int32, [4]> var_829 = const()[name = tensor<string, []>("op_829"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> input_11_cast_fp16 = reshape(shape = var_829, x = x_21_cast_fp16)[name = tensor<string, []>("input_11_cast_fp16")];
+            tensor<int32, [2]> var_832 = const()[name = tensor<string, []>("op_832"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_834 = const()[name = tensor<string, []>("op_834"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> q_9_pad_type_0 = const()[name = tensor<string, []>("q_9_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> q_9_pad_0 = const()[name = tensor<string, []>("q_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_q_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_q_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201346304)))];
+            tensor<fp16, [1, 3072, 1, 64]> q_9_cast_fp16 = conv(dilations = var_834, groups = var_779, pad = q_9_pad_0, pad_type = q_9_pad_type_0, strides = var_832, weight = blocks_1_attn_q_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("q_9_cast_fp16")];
+            tensor<int32, [2]> var_838 = const()[name = tensor<string, []>("op_838"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_840 = const()[name = tensor<string, []>("op_840"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> k_13_pad_type_0 = const()[name = tensor<string, []>("k_13_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> k_13_pad_0 = const()[name = tensor<string, []>("k_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_k_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_k_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(220220736)))];
+            tensor<fp16, [1, 1024, 1, 64]> k_13_cast_fp16 = conv(dilations = var_840, groups = var_779, pad = k_13_pad_0, pad_type = k_13_pad_type_0, strides = var_838, weight = blocks_1_attn_k_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("k_13_cast_fp16")];
+            tensor<int32, [2]> var_844 = const()[name = tensor<string, []>("op_844"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_846 = const()[name = tensor<string, []>("op_846"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> v_11_pad_type_0 = const()[name = tensor<string, []>("v_11_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> v_11_pad_0 = const()[name = tensor<string, []>("v_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [1024, 3072, 1, 1]> blocks_1_attn_v_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_v_proj_weight_to_fp16"), val = tensor<fp16, [1024, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226512256)))];
+            tensor<fp16, [1, 1024, 1, 64]> v_11_cast_fp16 = conv(dilations = var_846, groups = var_779, pad = v_11_pad_0, pad_type = v_11_pad_type_0, strides = var_844, weight = blocks_1_attn_v_proj_weight_to_fp16, x = input_11_cast_fp16)[name = tensor<string, []>("v_11_cast_fp16")];
+            tensor<int32, [4]> var_849 = const()[name = tensor<string, []>("op_849"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<fp16, [1, 24, 128, 64]> q_11_cast_fp16 = reshape(shape = var_849, x = q_9_cast_fp16)[name = tensor<string, []>("q_11_cast_fp16")];
+            tensor<int32, [4]> var_851 = const()[name = tensor<string, []>("op_851"), val = tensor<int32, [4]>([1, -1, 128, 64])];
+            tensor<fp16, [1, 8, 128, 64]> k_15_cast_fp16 = reshape(shape = var_851, x = k_13_cast_fp16)[name = tensor<string, []>("k_15_cast_fp16")];
+            tensor<int32, [4]> var_865_begin_0 = const()[name = tensor<string, []>("op_865_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_865_end_0 = const()[name = tensor<string, []>("op_865_end_0"), val = tensor<int32, [4]>([1, 24, 64, 64])];
+            tensor<bool, [4]> var_865_end_mask_0 = const()[name = tensor<string, []>("op_865_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_865_cast_fp16 = slice_by_index(begin = var_865_begin_0, end = var_865_end_0, end_mask = var_865_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_865_cast_fp16")];
+            tensor<int32, [4]> var_871_begin_0 = const()[name = tensor<string, []>("op_871_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_871_end_0 = const()[name = tensor<string, []>("op_871_end_0"), val = tensor<int32, [4]>([1, 24, 128, 64])];
+            tensor<bool, [4]> var_871_end_mask_0 = const()[name = tensor<string, []>("op_871_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 24, 64, 64]> var_871_cast_fp16 = slice_by_index(begin = var_871_begin_0, end = var_871_end_0, end_mask = var_871_end_mask_0, x = q_11_cast_fp16)[name = tensor<string, []>("op_871_cast_fp16")];
+            tensor<fp16, []> const_30_promoted_to_fp16 = const()[name = tensor<string, []>("const_30_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 24, 64, 64]> var_873_cast_fp16 = mul(x = var_871_cast_fp16, y = const_30_promoted_to_fp16)[name = tensor<string, []>("op_873_cast_fp16")];
+            tensor<bool, []> rotated_5_interleave_0 = const()[name = tensor<string, []>("rotated_5_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 24, 128, 64]> rotated_5_cast_fp16 = concat(axis = var_744, interleave = rotated_5_interleave_0, values = (var_873_cast_fp16, var_865_cast_fp16))[name = tensor<string, []>("rotated_5_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_876_cast_fp16 = mul(x = q_11_cast_fp16, y = cos)[name = tensor<string, []>("op_876_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> var_877_cast_fp16 = mul(x = rotated_5_cast_fp16, y = sin)[name = tensor<string, []>("op_877_cast_fp16")];
+            tensor<fp16, [1, 24, 128, 64]> roped_5_cast_fp16 = add(x = var_876_cast_fp16, y = var_877_cast_fp16)[name = tensor<string, []>("roped_5_cast_fp16")];
+            tensor<int32, [4]> var_890_begin_0 = const()[name = tensor<string, []>("op_890_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_890_end_0 = const()[name = tensor<string, []>("op_890_end_0"), val = tensor<int32, [4]>([1, 8, 64, 64])];
+            tensor<bool, [4]> var_890_end_mask_0 = const()[name = tensor<string, []>("op_890_end_mask_0"), val = tensor<bool, [4]>([true, true, false, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_890_cast_fp16 = slice_by_index(begin = var_890_begin_0, end = var_890_end_0, end_mask = var_890_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_890_cast_fp16")];
+            tensor<int32, [4]> var_896_begin_0 = const()[name = tensor<string, []>("op_896_begin_0"), val = tensor<int32, [4]>([0, 0, 64, 0])];
+            tensor<int32, [4]> var_896_end_0 = const()[name = tensor<string, []>("op_896_end_0"), val = tensor<int32, [4]>([1, 8, 128, 64])];
+            tensor<bool, [4]> var_896_end_mask_0 = const()[name = tensor<string, []>("op_896_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 8, 64, 64]> var_896_cast_fp16 = slice_by_index(begin = var_896_begin_0, end = var_896_end_0, end_mask = var_896_end_mask_0, x = k_15_cast_fp16)[name = tensor<string, []>("op_896_cast_fp16")];
+            tensor<fp16, []> const_32_promoted_to_fp16 = const()[name = tensor<string, []>("const_32_promoted_to_fp16"), val = tensor<fp16, []>(-0x1p+0)];
+            tensor<fp16, [1, 8, 64, 64]> var_898_cast_fp16 = mul(x = var_896_cast_fp16, y = const_32_promoted_to_fp16)[name = tensor<string, []>("op_898_cast_fp16")];
+            tensor<bool, []> rotated_interleave_0 = const()[name = tensor<string, []>("rotated_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 8, 128, 64]> rotated_cast_fp16 = concat(axis = var_744, interleave = rotated_interleave_0, values = (var_898_cast_fp16, var_890_cast_fp16))[name = tensor<string, []>("rotated_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_901_cast_fp16 = mul(x = k_15_cast_fp16, y = cos)[name = tensor<string, []>("op_901_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> var_902_cast_fp16 = mul(x = rotated_cast_fp16, y = sin)[name = tensor<string, []>("op_902_cast_fp16")];
+            tensor<fp16, [1, 8, 128, 64]> roped_cast_fp16 = add(x = var_901_cast_fp16, y = var_902_cast_fp16)[name = tensor<string, []>("roped_cast_fp16")];
+            tensor<int32, [4]> var_905 = const()[name = tensor<string, []>("op_905"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> k_19_cast_fp16 = reshape(shape = var_905, x = roped_cast_fp16)[name = tensor<string, []>("k_19_cast_fp16")];
+            tensor<int32, [4]> var_907 = const()[name = tensor<string, []>("op_907"), val = tensor<int32, [4]>([1, -1, 1, 64])];
+            tensor<fp16, [1, 1024, 1, 64]> new_v_cache_1 = reshape(shape = var_907, x = v_11_cast_fp16)[name = tensor<string, []>("new_v_cache_1_type_fp32_cast_fp16")];
+            tensor<int32, [4]> k_21_perm_0 = const()[name = tensor<string, []>("k_21_perm_0"), val = tensor<int32, [4]>([0, -1, 2, -3])];
+            tensor<bool, []> k_interleave_0 = const()[name = tensor<string, []>("k_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 1, 1024]> new_k_cache_1 = transpose(perm = k_21_perm_0, x = k_19_cast_fp16)[name = tensor<string, []>("transpose_0")];
+            tensor<fp16, [1, 512, 1, 1024]> k_cast_fp16 = concat(axis = var_746, interleave = k_interleave_0, values = (k_cache_1, new_k_cache_1))[name = tensor<string, []>("k_cast_fp16")];
+            tensor<bool, []> v_17_interleave_0 = const()[name = tensor<string, []>("v_17_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> v_17_cast_fp16 = concat(axis = var_740, interleave = v_17_interleave_0, values = (v_cache_1, new_v_cache_1))[name = tensor<string, []>("v_17_cast_fp16")];
+            tensor<int32, [4]> var_915 = const()[name = tensor<string, []>("op_915"), val = tensor<int32, [4]>([1, 3072, 1, -1])];
+            tensor<fp16, [1, 3072, 1, 64]> q_cast_fp16 = reshape(shape = var_915, x = roped_5_cast_fp16)[name = tensor<string, []>("q_cast_fp16")];
+            tensor<int32, [4]> var_920_begin_0 = const()[name = tensor<string, []>("op_920_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_920_end_0 = const()[name = tensor<string, []>("op_920_end_0"), val = tensor<int32, [4]>([1, 128, 1, 64])];
+            tensor<bool, [4]> var_920_end_mask_0 = const()[name = tensor<string, []>("op_920_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_920_cast_fp16 = slice_by_index(begin = var_920_begin_0, end = var_920_end_0, end_mask = var_920_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_920_cast_fp16")];
+            tensor<int32, [4]> var_924_begin_0 = const()[name = tensor<string, []>("op_924_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_924_end_0 = const()[name = tensor<string, []>("op_924_end_0"), val = tensor<int32, [4]>([1, 256, 1, 64])];
+            tensor<bool, [4]> var_924_end_mask_0 = const()[name = tensor<string, []>("op_924_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_924_cast_fp16 = slice_by_index(begin = var_924_begin_0, end = var_924_end_0, end_mask = var_924_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_924_cast_fp16")];
+            tensor<int32, [4]> var_928_begin_0 = const()[name = tensor<string, []>("op_928_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_928_end_0 = const()[name = tensor<string, []>("op_928_end_0"), val = tensor<int32, [4]>([1, 384, 1, 64])];
+            tensor<bool, [4]> var_928_end_mask_0 = const()[name = tensor<string, []>("op_928_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_928_cast_fp16 = slice_by_index(begin = var_928_begin_0, end = var_928_end_0, end_mask = var_928_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_928_cast_fp16")];
+            tensor<int32, [4]> var_932_begin_0 = const()[name = tensor<string, []>("op_932_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_932_end_0 = const()[name = tensor<string, []>("op_932_end_0"), val = tensor<int32, [4]>([1, 512, 1, 64])];
+            tensor<bool, [4]> var_932_end_mask_0 = const()[name = tensor<string, []>("op_932_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_932_cast_fp16 = slice_by_index(begin = var_932_begin_0, end = var_932_end_0, end_mask = var_932_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_932_cast_fp16")];
+            tensor<int32, [4]> var_936_begin_0 = const()[name = tensor<string, []>("op_936_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_936_end_0 = const()[name = tensor<string, []>("op_936_end_0"), val = tensor<int32, [4]>([1, 640, 1, 64])];
+            tensor<bool, [4]> var_936_end_mask_0 = const()[name = tensor<string, []>("op_936_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_936_cast_fp16 = slice_by_index(begin = var_936_begin_0, end = var_936_end_0, end_mask = var_936_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_936_cast_fp16")];
+            tensor<int32, [4]> var_940_begin_0 = const()[name = tensor<string, []>("op_940_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_940_end_0 = const()[name = tensor<string, []>("op_940_end_0"), val = tensor<int32, [4]>([1, 768, 1, 64])];
+            tensor<bool, [4]> var_940_end_mask_0 = const()[name = tensor<string, []>("op_940_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_940_cast_fp16 = slice_by_index(begin = var_940_begin_0, end = var_940_end_0, end_mask = var_940_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_940_cast_fp16")];
+            tensor<int32, [4]> var_944_begin_0 = const()[name = tensor<string, []>("op_944_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_944_end_0 = const()[name = tensor<string, []>("op_944_end_0"), val = tensor<int32, [4]>([1, 896, 1, 64])];
+            tensor<bool, [4]> var_944_end_mask_0 = const()[name = tensor<string, []>("op_944_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_944_cast_fp16 = slice_by_index(begin = var_944_begin_0, end = var_944_end_0, end_mask = var_944_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_944_cast_fp16")];
+            tensor<int32, [4]> var_948_begin_0 = const()[name = tensor<string, []>("op_948_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_948_end_0 = const()[name = tensor<string, []>("op_948_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 64])];
+            tensor<bool, [4]> var_948_end_mask_0 = const()[name = tensor<string, []>("op_948_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_948_cast_fp16 = slice_by_index(begin = var_948_begin_0, end = var_948_end_0, end_mask = var_948_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_948_cast_fp16")];
+            tensor<int32, [4]> var_952_begin_0 = const()[name = tensor<string, []>("op_952_begin_0"), val = tensor<int32, [4]>([0, 1024, 0, 0])];
+            tensor<int32, [4]> var_952_end_0 = const()[name = tensor<string, []>("op_952_end_0"), val = tensor<int32, [4]>([1, 1152, 1, 64])];
+            tensor<bool, [4]> var_952_end_mask_0 = const()[name = tensor<string, []>("op_952_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_952_cast_fp16 = slice_by_index(begin = var_952_begin_0, end = var_952_end_0, end_mask = var_952_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_952_cast_fp16")];
+            tensor<int32, [4]> var_956_begin_0 = const()[name = tensor<string, []>("op_956_begin_0"), val = tensor<int32, [4]>([0, 1152, 0, 0])];
+            tensor<int32, [4]> var_956_end_0 = const()[name = tensor<string, []>("op_956_end_0"), val = tensor<int32, [4]>([1, 1280, 1, 64])];
+            tensor<bool, [4]> var_956_end_mask_0 = const()[name = tensor<string, []>("op_956_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_956_cast_fp16 = slice_by_index(begin = var_956_begin_0, end = var_956_end_0, end_mask = var_956_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_956_cast_fp16")];
+            tensor<int32, [4]> var_960_begin_0 = const()[name = tensor<string, []>("op_960_begin_0"), val = tensor<int32, [4]>([0, 1280, 0, 0])];
+            tensor<int32, [4]> var_960_end_0 = const()[name = tensor<string, []>("op_960_end_0"), val = tensor<int32, [4]>([1, 1408, 1, 64])];
+            tensor<bool, [4]> var_960_end_mask_0 = const()[name = tensor<string, []>("op_960_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_960_cast_fp16 = slice_by_index(begin = var_960_begin_0, end = var_960_end_0, end_mask = var_960_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_960_cast_fp16")];
+            tensor<int32, [4]> var_964_begin_0 = const()[name = tensor<string, []>("op_964_begin_0"), val = tensor<int32, [4]>([0, 1408, 0, 0])];
+            tensor<int32, [4]> var_964_end_0 = const()[name = tensor<string, []>("op_964_end_0"), val = tensor<int32, [4]>([1, 1536, 1, 64])];
+            tensor<bool, [4]> var_964_end_mask_0 = const()[name = tensor<string, []>("op_964_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_964_cast_fp16 = slice_by_index(begin = var_964_begin_0, end = var_964_end_0, end_mask = var_964_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_964_cast_fp16")];
+            tensor<int32, [4]> var_968_begin_0 = const()[name = tensor<string, []>("op_968_begin_0"), val = tensor<int32, [4]>([0, 1536, 0, 0])];
+            tensor<int32, [4]> var_968_end_0 = const()[name = tensor<string, []>("op_968_end_0"), val = tensor<int32, [4]>([1, 1664, 1, 64])];
+            tensor<bool, [4]> var_968_end_mask_0 = const()[name = tensor<string, []>("op_968_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_968_cast_fp16 = slice_by_index(begin = var_968_begin_0, end = var_968_end_0, end_mask = var_968_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_968_cast_fp16")];
+            tensor<int32, [4]> var_972_begin_0 = const()[name = tensor<string, []>("op_972_begin_0"), val = tensor<int32, [4]>([0, 1664, 0, 0])];
+            tensor<int32, [4]> var_972_end_0 = const()[name = tensor<string, []>("op_972_end_0"), val = tensor<int32, [4]>([1, 1792, 1, 64])];
+            tensor<bool, [4]> var_972_end_mask_0 = const()[name = tensor<string, []>("op_972_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_972_cast_fp16 = slice_by_index(begin = var_972_begin_0, end = var_972_end_0, end_mask = var_972_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_972_cast_fp16")];
+            tensor<int32, [4]> var_976_begin_0 = const()[name = tensor<string, []>("op_976_begin_0"), val = tensor<int32, [4]>([0, 1792, 0, 0])];
+            tensor<int32, [4]> var_976_end_0 = const()[name = tensor<string, []>("op_976_end_0"), val = tensor<int32, [4]>([1, 1920, 1, 64])];
+            tensor<bool, [4]> var_976_end_mask_0 = const()[name = tensor<string, []>("op_976_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_976_cast_fp16 = slice_by_index(begin = var_976_begin_0, end = var_976_end_0, end_mask = var_976_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_976_cast_fp16")];
+            tensor<int32, [4]> var_980_begin_0 = const()[name = tensor<string, []>("op_980_begin_0"), val = tensor<int32, [4]>([0, 1920, 0, 0])];
+            tensor<int32, [4]> var_980_end_0 = const()[name = tensor<string, []>("op_980_end_0"), val = tensor<int32, [4]>([1, 2048, 1, 64])];
+            tensor<bool, [4]> var_980_end_mask_0 = const()[name = tensor<string, []>("op_980_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_980_cast_fp16 = slice_by_index(begin = var_980_begin_0, end = var_980_end_0, end_mask = var_980_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_980_cast_fp16")];
+            tensor<int32, [4]> var_984_begin_0 = const()[name = tensor<string, []>("op_984_begin_0"), val = tensor<int32, [4]>([0, 2048, 0, 0])];
+            tensor<int32, [4]> var_984_end_0 = const()[name = tensor<string, []>("op_984_end_0"), val = tensor<int32, [4]>([1, 2176, 1, 64])];
+            tensor<bool, [4]> var_984_end_mask_0 = const()[name = tensor<string, []>("op_984_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_984_cast_fp16 = slice_by_index(begin = var_984_begin_0, end = var_984_end_0, end_mask = var_984_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_984_cast_fp16")];
+            tensor<int32, [4]> var_988_begin_0 = const()[name = tensor<string, []>("op_988_begin_0"), val = tensor<int32, [4]>([0, 2176, 0, 0])];
+            tensor<int32, [4]> var_988_end_0 = const()[name = tensor<string, []>("op_988_end_0"), val = tensor<int32, [4]>([1, 2304, 1, 64])];
+            tensor<bool, [4]> var_988_end_mask_0 = const()[name = tensor<string, []>("op_988_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_988_cast_fp16 = slice_by_index(begin = var_988_begin_0, end = var_988_end_0, end_mask = var_988_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_988_cast_fp16")];
+            tensor<int32, [4]> var_992_begin_0 = const()[name = tensor<string, []>("op_992_begin_0"), val = tensor<int32, [4]>([0, 2304, 0, 0])];
+            tensor<int32, [4]> var_992_end_0 = const()[name = tensor<string, []>("op_992_end_0"), val = tensor<int32, [4]>([1, 2432, 1, 64])];
+            tensor<bool, [4]> var_992_end_mask_0 = const()[name = tensor<string, []>("op_992_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_992_cast_fp16 = slice_by_index(begin = var_992_begin_0, end = var_992_end_0, end_mask = var_992_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_992_cast_fp16")];
+            tensor<int32, [4]> var_996_begin_0 = const()[name = tensor<string, []>("op_996_begin_0"), val = tensor<int32, [4]>([0, 2432, 0, 0])];
+            tensor<int32, [4]> var_996_end_0 = const()[name = tensor<string, []>("op_996_end_0"), val = tensor<int32, [4]>([1, 2560, 1, 64])];
+            tensor<bool, [4]> var_996_end_mask_0 = const()[name = tensor<string, []>("op_996_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_996_cast_fp16 = slice_by_index(begin = var_996_begin_0, end = var_996_end_0, end_mask = var_996_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_996_cast_fp16")];
+            tensor<int32, [4]> var_1000_begin_0 = const()[name = tensor<string, []>("op_1000_begin_0"), val = tensor<int32, [4]>([0, 2560, 0, 0])];
+            tensor<int32, [4]> var_1000_end_0 = const()[name = tensor<string, []>("op_1000_end_0"), val = tensor<int32, [4]>([1, 2688, 1, 64])];
+            tensor<bool, [4]> var_1000_end_mask_0 = const()[name = tensor<string, []>("op_1000_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1000_cast_fp16 = slice_by_index(begin = var_1000_begin_0, end = var_1000_end_0, end_mask = var_1000_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1000_cast_fp16")];
+            tensor<int32, [4]> var_1004_begin_0 = const()[name = tensor<string, []>("op_1004_begin_0"), val = tensor<int32, [4]>([0, 2688, 0, 0])];
+            tensor<int32, [4]> var_1004_end_0 = const()[name = tensor<string, []>("op_1004_end_0"), val = tensor<int32, [4]>([1, 2816, 1, 64])];
+            tensor<bool, [4]> var_1004_end_mask_0 = const()[name = tensor<string, []>("op_1004_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1004_cast_fp16 = slice_by_index(begin = var_1004_begin_0, end = var_1004_end_0, end_mask = var_1004_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1004_cast_fp16")];
+            tensor<int32, [4]> var_1008_begin_0 = const()[name = tensor<string, []>("op_1008_begin_0"), val = tensor<int32, [4]>([0, 2816, 0, 0])];
+            tensor<int32, [4]> var_1008_end_0 = const()[name = tensor<string, []>("op_1008_end_0"), val = tensor<int32, [4]>([1, 2944, 1, 64])];
+            tensor<bool, [4]> var_1008_end_mask_0 = const()[name = tensor<string, []>("op_1008_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1008_cast_fp16 = slice_by_index(begin = var_1008_begin_0, end = var_1008_end_0, end_mask = var_1008_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1008_cast_fp16")];
+            tensor<int32, [4]> var_1012_begin_0 = const()[name = tensor<string, []>("op_1012_begin_0"), val = tensor<int32, [4]>([0, 2944, 0, 0])];
+            tensor<int32, [4]> var_1012_end_0 = const()[name = tensor<string, []>("op_1012_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 64])];
+            tensor<bool, [4]> var_1012_end_mask_0 = const()[name = tensor<string, []>("op_1012_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 64]> var_1012_cast_fp16 = slice_by_index(begin = var_1012_begin_0, end = var_1012_end_0, end_mask = var_1012_end_mask_0, x = q_cast_fp16)[name = tensor<string, []>("op_1012_cast_fp16")];
+            tensor<int32, [4]> var_1018_begin_0 = const()[name = tensor<string, []>("op_1018_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1018_end_0 = const()[name = tensor<string, []>("op_1018_end_0"), val = tensor<int32, [4]>([1, 512, 1, 128])];
+            tensor<bool, [4]> var_1018_end_mask_0 = const()[name = tensor<string, []>("op_1018_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1018_cast_fp16 = slice_by_index(begin = var_1018_begin_0, end = var_1018_end_0, end_mask = var_1018_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1018_cast_fp16")];
+            tensor<int32, [4]> var_1030_begin_0 = const()[name = tensor<string, []>("op_1030_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 128])];
+            tensor<int32, [4]> var_1030_end_0 = const()[name = tensor<string, []>("op_1030_end_0"), val = tensor<int32, [4]>([1, 512, 1, 256])];
+            tensor<bool, [4]> var_1030_end_mask_0 = const()[name = tensor<string, []>("op_1030_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1030_cast_fp16 = slice_by_index(begin = var_1030_begin_0, end = var_1030_end_0, end_mask = var_1030_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1030_cast_fp16")];
+            tensor<int32, [4]> var_1042_begin_0 = const()[name = tensor<string, []>("op_1042_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 256])];
+            tensor<int32, [4]> var_1042_end_0 = const()[name = tensor<string, []>("op_1042_end_0"), val = tensor<int32, [4]>([1, 512, 1, 384])];
+            tensor<bool, [4]> var_1042_end_mask_0 = const()[name = tensor<string, []>("op_1042_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1042_cast_fp16 = slice_by_index(begin = var_1042_begin_0, end = var_1042_end_0, end_mask = var_1042_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1042_cast_fp16")];
+            tensor<int32, [4]> var_1054_begin_0 = const()[name = tensor<string, []>("op_1054_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 384])];
+            tensor<int32, [4]> var_1054_end_0 = const()[name = tensor<string, []>("op_1054_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1054_end_mask_0 = const()[name = tensor<string, []>("op_1054_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1054_cast_fp16 = slice_by_index(begin = var_1054_begin_0, end = var_1054_end_0, end_mask = var_1054_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1054_cast_fp16")];
+            tensor<int32, [4]> var_1066_begin_0 = const()[name = tensor<string, []>("op_1066_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 512])];
+            tensor<int32, [4]> var_1066_end_0 = const()[name = tensor<string, []>("op_1066_end_0"), val = tensor<int32, [4]>([1, 512, 1, 640])];
+            tensor<bool, [4]> var_1066_end_mask_0 = const()[name = tensor<string, []>("op_1066_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1066_cast_fp16 = slice_by_index(begin = var_1066_begin_0, end = var_1066_end_0, end_mask = var_1066_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1066_cast_fp16")];
+            tensor<int32, [4]> var_1078_begin_0 = const()[name = tensor<string, []>("op_1078_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 640])];
+            tensor<int32, [4]> var_1078_end_0 = const()[name = tensor<string, []>("op_1078_end_0"), val = tensor<int32, [4]>([1, 512, 1, 768])];
+            tensor<bool, [4]> var_1078_end_mask_0 = const()[name = tensor<string, []>("op_1078_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1078_cast_fp16 = slice_by_index(begin = var_1078_begin_0, end = var_1078_end_0, end_mask = var_1078_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1078_cast_fp16")];
+            tensor<int32, [4]> var_1090_begin_0 = const()[name = tensor<string, []>("op_1090_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 768])];
+            tensor<int32, [4]> var_1090_end_0 = const()[name = tensor<string, []>("op_1090_end_0"), val = tensor<int32, [4]>([1, 512, 1, 896])];
+            tensor<bool, [4]> var_1090_end_mask_0 = const()[name = tensor<string, []>("op_1090_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1090_cast_fp16 = slice_by_index(begin = var_1090_begin_0, end = var_1090_end_0, end_mask = var_1090_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1090_cast_fp16")];
+            tensor<int32, [4]> var_1102_begin_0 = const()[name = tensor<string, []>("op_1102_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 896])];
+            tensor<int32, [4]> var_1102_end_0 = const()[name = tensor<string, []>("op_1102_end_0"), val = tensor<int32, [4]>([1, 512, 1, 1024])];
+            tensor<bool, [4]> var_1102_end_mask_0 = const()[name = tensor<string, []>("op_1102_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 512, 1, 128]> var_1102_cast_fp16 = slice_by_index(begin = var_1102_begin_0, end = var_1102_end_0, end_mask = var_1102_end_mask_0, x = k_cast_fp16)[name = tensor<string, []>("op_1102_cast_fp16")];
+            tensor<int32, [4]> var_1112_begin_0 = const()[name = tensor<string, []>("op_1112_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1112_end_0 = const()[name = tensor<string, []>("op_1112_end_0"), val = tensor<int32, [4]>([1, 128, 1, 512])];
+            tensor<bool, [4]> var_1112_end_mask_0 = const()[name = tensor<string, []>("op_1112_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1112_cast_fp16 = slice_by_index(begin = var_1112_begin_0, end = var_1112_end_0, end_mask = var_1112_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1112_cast_fp16")];
+            tensor<int32, [4]> var_1124_begin_0 = const()[name = tensor<string, []>("op_1124_begin_0"), val = tensor<int32, [4]>([0, 128, 0, 0])];
+            tensor<int32, [4]> var_1124_end_0 = const()[name = tensor<string, []>("op_1124_end_0"), val = tensor<int32, [4]>([1, 256, 1, 512])];
+            tensor<bool, [4]> var_1124_end_mask_0 = const()[name = tensor<string, []>("op_1124_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1124_cast_fp16 = slice_by_index(begin = var_1124_begin_0, end = var_1124_end_0, end_mask = var_1124_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1124_cast_fp16")];
+            tensor<int32, [4]> var_1136_begin_0 = const()[name = tensor<string, []>("op_1136_begin_0"), val = tensor<int32, [4]>([0, 256, 0, 0])];
+            tensor<int32, [4]> var_1136_end_0 = const()[name = tensor<string, []>("op_1136_end_0"), val = tensor<int32, [4]>([1, 384, 1, 512])];
+            tensor<bool, [4]> var_1136_end_mask_0 = const()[name = tensor<string, []>("op_1136_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1136_cast_fp16 = slice_by_index(begin = var_1136_begin_0, end = var_1136_end_0, end_mask = var_1136_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1136_cast_fp16")];
+            tensor<int32, [4]> var_1148_begin_0 = const()[name = tensor<string, []>("op_1148_begin_0"), val = tensor<int32, [4]>([0, 384, 0, 0])];
+            tensor<int32, [4]> var_1148_end_0 = const()[name = tensor<string, []>("op_1148_end_0"), val = tensor<int32, [4]>([1, 512, 1, 512])];
+            tensor<bool, [4]> var_1148_end_mask_0 = const()[name = tensor<string, []>("op_1148_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1148_cast_fp16 = slice_by_index(begin = var_1148_begin_0, end = var_1148_end_0, end_mask = var_1148_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1148_cast_fp16")];
+            tensor<int32, [4]> var_1160_begin_0 = const()[name = tensor<string, []>("op_1160_begin_0"), val = tensor<int32, [4]>([0, 512, 0, 0])];
+            tensor<int32, [4]> var_1160_end_0 = const()[name = tensor<string, []>("op_1160_end_0"), val = tensor<int32, [4]>([1, 640, 1, 512])];
+            tensor<bool, [4]> var_1160_end_mask_0 = const()[name = tensor<string, []>("op_1160_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1160_cast_fp16 = slice_by_index(begin = var_1160_begin_0, end = var_1160_end_0, end_mask = var_1160_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1160_cast_fp16")];
+            tensor<int32, [4]> var_1172_begin_0 = const()[name = tensor<string, []>("op_1172_begin_0"), val = tensor<int32, [4]>([0, 640, 0, 0])];
+            tensor<int32, [4]> var_1172_end_0 = const()[name = tensor<string, []>("op_1172_end_0"), val = tensor<int32, [4]>([1, 768, 1, 512])];
+            tensor<bool, [4]> var_1172_end_mask_0 = const()[name = tensor<string, []>("op_1172_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1172_cast_fp16 = slice_by_index(begin = var_1172_begin_0, end = var_1172_end_0, end_mask = var_1172_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1172_cast_fp16")];
+            tensor<int32, [4]> var_1184_begin_0 = const()[name = tensor<string, []>("op_1184_begin_0"), val = tensor<int32, [4]>([0, 768, 0, 0])];
+            tensor<int32, [4]> var_1184_end_0 = const()[name = tensor<string, []>("op_1184_end_0"), val = tensor<int32, [4]>([1, 896, 1, 512])];
+            tensor<bool, [4]> var_1184_end_mask_0 = const()[name = tensor<string, []>("op_1184_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1184_cast_fp16 = slice_by_index(begin = var_1184_begin_0, end = var_1184_end_0, end_mask = var_1184_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1184_cast_fp16")];
+            tensor<int32, [4]> var_1196_begin_0 = const()[name = tensor<string, []>("op_1196_begin_0"), val = tensor<int32, [4]>([0, 896, 0, 0])];
+            tensor<int32, [4]> var_1196_end_0 = const()[name = tensor<string, []>("op_1196_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 512])];
+            tensor<bool, [4]> var_1196_end_mask_0 = const()[name = tensor<string, []>("op_1196_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 128, 1, 512]> var_1196_cast_fp16 = slice_by_index(begin = var_1196_begin_0, end = var_1196_end_0, end_mask = var_1196_end_mask_0, x = v_17_cast_fp16)[name = tensor<string, []>("op_1196_cast_fp16")];
+            tensor<string, []> var_1208_equation_0 = const()[name = tensor<string, []>("op_1208_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1208_cast_fp16 = einsum(equation = var_1208_equation_0, values = (var_1018_cast_fp16, var_920_cast_fp16))[name = tensor<string, []>("op_1208_cast_fp16")];
+            tensor<fp16, []> var_1209_to_fp16 = const()[name = tensor<string, []>("op_1209_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1210_cast_fp16 = mul(x = var_1208_cast_fp16, y = var_1209_to_fp16)[name = tensor<string, []>("op_1210_cast_fp16")];
+            tensor<string, []> var_1212_equation_0 = const()[name = tensor<string, []>("op_1212_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1212_cast_fp16 = einsum(equation = var_1212_equation_0, values = (var_1018_cast_fp16, var_924_cast_fp16))[name = tensor<string, []>("op_1212_cast_fp16")];
+            tensor<fp16, []> var_1213_to_fp16 = const()[name = tensor<string, []>("op_1213_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1214_cast_fp16 = mul(x = var_1212_cast_fp16, y = var_1213_to_fp16)[name = tensor<string, []>("op_1214_cast_fp16")];
+            tensor<string, []> var_1216_equation_0 = const()[name = tensor<string, []>("op_1216_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1216_cast_fp16 = einsum(equation = var_1216_equation_0, values = (var_1018_cast_fp16, var_928_cast_fp16))[name = tensor<string, []>("op_1216_cast_fp16")];
+            tensor<fp16, []> var_1217_to_fp16 = const()[name = tensor<string, []>("op_1217_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1218_cast_fp16 = mul(x = var_1216_cast_fp16, y = var_1217_to_fp16)[name = tensor<string, []>("op_1218_cast_fp16")];
+            tensor<string, []> var_1220_equation_0 = const()[name = tensor<string, []>("op_1220_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1220_cast_fp16 = einsum(equation = var_1220_equation_0, values = (var_1030_cast_fp16, var_932_cast_fp16))[name = tensor<string, []>("op_1220_cast_fp16")];
+            tensor<fp16, []> var_1221_to_fp16 = const()[name = tensor<string, []>("op_1221_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1222_cast_fp16 = mul(x = var_1220_cast_fp16, y = var_1221_to_fp16)[name = tensor<string, []>("op_1222_cast_fp16")];
+            tensor<string, []> var_1224_equation_0 = const()[name = tensor<string, []>("op_1224_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1224_cast_fp16 = einsum(equation = var_1224_equation_0, values = (var_1030_cast_fp16, var_936_cast_fp16))[name = tensor<string, []>("op_1224_cast_fp16")];
+            tensor<fp16, []> var_1225_to_fp16 = const()[name = tensor<string, []>("op_1225_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1226_cast_fp16 = mul(x = var_1224_cast_fp16, y = var_1225_to_fp16)[name = tensor<string, []>("op_1226_cast_fp16")];
+            tensor<string, []> var_1228_equation_0 = const()[name = tensor<string, []>("op_1228_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1228_cast_fp16 = einsum(equation = var_1228_equation_0, values = (var_1030_cast_fp16, var_940_cast_fp16))[name = tensor<string, []>("op_1228_cast_fp16")];
+            tensor<fp16, []> var_1229_to_fp16 = const()[name = tensor<string, []>("op_1229_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1230_cast_fp16 = mul(x = var_1228_cast_fp16, y = var_1229_to_fp16)[name = tensor<string, []>("op_1230_cast_fp16")];
+            tensor<string, []> var_1232_equation_0 = const()[name = tensor<string, []>("op_1232_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1232_cast_fp16 = einsum(equation = var_1232_equation_0, values = (var_1042_cast_fp16, var_944_cast_fp16))[name = tensor<string, []>("op_1232_cast_fp16")];
+            tensor<fp16, []> var_1233_to_fp16 = const()[name = tensor<string, []>("op_1233_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1234_cast_fp16 = mul(x = var_1232_cast_fp16, y = var_1233_to_fp16)[name = tensor<string, []>("op_1234_cast_fp16")];
+            tensor<string, []> var_1236_equation_0 = const()[name = tensor<string, []>("op_1236_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1236_cast_fp16 = einsum(equation = var_1236_equation_0, values = (var_1042_cast_fp16, var_948_cast_fp16))[name = tensor<string, []>("op_1236_cast_fp16")];
+            tensor<fp16, []> var_1237_to_fp16 = const()[name = tensor<string, []>("op_1237_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1238_cast_fp16 = mul(x = var_1236_cast_fp16, y = var_1237_to_fp16)[name = tensor<string, []>("op_1238_cast_fp16")];
+            tensor<string, []> var_1240_equation_0 = const()[name = tensor<string, []>("op_1240_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1240_cast_fp16 = einsum(equation = var_1240_equation_0, values = (var_1042_cast_fp16, var_952_cast_fp16))[name = tensor<string, []>("op_1240_cast_fp16")];
+            tensor<fp16, []> var_1241_to_fp16 = const()[name = tensor<string, []>("op_1241_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1242_cast_fp16 = mul(x = var_1240_cast_fp16, y = var_1241_to_fp16)[name = tensor<string, []>("op_1242_cast_fp16")];
+            tensor<string, []> var_1244_equation_0 = const()[name = tensor<string, []>("op_1244_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1244_cast_fp16 = einsum(equation = var_1244_equation_0, values = (var_1054_cast_fp16, var_956_cast_fp16))[name = tensor<string, []>("op_1244_cast_fp16")];
+            tensor<fp16, []> var_1245_to_fp16 = const()[name = tensor<string, []>("op_1245_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1246_cast_fp16 = mul(x = var_1244_cast_fp16, y = var_1245_to_fp16)[name = tensor<string, []>("op_1246_cast_fp16")];
+            tensor<string, []> var_1248_equation_0 = const()[name = tensor<string, []>("op_1248_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1248_cast_fp16 = einsum(equation = var_1248_equation_0, values = (var_1054_cast_fp16, var_960_cast_fp16))[name = tensor<string, []>("op_1248_cast_fp16")];
+            tensor<fp16, []> var_1249_to_fp16 = const()[name = tensor<string, []>("op_1249_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1250_cast_fp16 = mul(x = var_1248_cast_fp16, y = var_1249_to_fp16)[name = tensor<string, []>("op_1250_cast_fp16")];
+            tensor<string, []> var_1252_equation_0 = const()[name = tensor<string, []>("op_1252_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1252_cast_fp16 = einsum(equation = var_1252_equation_0, values = (var_1054_cast_fp16, var_964_cast_fp16))[name = tensor<string, []>("op_1252_cast_fp16")];
+            tensor<fp16, []> var_1253_to_fp16 = const()[name = tensor<string, []>("op_1253_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1254_cast_fp16 = mul(x = var_1252_cast_fp16, y = var_1253_to_fp16)[name = tensor<string, []>("op_1254_cast_fp16")];
+            tensor<string, []> var_1256_equation_0 = const()[name = tensor<string, []>("op_1256_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1256_cast_fp16 = einsum(equation = var_1256_equation_0, values = (var_1066_cast_fp16, var_968_cast_fp16))[name = tensor<string, []>("op_1256_cast_fp16")];
+            tensor<fp16, []> var_1257_to_fp16 = const()[name = tensor<string, []>("op_1257_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1258_cast_fp16 = mul(x = var_1256_cast_fp16, y = var_1257_to_fp16)[name = tensor<string, []>("op_1258_cast_fp16")];
+            tensor<string, []> var_1260_equation_0 = const()[name = tensor<string, []>("op_1260_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1260_cast_fp16 = einsum(equation = var_1260_equation_0, values = (var_1066_cast_fp16, var_972_cast_fp16))[name = tensor<string, []>("op_1260_cast_fp16")];
+            tensor<fp16, []> var_1261_to_fp16 = const()[name = tensor<string, []>("op_1261_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1262_cast_fp16 = mul(x = var_1260_cast_fp16, y = var_1261_to_fp16)[name = tensor<string, []>("op_1262_cast_fp16")];
+            tensor<string, []> var_1264_equation_0 = const()[name = tensor<string, []>("op_1264_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1264_cast_fp16 = einsum(equation = var_1264_equation_0, values = (var_1066_cast_fp16, var_976_cast_fp16))[name = tensor<string, []>("op_1264_cast_fp16")];
+            tensor<fp16, []> var_1265_to_fp16 = const()[name = tensor<string, []>("op_1265_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1266_cast_fp16 = mul(x = var_1264_cast_fp16, y = var_1265_to_fp16)[name = tensor<string, []>("op_1266_cast_fp16")];
+            tensor<string, []> var_1268_equation_0 = const()[name = tensor<string, []>("op_1268_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1268_cast_fp16 = einsum(equation = var_1268_equation_0, values = (var_1078_cast_fp16, var_980_cast_fp16))[name = tensor<string, []>("op_1268_cast_fp16")];
+            tensor<fp16, []> var_1269_to_fp16 = const()[name = tensor<string, []>("op_1269_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1270_cast_fp16 = mul(x = var_1268_cast_fp16, y = var_1269_to_fp16)[name = tensor<string, []>("op_1270_cast_fp16")];
+            tensor<string, []> var_1272_equation_0 = const()[name = tensor<string, []>("op_1272_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1272_cast_fp16 = einsum(equation = var_1272_equation_0, values = (var_1078_cast_fp16, var_984_cast_fp16))[name = tensor<string, []>("op_1272_cast_fp16")];
+            tensor<fp16, []> var_1273_to_fp16 = const()[name = tensor<string, []>("op_1273_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1274_cast_fp16 = mul(x = var_1272_cast_fp16, y = var_1273_to_fp16)[name = tensor<string, []>("op_1274_cast_fp16")];
+            tensor<string, []> var_1276_equation_0 = const()[name = tensor<string, []>("op_1276_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1276_cast_fp16 = einsum(equation = var_1276_equation_0, values = (var_1078_cast_fp16, var_988_cast_fp16))[name = tensor<string, []>("op_1276_cast_fp16")];
+            tensor<fp16, []> var_1277_to_fp16 = const()[name = tensor<string, []>("op_1277_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1278_cast_fp16 = mul(x = var_1276_cast_fp16, y = var_1277_to_fp16)[name = tensor<string, []>("op_1278_cast_fp16")];
+            tensor<string, []> var_1280_equation_0 = const()[name = tensor<string, []>("op_1280_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1280_cast_fp16 = einsum(equation = var_1280_equation_0, values = (var_1090_cast_fp16, var_992_cast_fp16))[name = tensor<string, []>("op_1280_cast_fp16")];
+            tensor<fp16, []> var_1281_to_fp16 = const()[name = tensor<string, []>("op_1281_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1282_cast_fp16 = mul(x = var_1280_cast_fp16, y = var_1281_to_fp16)[name = tensor<string, []>("op_1282_cast_fp16")];
+            tensor<string, []> var_1284_equation_0 = const()[name = tensor<string, []>("op_1284_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1284_cast_fp16 = einsum(equation = var_1284_equation_0, values = (var_1090_cast_fp16, var_996_cast_fp16))[name = tensor<string, []>("op_1284_cast_fp16")];
+            tensor<fp16, []> var_1285_to_fp16 = const()[name = tensor<string, []>("op_1285_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1286_cast_fp16 = mul(x = var_1284_cast_fp16, y = var_1285_to_fp16)[name = tensor<string, []>("op_1286_cast_fp16")];
+            tensor<string, []> var_1288_equation_0 = const()[name = tensor<string, []>("op_1288_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1288_cast_fp16 = einsum(equation = var_1288_equation_0, values = (var_1090_cast_fp16, var_1000_cast_fp16))[name = tensor<string, []>("op_1288_cast_fp16")];
+            tensor<fp16, []> var_1289_to_fp16 = const()[name = tensor<string, []>("op_1289_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1290_cast_fp16 = mul(x = var_1288_cast_fp16, y = var_1289_to_fp16)[name = tensor<string, []>("op_1290_cast_fp16")];
+            tensor<string, []> var_1292_equation_0 = const()[name = tensor<string, []>("op_1292_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1292_cast_fp16 = einsum(equation = var_1292_equation_0, values = (var_1102_cast_fp16, var_1004_cast_fp16))[name = tensor<string, []>("op_1292_cast_fp16")];
+            tensor<fp16, []> var_1293_to_fp16 = const()[name = tensor<string, []>("op_1293_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1294_cast_fp16 = mul(x = var_1292_cast_fp16, y = var_1293_to_fp16)[name = tensor<string, []>("op_1294_cast_fp16")];
+            tensor<string, []> var_1296_equation_0 = const()[name = tensor<string, []>("op_1296_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1296_cast_fp16 = einsum(equation = var_1296_equation_0, values = (var_1102_cast_fp16, var_1008_cast_fp16))[name = tensor<string, []>("op_1296_cast_fp16")];
+            tensor<fp16, []> var_1297_to_fp16 = const()[name = tensor<string, []>("op_1297_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1298_cast_fp16 = mul(x = var_1296_cast_fp16, y = var_1297_to_fp16)[name = tensor<string, []>("op_1298_cast_fp16")];
+            tensor<string, []> var_1300_equation_0 = const()[name = tensor<string, []>("op_1300_equation_0"), val = tensor<string, []>("bkhc,bchq->bkhq")];
+            tensor<fp16, [1, 512, 1, 64]> var_1300_cast_fp16 = einsum(equation = var_1300_equation_0, values = (var_1102_cast_fp16, var_1012_cast_fp16))[name = tensor<string, []>("op_1300_cast_fp16")];
+            tensor<fp16, []> var_1301_to_fp16 = const()[name = tensor<string, []>("op_1301_to_fp16"), val = tensor<fp16, []>(0x1.6ap-4)];
+            tensor<fp16, [1, 512, 1, 64]> var_1302_cast_fp16 = mul(x = var_1300_cast_fp16, y = var_1301_to_fp16)[name = tensor<string, []>("op_1302_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_49_cast_fp16 = add(x = var_1210_cast_fp16, y = mask)[name = tensor<string, []>("aw_49_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_51_cast_fp16 = add(x = var_1214_cast_fp16, y = mask)[name = tensor<string, []>("aw_51_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_53_cast_fp16 = add(x = var_1218_cast_fp16, y = mask)[name = tensor<string, []>("aw_53_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_55_cast_fp16 = add(x = var_1222_cast_fp16, y = mask)[name = tensor<string, []>("aw_55_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_57_cast_fp16 = add(x = var_1226_cast_fp16, y = mask)[name = tensor<string, []>("aw_57_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_59_cast_fp16 = add(x = var_1230_cast_fp16, y = mask)[name = tensor<string, []>("aw_59_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_61_cast_fp16 = add(x = var_1234_cast_fp16, y = mask)[name = tensor<string, []>("aw_61_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_63_cast_fp16 = add(x = var_1238_cast_fp16, y = mask)[name = tensor<string, []>("aw_63_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_65_cast_fp16 = add(x = var_1242_cast_fp16, y = mask)[name = tensor<string, []>("aw_65_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_67_cast_fp16 = add(x = var_1246_cast_fp16, y = mask)[name = tensor<string, []>("aw_67_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_69_cast_fp16 = add(x = var_1250_cast_fp16, y = mask)[name = tensor<string, []>("aw_69_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_71_cast_fp16 = add(x = var_1254_cast_fp16, y = mask)[name = tensor<string, []>("aw_71_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_73_cast_fp16 = add(x = var_1258_cast_fp16, y = mask)[name = tensor<string, []>("aw_73_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_75_cast_fp16 = add(x = var_1262_cast_fp16, y = mask)[name = tensor<string, []>("aw_75_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_77_cast_fp16 = add(x = var_1266_cast_fp16, y = mask)[name = tensor<string, []>("aw_77_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_79_cast_fp16 = add(x = var_1270_cast_fp16, y = mask)[name = tensor<string, []>("aw_79_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_81_cast_fp16 = add(x = var_1274_cast_fp16, y = mask)[name = tensor<string, []>("aw_81_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_83_cast_fp16 = add(x = var_1278_cast_fp16, y = mask)[name = tensor<string, []>("aw_83_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_85_cast_fp16 = add(x = var_1282_cast_fp16, y = mask)[name = tensor<string, []>("aw_85_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_87_cast_fp16 = add(x = var_1286_cast_fp16, y = mask)[name = tensor<string, []>("aw_87_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_89_cast_fp16 = add(x = var_1290_cast_fp16, y = mask)[name = tensor<string, []>("aw_89_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_91_cast_fp16 = add(x = var_1294_cast_fp16, y = mask)[name = tensor<string, []>("aw_91_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_93_cast_fp16 = add(x = var_1298_cast_fp16, y = mask)[name = tensor<string, []>("aw_93_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> aw_cast_fp16 = add(x = var_1302_cast_fp16, y = mask)[name = tensor<string, []>("aw_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1327_cast_fp16 = softmax(axis = var_779, x = aw_49_cast_fp16)[name = tensor<string, []>("op_1327_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1328_cast_fp16 = softmax(axis = var_779, x = aw_51_cast_fp16)[name = tensor<string, []>("op_1328_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1329_cast_fp16 = softmax(axis = var_779, x = aw_53_cast_fp16)[name = tensor<string, []>("op_1329_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1330_cast_fp16 = softmax(axis = var_779, x = aw_55_cast_fp16)[name = tensor<string, []>("op_1330_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1331_cast_fp16 = softmax(axis = var_779, x = aw_57_cast_fp16)[name = tensor<string, []>("op_1331_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1332_cast_fp16 = softmax(axis = var_779, x = aw_59_cast_fp16)[name = tensor<string, []>("op_1332_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1333_cast_fp16 = softmax(axis = var_779, x = aw_61_cast_fp16)[name = tensor<string, []>("op_1333_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1334_cast_fp16 = softmax(axis = var_779, x = aw_63_cast_fp16)[name = tensor<string, []>("op_1334_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1335_cast_fp16 = softmax(axis = var_779, x = aw_65_cast_fp16)[name = tensor<string, []>("op_1335_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1336_cast_fp16 = softmax(axis = var_779, x = aw_67_cast_fp16)[name = tensor<string, []>("op_1336_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1337_cast_fp16 = softmax(axis = var_779, x = aw_69_cast_fp16)[name = tensor<string, []>("op_1337_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1338_cast_fp16 = softmax(axis = var_779, x = aw_71_cast_fp16)[name = tensor<string, []>("op_1338_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1339_cast_fp16 = softmax(axis = var_779, x = aw_73_cast_fp16)[name = tensor<string, []>("op_1339_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1340_cast_fp16 = softmax(axis = var_779, x = aw_75_cast_fp16)[name = tensor<string, []>("op_1340_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1341_cast_fp16 = softmax(axis = var_779, x = aw_77_cast_fp16)[name = tensor<string, []>("op_1341_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1342_cast_fp16 = softmax(axis = var_779, x = aw_79_cast_fp16)[name = tensor<string, []>("op_1342_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1343_cast_fp16 = softmax(axis = var_779, x = aw_81_cast_fp16)[name = tensor<string, []>("op_1343_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1344_cast_fp16 = softmax(axis = var_779, x = aw_83_cast_fp16)[name = tensor<string, []>("op_1344_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1345_cast_fp16 = softmax(axis = var_779, x = aw_85_cast_fp16)[name = tensor<string, []>("op_1345_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1346_cast_fp16 = softmax(axis = var_779, x = aw_87_cast_fp16)[name = tensor<string, []>("op_1346_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1347_cast_fp16 = softmax(axis = var_779, x = aw_89_cast_fp16)[name = tensor<string, []>("op_1347_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1348_cast_fp16 = softmax(axis = var_779, x = aw_91_cast_fp16)[name = tensor<string, []>("op_1348_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1349_cast_fp16 = softmax(axis = var_779, x = aw_93_cast_fp16)[name = tensor<string, []>("op_1349_cast_fp16")];
+            tensor<fp16, [1, 512, 1, 64]> var_1350_cast_fp16 = softmax(axis = var_779, x = aw_cast_fp16)[name = tensor<string, []>("op_1350_cast_fp16")];
+            tensor<string, []> var_1352_equation_0 = const()[name = tensor<string, []>("op_1352_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1352_cast_fp16 = einsum(equation = var_1352_equation_0, values = (var_1112_cast_fp16, var_1327_cast_fp16))[name = tensor<string, []>("op_1352_cast_fp16")];
+            tensor<string, []> var_1354_equation_0 = const()[name = tensor<string, []>("op_1354_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1354_cast_fp16 = einsum(equation = var_1354_equation_0, values = (var_1112_cast_fp16, var_1328_cast_fp16))[name = tensor<string, []>("op_1354_cast_fp16")];
+            tensor<string, []> var_1356_equation_0 = const()[name = tensor<string, []>("op_1356_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1356_cast_fp16 = einsum(equation = var_1356_equation_0, values = (var_1112_cast_fp16, var_1329_cast_fp16))[name = tensor<string, []>("op_1356_cast_fp16")];
+            tensor<string, []> var_1358_equation_0 = const()[name = tensor<string, []>("op_1358_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1358_cast_fp16 = einsum(equation = var_1358_equation_0, values = (var_1124_cast_fp16, var_1330_cast_fp16))[name = tensor<string, []>("op_1358_cast_fp16")];
+            tensor<string, []> var_1360_equation_0 = const()[name = tensor<string, []>("op_1360_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1360_cast_fp16 = einsum(equation = var_1360_equation_0, values = (var_1124_cast_fp16, var_1331_cast_fp16))[name = tensor<string, []>("op_1360_cast_fp16")];
+            tensor<string, []> var_1362_equation_0 = const()[name = tensor<string, []>("op_1362_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1362_cast_fp16 = einsum(equation = var_1362_equation_0, values = (var_1124_cast_fp16, var_1332_cast_fp16))[name = tensor<string, []>("op_1362_cast_fp16")];
+            tensor<string, []> var_1364_equation_0 = const()[name = tensor<string, []>("op_1364_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1364_cast_fp16 = einsum(equation = var_1364_equation_0, values = (var_1136_cast_fp16, var_1333_cast_fp16))[name = tensor<string, []>("op_1364_cast_fp16")];
+            tensor<string, []> var_1366_equation_0 = const()[name = tensor<string, []>("op_1366_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1366_cast_fp16 = einsum(equation = var_1366_equation_0, values = (var_1136_cast_fp16, var_1334_cast_fp16))[name = tensor<string, []>("op_1366_cast_fp16")];
+            tensor<string, []> var_1368_equation_0 = const()[name = tensor<string, []>("op_1368_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1368_cast_fp16 = einsum(equation = var_1368_equation_0, values = (var_1136_cast_fp16, var_1335_cast_fp16))[name = tensor<string, []>("op_1368_cast_fp16")];
+            tensor<string, []> var_1370_equation_0 = const()[name = tensor<string, []>("op_1370_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1370_cast_fp16 = einsum(equation = var_1370_equation_0, values = (var_1148_cast_fp16, var_1336_cast_fp16))[name = tensor<string, []>("op_1370_cast_fp16")];
+            tensor<string, []> var_1372_equation_0 = const()[name = tensor<string, []>("op_1372_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1372_cast_fp16 = einsum(equation = var_1372_equation_0, values = (var_1148_cast_fp16, var_1337_cast_fp16))[name = tensor<string, []>("op_1372_cast_fp16")];
+            tensor<string, []> var_1374_equation_0 = const()[name = tensor<string, []>("op_1374_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1374_cast_fp16 = einsum(equation = var_1374_equation_0, values = (var_1148_cast_fp16, var_1338_cast_fp16))[name = tensor<string, []>("op_1374_cast_fp16")];
+            tensor<string, []> var_1376_equation_0 = const()[name = tensor<string, []>("op_1376_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1376_cast_fp16 = einsum(equation = var_1376_equation_0, values = (var_1160_cast_fp16, var_1339_cast_fp16))[name = tensor<string, []>("op_1376_cast_fp16")];
+            tensor<string, []> var_1378_equation_0 = const()[name = tensor<string, []>("op_1378_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1378_cast_fp16 = einsum(equation = var_1378_equation_0, values = (var_1160_cast_fp16, var_1340_cast_fp16))[name = tensor<string, []>("op_1378_cast_fp16")];
+            tensor<string, []> var_1380_equation_0 = const()[name = tensor<string, []>("op_1380_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1380_cast_fp16 = einsum(equation = var_1380_equation_0, values = (var_1160_cast_fp16, var_1341_cast_fp16))[name = tensor<string, []>("op_1380_cast_fp16")];
+            tensor<string, []> var_1382_equation_0 = const()[name = tensor<string, []>("op_1382_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1382_cast_fp16 = einsum(equation = var_1382_equation_0, values = (var_1172_cast_fp16, var_1342_cast_fp16))[name = tensor<string, []>("op_1382_cast_fp16")];
+            tensor<string, []> var_1384_equation_0 = const()[name = tensor<string, []>("op_1384_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1384_cast_fp16 = einsum(equation = var_1384_equation_0, values = (var_1172_cast_fp16, var_1343_cast_fp16))[name = tensor<string, []>("op_1384_cast_fp16")];
+            tensor<string, []> var_1386_equation_0 = const()[name = tensor<string, []>("op_1386_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1386_cast_fp16 = einsum(equation = var_1386_equation_0, values = (var_1172_cast_fp16, var_1344_cast_fp16))[name = tensor<string, []>("op_1386_cast_fp16")];
+            tensor<string, []> var_1388_equation_0 = const()[name = tensor<string, []>("op_1388_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1388_cast_fp16 = einsum(equation = var_1388_equation_0, values = (var_1184_cast_fp16, var_1345_cast_fp16))[name = tensor<string, []>("op_1388_cast_fp16")];
+            tensor<string, []> var_1390_equation_0 = const()[name = tensor<string, []>("op_1390_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1390_cast_fp16 = einsum(equation = var_1390_equation_0, values = (var_1184_cast_fp16, var_1346_cast_fp16))[name = tensor<string, []>("op_1390_cast_fp16")];
+            tensor<string, []> var_1392_equation_0 = const()[name = tensor<string, []>("op_1392_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1392_cast_fp16 = einsum(equation = var_1392_equation_0, values = (var_1184_cast_fp16, var_1347_cast_fp16))[name = tensor<string, []>("op_1392_cast_fp16")];
+            tensor<string, []> var_1394_equation_0 = const()[name = tensor<string, []>("op_1394_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1394_cast_fp16 = einsum(equation = var_1394_equation_0, values = (var_1196_cast_fp16, var_1348_cast_fp16))[name = tensor<string, []>("op_1394_cast_fp16")];
+            tensor<string, []> var_1396_equation_0 = const()[name = tensor<string, []>("op_1396_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1396_cast_fp16 = einsum(equation = var_1396_equation_0, values = (var_1196_cast_fp16, var_1349_cast_fp16))[name = tensor<string, []>("op_1396_cast_fp16")];
+            tensor<string, []> var_1398_equation_0 = const()[name = tensor<string, []>("op_1398_equation_0"), val = tensor<string, []>("bchk,bkhq->bchq")];
+            tensor<fp16, [1, 128, 1, 64]> var_1398_cast_fp16 = einsum(equation = var_1398_equation_0, values = (var_1196_cast_fp16, var_1350_cast_fp16))[name = tensor<string, []>("op_1398_cast_fp16")];
+            tensor<bool, []> x_27_interleave_0 = const()[name = tensor<string, []>("x_27_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 3072, 1, 64]> x_27_cast_fp16 = concat(axis = var_779, interleave = x_27_interleave_0, values = (var_1352_cast_fp16, var_1354_cast_fp16, var_1356_cast_fp16, var_1358_cast_fp16, var_1360_cast_fp16, var_1362_cast_fp16, var_1364_cast_fp16, var_1366_cast_fp16, var_1368_cast_fp16, var_1370_cast_fp16, var_1372_cast_fp16, var_1374_cast_fp16, var_1376_cast_fp16, var_1378_cast_fp16, var_1380_cast_fp16, var_1382_cast_fp16, var_1384_cast_fp16, var_1386_cast_fp16, var_1388_cast_fp16, var_1390_cast_fp16, var_1392_cast_fp16, var_1394_cast_fp16, var_1396_cast_fp16, var_1398_cast_fp16))[name = tensor<string, []>("x_27_cast_fp16")];
+            tensor<int32, [4]> var_1403 = const()[name = tensor<string, []>("op_1403"), val = tensor<int32, [4]>([1, 3072, -1, 8])];
+            tensor<fp16, [1, 3072, 8, 8]> input_13_cast_fp16 = reshape(shape = var_1403, x = x_27_cast_fp16)[name = tensor<string, []>("input_13_cast_fp16")];
+            tensor<int32, [2]> var_1406 = const()[name = tensor<string, []>("op_1406"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1408 = const()[name = tensor<string, []>("op_1408"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> attention_output_pad_type_0 = const()[name = tensor<string, []>("attention_output_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> attention_output_pad_0 = const()[name = tensor<string, []>("attention_output_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 3072, 1, 1]> blocks_1_attn_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_attn_proj_weight_to_fp16"), val = tensor<fp16, [3072, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(232803776)))];
+            tensor<fp16, [1, 3072, 8, 8]> attention_output_cast_fp16 = conv(dilations = var_1408, groups = var_779, pad = attention_output_pad_0, pad_type = attention_output_pad_type_0, strides = var_1406, weight = blocks_1_attn_proj_weight_to_fp16, x = input_13_cast_fp16)[name = tensor<string, []>("attention_output_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_29_cast_fp16 = add(x = attention_output_cast_fp16, y = x_17_cast_fp16)[name = tensor<string, []>("x_29_cast_fp16")];
+            tensor<bool, []> x_eps_interleave_0 = const()[name = tensor<string, []>("x_eps_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1, 8, 8]> eps_chan_to_fp16 = const()[name = tensor<string, []>("eps_chan_to_fp16"), val = tensor<fp16, [1, 1, 8, 8]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678208)))];
+            tensor<fp16, [1, 3073, 8, 8]> x_eps_cast_fp16 = concat(axis = var_779, interleave = x_eps_interleave_0, values = (x_29_cast_fp16, eps_chan_to_fp16))[name = tensor<string, []>("x_eps_cast_fp16")];
+            tensor<int32, [1]> norm_x_axes_0 = const()[name = tensor<string, []>("norm_x_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 8, 8]> norm_x_cast_fp16 = reduce_l2_norm(axes = norm_x_axes_0, keep_dims = var_782, x = x_eps_cast_fp16)[name = tensor<string, []>("norm_x_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_19_cast_fp16 = real_div(x = x_29_cast_fp16, y = norm_x_cast_fp16)[name = tensor<string, []>("x_normed_19_cast_fp16")];
+            tensor<fp16, []> var_1434_to_fp16 = const()[name = tensor<string, []>("op_1434_to_fp16"), val = tensor<fp16, []>(0x1.bb8p+5)];
+            tensor<fp16, [1, 3072, 8, 8]> x_normed_21_cast_fp16 = mul(x = x_normed_19_cast_fp16, y = var_1434_to_fp16)[name = tensor<string, []>("x_normed_21_cast_fp16")];
+            tensor<fp16, [1, 3072, 1, 1]> blocks_1_norm_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_norm_2_weight_to_fp16"), val = tensor<fp16, [1, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251678400)))];
+            tensor<fp16, [1, 3072, 8, 8]> input_15_cast_fp16 = mul(x = x_normed_21_cast_fp16, y = blocks_1_norm_2_weight_to_fp16)[name = tensor<string, []>("input_15_cast_fp16")];
+            tensor<int32, [2]> var_1445 = const()[name = tensor<string, []>("op_1445"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1447 = const()[name = tensor<string, []>("op_1447"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> input_17_pad_type_0 = const()[name = tensor<string, []>("input_17_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> input_17_pad_0 = const()[name = tensor<string, []>("input_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_1_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_1_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251684608)))];
+            tensor<fp16, [1, 8192, 8, 8]> input_17_cast_fp16 = conv(dilations = var_1447, groups = var_779, pad = input_17_pad_0, pad_type = input_17_pad_type_0, strides = var_1445, weight = blocks_1_mlp_fc_1_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("input_17_cast_fp16")];
+            tensor<int32, [2]> var_1451 = const()[name = tensor<string, []>("op_1451"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1453 = const()[name = tensor<string, []>("op_1453"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> x_fc_2_pad_type_0 = const()[name = tensor<string, []>("x_fc_2_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> x_fc_2_pad_0 = const()[name = tensor<string, []>("x_fc_2_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [8192, 3072, 1, 1]> blocks_1_mlp_fc_2_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_fc_2_weight_to_fp16"), val = tensor<fp16, [8192, 3072, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(302016320)))];
+            tensor<fp16, [1, 8192, 8, 8]> x_fc_2_cast_fp16 = conv(dilations = var_1453, groups = var_779, pad = x_fc_2_pad_0, pad_type = x_fc_2_pad_type_0, strides = var_1451, weight = blocks_1_mlp_fc_2_weight_to_fp16, x = input_15_cast_fp16)[name = tensor<string, []>("x_fc_2_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> var_1456_cast_fp16 = silu(x = input_17_cast_fp16)[name = tensor<string, []>("op_1456_cast_fp16")];
+            tensor<fp16, [1, 8192, 8, 8]> input_cast_fp16 = mul(x = var_1456_cast_fp16, y = x_fc_2_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
+            tensor<int32, [2]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [2]> var_1461 = const()[name = tensor<string, []>("op_1461"), val = tensor<int32, [2]>([1, 1])];
+            tensor<string, []> var_1463_pad_type_0 = const()[name = tensor<string, []>("op_1463_pad_type_0"), val = tensor<string, []>("custom")];
+            tensor<int32, [4]> var_1463_pad_0 = const()[name = tensor<string, []>("op_1463_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<fp16, [3072, 8192, 1, 1]> blocks_1_mlp_proj_weight_to_fp16 = const()[name = tensor<string, []>("blocks_1_mlp_proj_weight_to_fp16"), val = tensor<fp16, [3072, 8192, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(352348032)))];
+            tensor<fp16, [1, 3072, 8, 8]> var_1463_cast_fp16 = conv(dilations = var_1461, groups = var_779, pad = var_1463_pad_0, pad_type = var_1463_pad_type_0, strides = var_1459, weight = blocks_1_mlp_proj_weight_to_fp16, x = input_cast_fp16)[name = tensor<string, []>("op_1463_cast_fp16")];
+            tensor<fp16, [1, 3072, 8, 8]> new_x = add(x = var_1463_cast_fp16, y = x_29_cast_fp16)[name = tensor<string, []>("op_1464_cast_fp16")];
+        } -> (new_x, new_k_cache_0, new_v_cache_0, new_k_cache_1, new_v_cache_1);
+}
\ No newline at end of file
diff --git a/Llama-3.2-3B-Instruct_chunk9.mlmodelc/weights/weight.bin b/Llama-3.2-3B-Instruct_chunk9.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..62d28c7a01c5bae87fe29ec74613211c42ca660c
--- /dev/null
+++ b/Llama-3.2-3B-Instruct_chunk9.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22ea162c1647c03b50865bc80a45486ce5e1894f3a99303ffd2eb6130ddd03ea
+size 402679744
diff --git a/README.md b/README.md
index f56069a87407fbb7a92032413000c0bc73ba8e6f..e531936aeba496b6b6fcf20ef4293426582dc3ca 100644
--- a/README.md
+++ b/README.md
@@ -7,4 +7,7 @@ tags:
 ---
 CoreML conversion of Llama-3.2-3B-Instruct with a 512 context length. Optimized for Apple Neural Engine.
 
-Use [this CLI](https://github.com/smpanaro/coreml-llm-cli) to download and run inference. macOS 14 (Sonoma) is required.
\ No newline at end of file
+Use [this CLI](https://github.com/smpanaro/coreml-llm-cli) to download and run inference. macOS 14 (Sonoma) is required.
+
+> [!IMPORTANT]
+> This model will likely run slowly or not at all on M1 Macs and phones. Consider trying the 1B model for those devices: [smpanaro/Llama-3.2-1B-Instruct-CoreML](https://huggingface.co/smpanaro/Llama-3.2-1B-Instruct-CoreML)
diff --git a/cache-processor.mlmodelc/analytics/coremldata.bin b/cache-processor.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d9bd15c91118d80231cd2259c2b4c6c413d8a13f
--- /dev/null
+++ b/cache-processor.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8cf834d873b9ea34d7e6bb1e91b85769921caaefb423f8436aa4b9dd0df2e83
+size 243
diff --git a/cache-processor.mlmodelc/coremldata.bin b/cache-processor.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3f43a5cb71c8fa96309501d24c5f22312ac1829a
--- /dev/null
+++ b/cache-processor.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:68d2e54606b35f52fddfec1f3deac45d4faf67b1cf1361355809e2a70b28c854
+size 516
diff --git a/cache-processor.mlmodelc/metadata.json b/cache-processor.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..5ed61ea98bb3055414244128b7df7b0cb5490cae
--- /dev/null
+++ b/cache-processor.mlmodelc/metadata.json
@@ -0,0 +1,109 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "updated_k_cache",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "updated_v_cache",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16)",
+        "shortDescription" : "",
+        "shape" : "[]",
+        "name" : "ignore_me_im_only_here_so_this_runs_on_the_ane",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 7,
+    "mlProgramOperationTypeHistogram" : {
+      "SliceByIndex" : 2,
+      "Ios16.mul" : 1,
+      "Concat" : 2,
+      "Ios16.reduceMin" : 1
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "13.0",
+      "tvOS" : "16.0",
+      "visionOS" : "1.0",
+      "watchOS" : "9.0",
+      "iOS" : "16.0",
+      "macCatalyst" : "16.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.source" : "torch==2.1.0",
+      "com.github.apple.coremltools.version" : "8.0b1"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 448 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 448, 1, 1024]",
+        "name" : "old_k_cache",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 1 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 1, 1024]",
+        "name" : "new_k_cache",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 448)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 448]",
+        "name" : "old_v_cache",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1024 × 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 1024, 1, 64]",
+        "name" : "new_v_cache",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "cache_processor_l3_2_3b",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/cache-processor.mlmodelc/model.mil b/cache-processor.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..7b611aa68d184f0e9c256fd11c114a4d087f5db4
--- /dev/null
+++ b/cache-processor.mlmodelc/model.mil
@@ -0,0 +1,24 @@
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3304.5.2"}, {"coremlc-version", "3304.6.2"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
+{
+    func main<ios16>(tensor<fp16, [1, 64, 1, 1024]> new_k_cache, tensor<fp16, [1, 1024, 1, 64]> new_v_cache, tensor<fp16, [1, 448, 1, 1024]> old_k_cache, tensor<fp16, [1, 1024, 1, 448]> old_v_cache) {
+            tensor<int32, []> var_6 = const()[name = tensor<string, []>("op_6"), val = tensor<int32, []>(-3)];
+            tensor<bool, []> cat_k_1_interleave_0 = const()[name = tensor<string, []>("cat_k_1_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 512, 1, 1024]> cat_k_1_cast_fp16 = concat(axis = var_6, interleave = cat_k_1_interleave_0, values = (old_k_cache, new_k_cache))[name = tensor<string, []>("cat_k_1_cast_fp16")];
+            tensor<int32, []> var_9 = const()[name = tensor<string, []>("op_9"), val = tensor<int32, []>(-1)];
+            tensor<bool, []> cat_v_interleave_0 = const()[name = tensor<string, []>("cat_v_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 1024, 1, 512]> cat_v_cast_fp16 = concat(axis = var_9, interleave = cat_v_interleave_0, values = (old_v_cache, new_v_cache))[name = tensor<string, []>("cat_v_cast_fp16")];
+            tensor<int32, [4]> var_20_begin_0 = const()[name = tensor<string, []>("op_20_begin_0"), val = tensor<int32, [4]>([0, 64, 0, 0])];
+            tensor<int32, [4]> var_20_end_0 = const()[name = tensor<string, []>("op_20_end_0"), val = tensor<int32, [4]>([1, 3072, 1, 1024])];
+            tensor<bool, [4]> var_20_end_mask_0 = const()[name = tensor<string, []>("op_20_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 448, 1, 1024]> updated_k_cache = slice_by_index(begin = var_20_begin_0, end = var_20_end_0, end_mask = var_20_end_mask_0, x = cat_k_1_cast_fp16)[name = tensor<string, []>("op_20_cast_fp16")];
+            tensor<int32, [4]> var_50_begin_0 = const()[name = tensor<string, []>("op_50_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 64])];
+            tensor<int32, [4]> var_50_end_0 = const()[name = tensor<string, []>("op_50_end_0"), val = tensor<int32, [4]>([1, 1024, 1, 3072])];
+            tensor<bool, [4]> var_50_end_mask_0 = const()[name = tensor<string, []>("op_50_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 1024, 1, 448]> updated_v_cache = slice_by_index(begin = var_50_begin_0, end = var_50_end_0, end_mask = var_50_end_mask_0, x = cat_v_cast_fp16)[name = tensor<string, []>("op_50_cast_fp16")];
+            tensor<fp16, []> var_51_promoted_to_fp16 = const()[name = tensor<string, []>("op_51_promoted_to_fp16"), val = tensor<fp16, []>(0x1p+1)];
+            tensor<fp16, [1, 448, 1, 1024]> prod_cast_fp16 = mul(x = updated_k_cache, y = var_51_promoted_to_fp16)[name = tensor<string, []>("prod_cast_fp16")];
+            tensor<bool, []> var_53_keep_dims_0 = const()[name = tensor<string, []>("op_53_keep_dims_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, []> ignore_me_im_only_here_so_this_runs_on_the_ane = reduce_min(keep_dims = var_53_keep_dims_0, x = prod_cast_fp16)[name = tensor<string, []>("op_53_cast_fp16")];
+        } -> (updated_k_cache, updated_v_cache, ignore_me_im_only_here_so_this_runs_on_the_ane);
+}
\ No newline at end of file
diff --git a/logit-processor.mlmodelc/analytics/coremldata.bin b/logit-processor.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b2696a90e0dfd3e31204d90ad67fc865f4886cbe
--- /dev/null
+++ b/logit-processor.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0cea8f79e82c95d93f772797047802fb88c7fc82dfcef790e69a2f274a104623
+size 243
diff --git a/logit-processor.mlmodelc/coremldata.bin b/logit-processor.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..243bcbd5aeec1943b455bc39f3c486bac5ad9601
--- /dev/null
+++ b/logit-processor.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d70f289c1552a24ba6ca721405ea0653ac7125a91297ea3d32b30363c2afd3c
+size 503
diff --git a/logit-processor.mlmodelc/metadata.json b/logit-processor.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..3caa579651bae1c29f067eda4f9c2fcf2dfb4c5a
--- /dev/null
+++ b/logit-processor.mlmodelc/metadata.json
@@ -0,0 +1,130 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1 × 64)",
+        "shortDescription" : "",
+        "shape" : "[1, 64]",
+        "name" : "argmax",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 7,
+    "mlProgramOperationTypeHistogram" : {
+      "Ios16.add" : 7,
+      "Ios16.topk" : 9,
+      "Ios16.gatherAlongAxis" : 1,
+      "Concat" : 2,
+      "Squeeze" : 1
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "13.0",
+      "tvOS" : "16.0",
+      "visionOS" : "1.0",
+      "watchOS" : "9.0",
+      "iOS" : "16.0",
+      "macCatalyst" : "16.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.source_dialect" : "TorchScript",
+      "com.github.apple.coremltools.source" : "torch==2.1.0",
+      "com.github.apple.coremltools.version" : "8.0b1"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 16384)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 16384]",
+        "name" : "logits_0",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 16384)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 16384]",
+        "name" : "logits_1",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 16384)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 16384]",
+        "name" : "logits_2",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 16384)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 16384]",
+        "name" : "logits_3",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 16384)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 16384]",
+        "name" : "logits_4",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 16384)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 16384]",
+        "name" : "logits_5",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 16384)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 16384]",
+        "name" : "logits_6",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 64 × 13568)",
+        "shortDescription" : "",
+        "shape" : "[1, 64, 13568]",
+        "name" : "logits_7",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "split_logit_processor",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/logit-processor.mlmodelc/model.mil b/logit-processor.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..17fc7abf6fab73f2ad1cb8147b0931606300c73a
--- /dev/null
+++ b/logit-processor.mlmodelc/model.mil
@@ -0,0 +1,84 @@
+program(1.0)
+[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3304.5.2"}, {"coremlc-version", "3304.6.2"}, {"coremltools-component-torch", "2.1.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.0b1"}})]
+{
+    func main<ios16>(tensor<fp16, [1, 64, 16384]> logits_0, tensor<fp16, [1, 64, 16384]> logits_1, tensor<fp16, [1, 64, 16384]> logits_2, tensor<fp16, [1, 64, 16384]> logits_3, tensor<fp16, [1, 64, 16384]> logits_4, tensor<fp16, [1, 64, 16384]> logits_5, tensor<fp16, [1, 64, 16384]> logits_6, tensor<fp16, [1, 64, 13568]> logits_7) {
+            tensor<int32, [1]> chunk_size = const()[name = tensor<string, []>("chunk_size"), val = tensor<int32, [1]>([16384])];
+            tensor<int32, []> var_12 = const()[name = tensor<string, []>("op_12"), val = tensor<int32, []>(1)];
+            tensor<int32, []> var_16_axis_0 = const()[name = tensor<string, []>("op_16_axis_0"), val = tensor<int32, []>(-1)];
+            tensor<bool, []> var_16_ascending_0 = const()[name = tensor<string, []>("op_16_ascending_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_16_sort_0 = const()[name = tensor<string, []>("op_16_sort_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_16_return_indices_0 = const()[name = tensor<string, []>("op_16_return_indices_0"), val = tensor<bool, []>(true)];
+            tensor<fp16, [1, 64, 1]> var_16_cast_fp16_0, tensor<int32, [1, 64, 1]> var_16_cast_fp16_1 = topk(ascending = var_16_ascending_0, axis = var_16_axis_0, k = var_12, return_indices = var_16_return_indices_0, sort = var_16_sort_0, x = logits_0)[name = tensor<string, []>("op_16_cast_fp16")];
+            tensor<int32, []> var_22 = const()[name = tensor<string, []>("op_22"), val = tensor<int32, []>(1)];
+            tensor<int32, []> var_26_axis_0 = const()[name = tensor<string, []>("op_26_axis_0"), val = tensor<int32, []>(-1)];
+            tensor<bool, []> var_26_ascending_0 = const()[name = tensor<string, []>("op_26_ascending_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_26_sort_0 = const()[name = tensor<string, []>("op_26_sort_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_26_return_indices_0 = const()[name = tensor<string, []>("op_26_return_indices_0"), val = tensor<bool, []>(true)];
+            tensor<fp16, [1, 64, 1]> var_26_cast_fp16_0, tensor<int32, [1, 64, 1]> var_26_cast_fp16_1 = topk(ascending = var_26_ascending_0, axis = var_26_axis_0, k = var_22, return_indices = var_26_return_indices_0, sort = var_26_sort_0, x = logits_1)[name = tensor<string, []>("op_26_cast_fp16")];
+            tensor<int32, [1, 64, 1]> var_31 = add(x = var_26_cast_fp16_1, y = chunk_size)[name = tensor<string, []>("op_31")];
+            tensor<int32, []> var_32 = const()[name = tensor<string, []>("op_32"), val = tensor<int32, []>(1)];
+            tensor<int32, []> var_36_axis_0 = const()[name = tensor<string, []>("op_36_axis_0"), val = tensor<int32, []>(-1)];
+            tensor<bool, []> var_36_ascending_0 = const()[name = tensor<string, []>("op_36_ascending_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_36_sort_0 = const()[name = tensor<string, []>("op_36_sort_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_36_return_indices_0 = const()[name = tensor<string, []>("op_36_return_indices_0"), val = tensor<bool, []>(true)];
+            tensor<fp16, [1, 64, 1]> var_36_cast_fp16_0, tensor<int32, [1, 64, 1]> var_36_cast_fp16_1 = topk(ascending = var_36_ascending_0, axis = var_36_axis_0, k = var_32, return_indices = var_36_return_indices_0, sort = var_36_sort_0, x = logits_2)[name = tensor<string, []>("op_36_cast_fp16")];
+            tensor<int32, [1]> var_39 = const()[name = tensor<string, []>("op_39"), val = tensor<int32, [1]>([32768])];
+            tensor<int32, [1, 64, 1]> var_41 = add(x = var_36_cast_fp16_1, y = var_39)[name = tensor<string, []>("op_41")];
+            tensor<int32, []> var_42 = const()[name = tensor<string, []>("op_42"), val = tensor<int32, []>(1)];
+            tensor<int32, []> var_46_axis_0 = const()[name = tensor<string, []>("op_46_axis_0"), val = tensor<int32, []>(-1)];
+            tensor<bool, []> var_46_ascending_0 = const()[name = tensor<string, []>("op_46_ascending_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_46_sort_0 = const()[name = tensor<string, []>("op_46_sort_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_46_return_indices_0 = const()[name = tensor<string, []>("op_46_return_indices_0"), val = tensor<bool, []>(true)];
+            tensor<fp16, [1, 64, 1]> var_46_cast_fp16_0, tensor<int32, [1, 64, 1]> var_46_cast_fp16_1 = topk(ascending = var_46_ascending_0, axis = var_46_axis_0, k = var_42, return_indices = var_46_return_indices_0, sort = var_46_sort_0, x = logits_3)[name = tensor<string, []>("op_46_cast_fp16")];
+            tensor<int32, [1]> var_49 = const()[name = tensor<string, []>("op_49"), val = tensor<int32, [1]>([49152])];
+            tensor<int32, [1, 64, 1]> var_51 = add(x = var_46_cast_fp16_1, y = var_49)[name = tensor<string, []>("op_51")];
+            tensor<int32, []> var_52 = const()[name = tensor<string, []>("op_52"), val = tensor<int32, []>(1)];
+            tensor<int32, []> var_56_axis_0 = const()[name = tensor<string, []>("op_56_axis_0"), val = tensor<int32, []>(-1)];
+            tensor<bool, []> var_56_ascending_0 = const()[name = tensor<string, []>("op_56_ascending_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_56_sort_0 = const()[name = tensor<string, []>("op_56_sort_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_56_return_indices_0 = const()[name = tensor<string, []>("op_56_return_indices_0"), val = tensor<bool, []>(true)];
+            tensor<fp16, [1, 64, 1]> var_56_cast_fp16_0, tensor<int32, [1, 64, 1]> var_56_cast_fp16_1 = topk(ascending = var_56_ascending_0, axis = var_56_axis_0, k = var_52, return_indices = var_56_return_indices_0, sort = var_56_sort_0, x = logits_4)[name = tensor<string, []>("op_56_cast_fp16")];
+            tensor<int32, [1]> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<int32, [1]>([65536])];
+            tensor<int32, [1, 64, 1]> var_61 = add(x = var_56_cast_fp16_1, y = var_59)[name = tensor<string, []>("op_61")];
+            tensor<int32, []> var_62 = const()[name = tensor<string, []>("op_62"), val = tensor<int32, []>(1)];
+            tensor<int32, []> var_66_axis_0 = const()[name = tensor<string, []>("op_66_axis_0"), val = tensor<int32, []>(-1)];
+            tensor<bool, []> var_66_ascending_0 = const()[name = tensor<string, []>("op_66_ascending_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_66_sort_0 = const()[name = tensor<string, []>("op_66_sort_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_66_return_indices_0 = const()[name = tensor<string, []>("op_66_return_indices_0"), val = tensor<bool, []>(true)];
+            tensor<fp16, [1, 64, 1]> var_66_cast_fp16_0, tensor<int32, [1, 64, 1]> var_66_cast_fp16_1 = topk(ascending = var_66_ascending_0, axis = var_66_axis_0, k = var_62, return_indices = var_66_return_indices_0, sort = var_66_sort_0, x = logits_5)[name = tensor<string, []>("op_66_cast_fp16")];
+            tensor<int32, [1]> var_69 = const()[name = tensor<string, []>("op_69"), val = tensor<int32, [1]>([81920])];
+            tensor<int32, [1, 64, 1]> var_71 = add(x = var_66_cast_fp16_1, y = var_69)[name = tensor<string, []>("op_71")];
+            tensor<int32, []> var_72 = const()[name = tensor<string, []>("op_72"), val = tensor<int32, []>(1)];
+            tensor<int32, []> var_76_axis_0 = const()[name = tensor<string, []>("op_76_axis_0"), val = tensor<int32, []>(-1)];
+            tensor<bool, []> var_76_ascending_0 = const()[name = tensor<string, []>("op_76_ascending_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_76_sort_0 = const()[name = tensor<string, []>("op_76_sort_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_76_return_indices_0 = const()[name = tensor<string, []>("op_76_return_indices_0"), val = tensor<bool, []>(true)];
+            tensor<fp16, [1, 64, 1]> var_76_cast_fp16_0, tensor<int32, [1, 64, 1]> var_76_cast_fp16_1 = topk(ascending = var_76_ascending_0, axis = var_76_axis_0, k = var_72, return_indices = var_76_return_indices_0, sort = var_76_sort_0, x = logits_6)[name = tensor<string, []>("op_76_cast_fp16")];
+            tensor<int32, [1]> var_79 = const()[name = tensor<string, []>("op_79"), val = tensor<int32, [1]>([98304])];
+            tensor<int32, [1, 64, 1]> var_81 = add(x = var_76_cast_fp16_1, y = var_79)[name = tensor<string, []>("op_81")];
+            tensor<int32, []> var_82 = const()[name = tensor<string, []>("op_82"), val = tensor<int32, []>(1)];
+            tensor<int32, []> cv_axis_0 = const()[name = tensor<string, []>("cv_axis_0"), val = tensor<int32, []>(-1)];
+            tensor<bool, []> cv_ascending_0 = const()[name = tensor<string, []>("cv_ascending_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> cv_sort_0 = const()[name = tensor<string, []>("cv_sort_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> cv_return_indices_0 = const()[name = tensor<string, []>("cv_return_indices_0"), val = tensor<bool, []>(true)];
+            tensor<fp16, [1, 64, 1]> cv_cast_fp16_0, tensor<int32, [1, 64, 1]> cv_cast_fp16_1 = topk(ascending = cv_ascending_0, axis = cv_axis_0, k = var_82, return_indices = cv_return_indices_0, sort = cv_sort_0, x = logits_7)[name = tensor<string, []>("cv_cast_fp16")];
+            tensor<int32, [1]> var_89 = const()[name = tensor<string, []>("op_89"), val = tensor<int32, [1]>([114688])];
+            tensor<int32, [1, 64, 1]> var_91 = add(x = cv_cast_fp16_1, y = var_89)[name = tensor<string, []>("op_91")];
+            tensor<int32, []> var_93 = const()[name = tensor<string, []>("op_93"), val = tensor<int32, []>(-1)];
+            tensor<bool, []> values_interleave_0 = const()[name = tensor<string, []>("values_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<fp16, [1, 64, 8]> values_cast_fp16 = concat(axis = var_93, interleave = values_interleave_0, values = (var_16_cast_fp16_0, var_26_cast_fp16_0, var_36_cast_fp16_0, var_46_cast_fp16_0, var_56_cast_fp16_0, var_66_cast_fp16_0, var_76_cast_fp16_0, cv_cast_fp16_0))[name = tensor<string, []>("values_cast_fp16")];
+            tensor<int32, []> var_96 = const()[name = tensor<string, []>("op_96"), val = tensor<int32, []>(-1)];
+            tensor<bool, []> indices_interleave_0 = const()[name = tensor<string, []>("indices_interleave_0"), val = tensor<bool, []>(false)];
+            tensor<int32, [1, 64, 8]> indices = concat(axis = var_96, interleave = indices_interleave_0, values = (var_16_cast_fp16_1, var_31, var_41, var_51, var_61, var_71, var_81, var_91))[name = tensor<string, []>("indices")];
+            tensor<int32, []> var_98 = const()[name = tensor<string, []>("op_98"), val = tensor<int32, []>(1)];
+            tensor<int32, []> var_102_axis_0 = const()[name = tensor<string, []>("op_102_axis_0"), val = tensor<int32, []>(-1)];
+            tensor<bool, []> var_102_ascending_0 = const()[name = tensor<string, []>("op_102_ascending_0"), val = tensor<bool, []>(false)];
+            tensor<bool, []> var_102_sort_0 = const()[name = tensor<string, []>("op_102_sort_0"), val = tensor<bool, []>(true)];
+            tensor<bool, []> var_102_return_indices_0 = const()[name = tensor<string, []>("op_102_return_indices_0"), val = tensor<bool, []>(true)];
+            tensor<fp16, [1, 64, 1]> var_102_cast_fp16_0, tensor<int32, [1, 64, 1]> var_102_cast_fp16_1 = topk(ascending = var_102_ascending_0, axis = var_102_axis_0, k = var_98, return_indices = var_102_return_indices_0, sort = var_102_sort_0, x = values_cast_fp16)[name = tensor<string, []>("op_102_cast_fp16")];
+            tensor<int32, []> var_104 = const()[name = tensor<string, []>("op_104"), val = tensor<int32, []>(-1)];
+            tensor<int32, [1, 64, 1]> var_106 = gather_along_axis(axis = var_104, indices = var_102_cast_fp16_1, x = indices)[name = tensor<string, []>("op_106")];
+            tensor<int32, [1]> var_108_axes_0 = const()[name = tensor<string, []>("op_108_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<int32, [1, 64]> argmax = squeeze(axes = var_108_axes_0, x = var_106)[name = tensor<string, []>("op_108")];
+        } -> (argmax);
+}
\ No newline at end of file