From 690da2dc7d5ecfc71bdabe5162f174d291146337 Mon Sep 17 00:00:00 2001 From: Yuqi Xue Date: Mon, 12 Jan 2026 12:19:44 -0600 Subject: [PATCH 01/12] add more unit tests for npusim.backend.util and remove unused functions --- neusim/npusim/backend/tests/test_util.py | 121 +++++++++++++++++++++++ neusim/npusim/backend/util.py | 60 ----------- 2 files changed, 121 insertions(+), 60 deletions(-) diff --git a/neusim/npusim/backend/tests/test_util.py b/neusim/npusim/backend/tests/test_util.py index 7cdd045..998474d 100644 --- a/neusim/npusim/backend/tests/test_util.py +++ b/neusim/npusim/backend/tests/test_util.py @@ -1,7 +1,10 @@ from math import ceil import os import unittest +from unittest.mock import MagicMock, patch import neusim.npusim.backend.util as util_under_test +import neusim.npusim.frontend.Operator as Operator +import neusim.xla_hlo_parser.xla_hlo_structures as hlo_struct class TestNPUSimBackendUtil(unittest.TestCase): @@ -16,6 +19,8 @@ def test_get_size_bytes_from_dtype(self): ("int8", 1), ("int16", 2), ("int32", 4), + ("int64", 8), + ("float64", 8), ("BOOL", 1), ("DT_INT", 4), ("DT_FLOAT", 4), @@ -37,3 +42,119 @@ def test_get_factors(self): ] for n, expected_factors in test_cases: self.assertEqual(util_under_test.get_factors(n), expected_factors) + + def test_construct_hlo_instruction_from_node_cost(self): + # Mock node_cost as an Operator object + node_cost = MagicMock(spec=Operator.Operator) + # BatchMatMul requires rank 3 inputs: [Batch, M, K] and [Batch, K, N] -> [Batch, M, N] + node_cost.output_tensor_shape_str = "[f32:(1,16,64)]" + node_cost.input_tensor_shape_str = "f32:[1,16,128],f32:[1,128,64]" + node_cost.name = "test_op" + node_cost.config_str = "BatchMatMul(window={size=3x3 stride=1x1}, dim_labels=bmn->bn)" + + # Test basic construction + instruction = util_under_test.construct_hlo_instruction_from_node_cost(node_cost) + + self.assertIsInstance(instruction, hlo_struct.HLOInstruction) + self.assertEqual(instruction.result.name, "test_op") + 
self.assertEqual(instruction.result.type.shape, [1, 16, 64]) + self.assertEqual(len(instruction.operands), 2) + self.assertEqual(instruction.operands[0].type.shape, [1, 16, 128]) + self.assertEqual(instruction.operands[1].type.shape, [1, 128, 64]) + self.assertEqual(instruction.opcode, "convolution") + self.assertEqual(instruction.metadata["op_type"], "Einsum") + + # Test with dict input (should convert to Operator) + node_cost_dict = { + "output_tensor_shape_str": "[f32:(1,16,64)]", + "input_tensor_shape_str": "f32:[1,16,128],f32:[1,128,64]", + "name": "test_op_dict", + "config_str": "TestOp()", + "Fusion index": 0, "Description": "", "Config": "TestOp()", "Name": "test_op_dict", "OpType": "Other", + "Count": 1, "Bounded-by": "", "Execution time": 0, "Compute time": 0, "Memory time": 0, + "ICI/NVLink time": 0, "ICI/NVLink outbound traffic": 0, "ICI/NVLink inbound traffic": 0, + "Aggregated DCN time": 0, "PCIe time": 0, "MXU time": 0, "VPU time": 0, "Transpose time": 0, + "Permute time": 0, "Bytes accessed": 0, "Input Tensor Shapes": "f32:[1,16,128],f32:[1,128,64]", + "Output Tensor Shapes": "f32:(1,16,64)]", "FLOP Count": 0, "Op Name": "test_op_dict", "Op Code": "TestOp", "Weight Size": 0, + "parsed_op_type": "Other" + } + with patch('neusim.npusim.frontend.Operator.Operator.from_csv_dict') as mock_from_csv: + mock_op = MagicMock(spec=Operator.Operator) + mock_op.output_tensor_shape_str = "[f32:(1,16,64)]" + mock_op.input_tensor_shape_str = "f32:[1,16,128],f32:[1,128,64]" + mock_op.name = "test_op_dict" + mock_op.config_str = "TestOp()" + mock_from_csv.return_value = mock_op + + instruction = util_under_test.construct_hlo_instruction_from_node_cost(node_cost_dict) + self.assertEqual(instruction.result.name, "test_op_dict") + + # Test XlaEinsum + node_cost_einsum = MagicMock(spec=Operator.Operator) + node_cost_einsum.output_tensor_shape_str = "[f32:(1,1,16)]" + node_cost_einsum.input_tensor_shape_str = "f32:[1,1,128],f32:[128,16]" + node_cost_einsum.name = 
"einsum_op" + node_cost_einsum.config_str = "XlaEinsum(eq=bmk;kn->bmn, window={size=3x3})" + + instruction_einsum = util_under_test.construct_hlo_instruction_from_node_cost(node_cost_einsum) + self.assertEqual(instruction_einsum.opcode, "convolution") + self.assertEqual(instruction_einsum.metadata["op_type"], "Einsum") + # Check if axes were parsed + self.assertTrue(hasattr(instruction_einsum, "input_axes")) + self.assertTrue(hasattr(instruction_einsum, "output_axes")) + + # Test Conv2D + node_cost_conv = MagicMock(spec=Operator.Operator) + node_cost_conv.output_tensor_shape_str = "[f32:(1,32,32,16)]" + node_cost_conv.input_tensor_shape_str = "f32:[1,32,32,16]" + node_cost_conv.name = "conv_op" + node_cost_conv.config_str = "Conv2D(eq=b01f_01io->b01f, window={size=3x3 stride=1x1},)" + + instruction_conv = util_under_test.construct_hlo_instruction_from_node_cost(node_cost_conv) + self.assertEqual(instruction_conv.opcode, "convolution") + self.assertIn("dim_labels", instruction_conv.metadata) + self.assertIn("window", instruction_conv.metadata) + + def test_construct_hlo_module_from_node_costs(self): + node_costs = [] + for i in range(2): + mock_op = MagicMock(spec=Operator.Operator) + mock_op.output_tensor_shape_str = "[f32:(1,128)]" + mock_op.input_tensor_shape_str = "f32:[1,128]" + mock_op.name = f"op_{i}" + mock_op.config_str = "Op()" + node_costs.append(mock_op) + + module = util_under_test.construct_hlo_module_from_node_costs(node_costs, "test_module") + + self.assertIsInstance(module, hlo_struct.HLOModule) + self.assertEqual(module.name, "test_module") + self.assertEqual(len(module.ENTRY.instructions), 2) + self.assertEqual(module.ENTRY.instructions[0].result.name, "op_0") + self.assertEqual(module.ENTRY.instructions[1].result.name, "op_1") + + def test_get_total_execution_time_ns_from_ops(self): + mock_op1 = MagicMock() + mock_op1.stats.execution_time_ns = 100 + mock_op2 = MagicMock() + mock_op2.stats.execution_time_ns = 200 + + ops = [mock_op1, mock_op2] + 
total_time = util_under_test.get_total_execution_time_ns_from_ops(ops) + self.assertEqual(total_time, 300) + + def test_calculate_bandwidths(self): + # SA Bandwidth + # (128 + 128) * 2 * 8 * 1.75 = 256 * 16 * 1.75 = 4096 * 1.75 = 7168 + sa_bw = util_under_test.calculate_sa_bandwidth_GBps( + sa_input_width=128, sa_output_width=128, + data_type_size_bytes=2, num_sa=8, freq_GHz=1.75 + ) + self.assertAlmostEqual(sa_bw, 7168.0) + + # VPU Bandwidth + # 128 * 8 * 2 * 1.75 = 3584 + vpu_bw = util_under_test.calculate_vpu_bandwidth_GBps( + n_lanes=128, n_sublanes=8, n_ports=2, freq_GHz=1.75 + ) + self.assertAlmostEqual(vpu_bw, 3584.0) diff --git a/neusim/npusim/backend/util.py b/neusim/npusim/backend/util.py index 7091617..e93e333 100644 --- a/neusim/npusim/backend/util.py +++ b/neusim/npusim/backend/util.py @@ -165,66 +165,6 @@ def construct_hlo_module_from_node_costs(node_costs: Sequence[dict[str, Any] | O return module -def get_tfsim_node_costs(tfsim_dir: str, bn: str, bs: int, sa: int = 4, vu: int = 1) -> list[dict[str, Any]]: - if bn in ["llama13b"]: # llama2-13b from tf-sim analytical - node_cost_csv_path = os.path.join(tfsim_dir, f"xla_hlo_{bn}_{bs}", f"sa{sa}_vu{vu}", "JellyFish-TPU_Conv-Opt-4SA-4VU-LLaMA-13B-serving-fwd_bwd_ops.csv") - # elif bn in ["clip-vit", "vicuna13b", "seem", "lama", "gligen"]: # multimodal benchmarks - # node_cost_csv_path = os.path.join(tfsim_dir, f"xla_hlo_{bn}_{bs}", f"sa{sa}_vu{vu}", "cluster*", "node_costs.csv") - else: # other benchmarks from tf-sim - node_cost_csv_path = os.path.join(tfsim_dir, f"xla_hlo_{bn}_{bs}", f"sa{sa}_vu{vu}", "cluster*", "node_costs.csv") - nc_glob = glob.glob(node_cost_csv_path) - assert len(nc_glob) == 1, f"benchmark '{bn}.{bs}': node_costs.csv not found in directory {node_cost_csv_path}" - node_costs_file_path = nc_glob[0] - with open(node_costs_file_path, "r") as f: - reader = csv.DictReader(f) - node_costs = list(reader) - - # allocate new field names - def init_new_field(field_name, default_value = None): 
- if field_name not in nc: - nc[field_name] = default_value - for nc in node_costs: - init_new_field("parsed_op_type") - init_new_field("dim_labels") - init_new_field("tile_shapes") - init_new_field("num_tiles") - init_new_field("max_vmem_demand_bytes") - init_new_field("num_mxu_ops", 0) - init_new_field("einsum_B_size") - init_new_field("einsum_M_size") - init_new_field("einsum_N_size") - init_new_field("einsum_K_size") - - return node_costs - - -def get_top_level_node_op_name(node: dict[str, Any]) -> str: - '''return the op name of the top level node of @node''' - if node["Top Level Node"] == "True": - return node["Op Name"] - - name_dir = str(node["Op Name"]).split("/") - tln_name = name_dir[0] - - # hack for While node in transformer model - if "while" in tln_name: - return "While" - - return tln_name - - -def get_top_level_node(node: dict[str, Any], node_costs: list[dict[str, Any]]) -> dict[str, Any]: - '''return the top level node of @node''' - if node["Top Level Node"] == "True": - return node - - tln_name = get_top_level_node_op_name(node) - for nc in node_costs: - if nc["Op Name"] == tln_name: - return nc - raise ValueError(f"Top level node not found for node: {node}") - - def get_total_execution_time_ns_from_ops(node_costs: list[Operator.Operator]) -> int: '''return the total execution time in ns from @node_costs''' return sum([ From 1ecd513d85f5420c8ee5301f94b8686c04102ebe Mon Sep 17 00:00:00 2001 From: Yuqi Xue Date: Mon, 12 Jan 2026 13:41:09 -0600 Subject: [PATCH 02/12] move some functions from npusim_lib to util --- neusim/npusim/backend/npusim_lib.py | 57 ++---------------------- neusim/npusim/backend/tests/test_util.py | 23 ++++++++++ neusim/npusim/backend/util.py | 37 +++++++++++++++ 3 files changed, 64 insertions(+), 53 deletions(-) diff --git a/neusim/npusim/backend/npusim_lib.py b/neusim/npusim/backend/npusim_lib.py index f41b147..781a2ca 100644 --- a/neusim/npusim/backend/npusim_lib.py +++ b/neusim/npusim/backend/npusim_lib.py @@ -22,42 +22,6 
@@ ] -def parse_input_tensor_shapes(input_tensor_shape_str: str) -> Tuple[List[List[int]], List[str]]: - '''Returns (shape, dtype)''' - tensor_shape_regex_pattern = r",*DT_[A-Z|0-9]+:" - shapes = re.split(tensor_shape_regex_pattern, input_tensor_shape_str) - shapes = [x for x in shapes if len(x) > 0] - shapes = [x.removeprefix("[").removesuffix("]") for x in shapes] - shapes = [ - [int(y) for y in x.split(",")] - for x in shapes - ] - - dtype_regex_pattern = r"DT_[A-Z|0-9]+:" - dtypes = re.findall(dtype_regex_pattern, input_tensor_shape_str) - dtypes = [x.removesuffix(":") for x in dtypes] - - return shapes, dtypes - - -def parse_output_tensor_shapes(output_tensor_shape_str: str) -> Tuple[List[List[int]], List[str]]: - '''Returns (shape, dtype)''' - tensor_shape_regex_pattern = r",*DT_[A-Z|0-9]+:" - shapes = re.split(tensor_shape_regex_pattern, output_tensor_shape_str) - shapes = [x.removeprefix("[").removesuffix("]").removesuffix("],[") for x in shapes] - shapes = [x for x in shapes if len(x) > 0] - shapes = [ - [int(y) for y in x[1:-1].split(",")] # y is like "(1, 2, 3)" - for x in shapes - ] - - dtype_regex_pattern = r"DT_[A-Z|0-9]+:" - dtypes = re.findall(dtype_regex_pattern, output_tensor_shape_str) - dtypes = [x.removesuffix(":") for x in dtypes] - - return shapes, dtypes - - def parse_tensor_shapes_for_node_cost_conv(node_cost: Operator.Operator, hlo_module: hlo_struct.HLOModule) -> Tuple[hlo_struct.HLOInstruction, Operator.Operator]: # if node_cost.opcode not in ["convolution", "Conv2D", "Einsum", "MatMul", "BatchMatMulV2"]: if node_cost.opcode_type not in [Operator.OpcodeType.CONV2D, Operator.OpcodeType.EINSUM]: @@ -65,8 +29,8 @@ def parse_tensor_shapes_for_node_cost_conv(node_cost: Operator.Operator, hlo_mod assert isinstance(node_cost, (Operator.EinsumOperator, Operator.Conv2DOperator)), \ f"node_cost is not an Einsum or Conv2D operator: {node_cost}" - input_shapes, input_dtypes = parse_input_tensor_shapes(node_cost.input_tensor_shape_str) - 
output_shapes, output_dtypes = parse_output_tensor_shapes(node_cost.output_tensor_shape_str) + input_shapes, input_dtypes = util.parse_input_tensor_shapes(node_cost.input_tensor_shape_str) + output_shapes, output_dtypes = util.parse_output_tensor_shapes(node_cost.output_tensor_shape_str) assert len(output_shapes) == 1, f"Output tensor shapes not expected: {output_shapes}" op_name = node_cost.name.split("/")[-1] @@ -166,8 +130,8 @@ def try_check_einsum() -> bool: def parse_tensor_shapes_for_node_cost_default(node_cost: Operator.Operator, hlo_module: hlo_struct.HLOModule) -> tuple[hlo_struct.HLOInstruction, Operator.Operator]: - input_shapes, input_dtypes = parse_input_tensor_shapes(node_cost.input_tensor_shape_str) - output_shapes, output_dtypes = parse_output_tensor_shapes(node_cost.output_tensor_shape_str) + input_shapes, input_dtypes = util.parse_input_tensor_shapes(node_cost.input_tensor_shape_str) + output_shapes, output_dtypes = util.parse_output_tensor_shapes(node_cost.output_tensor_shape_str) assert len(output_shapes) == 1, f"Output tensor shapes not expected: {output_shapes}" op_name = node_cost.name.split("/")[-1] @@ -1570,19 +1534,6 @@ def compute_node_cost_compute_time( raise NotImplementedError(f"Op type {op_type} not supported") -# def compute_op_compute_time( -# I: hlo_struct.HLOInstruction, -# op: Operator.Operator, -# config: ChipConfig, -# ) -> tuple[int, int]: -# ''' -# Return (MXU, VPU) Time -# based on the op type and tensor shapes in @I and @op. 
-# ''' -# nc = Operator.to_csv_dict(op) -# return compute_node_cost_compute_time(I, nc, config) - - def update_node_cost_compute_time( I: hlo_struct.HLOInstruction, node_cost: dict[str, Any], diff --git a/neusim/npusim/backend/tests/test_util.py b/neusim/npusim/backend/tests/test_util.py index 998474d..ae26772 100644 --- a/neusim/npusim/backend/tests/test_util.py +++ b/neusim/npusim/backend/tests/test_util.py @@ -158,3 +158,26 @@ def test_calculate_bandwidths(self): n_lanes=128, n_sublanes=8, n_ports=2, freq_GHz=1.75 ) self.assertAlmostEqual(vpu_bw, 3584.0) + + def test_parse_input_tensor_shapes(self): + input_str = "DT_FLOAT:[1,128],DT_INT:[128,64]" + shapes, dtypes = util_under_test.parse_input_tensor_shapes(input_str) + self.assertEqual(shapes, [[1, 128], [128, 64]]) + self.assertEqual(dtypes, ["DT_FLOAT", "DT_INT"]) + + # Test single tensor + input_str_single = "DT_FLOAT:[1,128]" + shapes, dtypes = util_under_test.parse_input_tensor_shapes(input_str_single) + self.assertEqual(shapes, [[1, 128]]) + self.assertEqual(dtypes, ["DT_FLOAT"]) + + def test_parse_output_tensor_shapes(self): + output_str = "[DT_FLOAT:(1,128)]" + shapes, dtypes = util_under_test.parse_output_tensor_shapes(output_str) + self.assertEqual(shapes, [[1, 128]]) + self.assertEqual(dtypes, ["DT_FLOAT"]) + + output_str_multi = "[DT_FLOAT:(1,128),DT_INT:(128,64)]" + shapes, dtypes = util_under_test.parse_output_tensor_shapes(output_str_multi) + self.assertEqual(shapes, [[1, 128], [128, 64]]) + self.assertEqual(dtypes, ["DT_FLOAT", "DT_INT"]) diff --git a/neusim/npusim/backend/util.py b/neusim/npusim/backend/util.py index e93e333..537d269 100644 --- a/neusim/npusim/backend/util.py +++ b/neusim/npusim/backend/util.py @@ -3,6 +3,7 @@ import glob from math import ceil, sqrt import os +import re from typing import Any, Sequence import neusim.npusim.frontend.Operator as Operator @@ -10,6 +11,42 @@ import neusim.xla_hlo_parser.xla_hlo_structures as hlo_struct +def 
parse_input_tensor_shapes(input_tensor_shape_str: str) -> tuple[list[list[int]], list[str]]: + '''Returns (shape, dtype)''' + tensor_shape_regex_pattern = r",*DT_[A-Z|0-9]+:" + shapes = re.split(tensor_shape_regex_pattern, input_tensor_shape_str) + shapes = [x for x in shapes if len(x) > 0] + shapes = [x.removeprefix("[").removesuffix("]") for x in shapes] + shapes = [ + [int(y) for y in x.split(",")] + for x in shapes + ] + + dtype_regex_pattern = r"DT_[A-Z|0-9]+:" + dtypes = re.findall(dtype_regex_pattern, input_tensor_shape_str) + dtypes = [x.removesuffix(":") for x in dtypes] + + return shapes, dtypes + + +def parse_output_tensor_shapes(output_tensor_shape_str: str) -> tuple[list[list[int]], list[str]]: + '''Returns (shape, dtype)''' + tensor_shape_regex_pattern = r",*DT_[A-Z|0-9]+:" + shapes = re.split(tensor_shape_regex_pattern, output_tensor_shape_str) + shapes = [x.removeprefix("[").removesuffix("]").removesuffix("],[") for x in shapes] + shapes = [x for x in shapes if len(x) > 0] + shapes = [ + [int(y) for y in x[1:-1].split(",")] # y is like "(1, 2, 3)" + for x in shapes + ] + + dtype_regex_pattern = r"DT_[A-Z|0-9]+:" + dtypes = re.findall(dtype_regex_pattern, output_tensor_shape_str) + dtypes = [x.removesuffix(":") for x in dtypes] + + return shapes, dtypes + + def get_size_bytes_from_dtype(dtype: str) -> int: if "8" in dtype: return 1 From 9437318943b268cab3be9a815970deb392289499 Mon Sep 17 00:00:00 2001 From: Yuqi Xue Date: Mon, 12 Jan 2026 14:00:40 -0600 Subject: [PATCH 03/12] add unit tests for npusim.frontend.util --- neusim/npusim/backend/npusim_lib.py | 40 ------- .../{test_util.py => test_backend_util.py} | 0 .../frontend/tests/test_frontend_util.py | 104 ++++++++++++++++++ 3 files changed, 104 insertions(+), 40 deletions(-) rename neusim/npusim/backend/tests/{test_util.py => test_backend_util.py} (100%) create mode 100644 neusim/npusim/frontend/tests/test_frontend_util.py diff --git a/neusim/npusim/backend/npusim_lib.py 
b/neusim/npusim/backend/npusim_lib.py index 781a2ca..eb3c4c9 100644 --- a/neusim/npusim/backend/npusim_lib.py +++ b/neusim/npusim/backend/npusim_lib.py @@ -61,16 +61,6 @@ def parse_tensor_shapes_for_node_cost_conv(node_cost: Operator.Operator, hlo_mod ax.size = output_shapes[0][ax.index] ax.data_type = output_dtypes[0] - # # special hack for each model - # if "dlrm" in hlo_module.name: - # I.metadata["op_type"] = "MatMul" - # elif "bert" in hlo_module.name: - # if I.metadata["op_type"] == "unknown": - # I.metadata["op_type"] = "Einsum" - # elif "transformer" in hlo_module.name: - # if I.metadata["op_type"] == "unknown": - # I.metadata["op_type"] = "Einsum" - def try_check_einsum() -> bool: ''' try to check the parsed axes as an Einsum operator. @@ -105,27 +95,6 @@ def try_check_einsum() -> bool: # otherwise treat this op as a conv2d I.metadata["op_type"] = "Conv2D" - # # for MatMul: remove axes of size 1 that is used as placeholder in HLO - # if I.metadata["op_type"] in EINSUM_OP_TYPES + ["unknown"]: - # I.input_axes[0] = [ax for ax in I.input_axes[0] if ax.size != 1] - # I.input_axes[1] = [ax for ax in I.input_axes[1] if ax.size != 1] - # I.output_axes = [ax for ax in I.output_axes if ax.size != 1] - - # For Einsum/MatMul, axes with the same name have the same size; - # for Convolution, spatial0/1 axes have different sizes in kernel (input1). ? 
- # all_axes = I.input_axes[0] + I.input_axes[1] + I.output_axes - # axes_dict = {} - # for ax in all_axes: - # if ax.name not in axes_dict: - # axes_dict[ax.name] = ax.size - # else: - # if I.metadata["op_type"] in EINSUM_OP_TYPES: - # # For Einsum/MatMul, check axis sizes match - # assert axes_dict[ax.name] == ax.size, f"axes size not consistent: {ax.name}" - # else: - # # If not match, guess this is a Conv2D - # I.metadata["op_type"] = "Conv2D" - return I, node_cost @@ -169,15 +138,6 @@ def parse_tensor_shapes_for_node_cost(node_cost: Operator.Operator, hlo_module: return parse_tensor_shapes_for_node_cost_default(node_cost, hlo_module) -# def parse_tensor_shapes_for_op(op: Operator.Operator, hlo_module: hlo_struct.HLOModule) -> tuple[hlo_struct.HLOInstruction, Operator.Operator]: -# if op.opcode in ["convolution", "Conv2D", "Einsum", "MatMul", "BatchMatMulV2"]: -# I, op_dict = parse_tensor_shapes_for_node_cost_conv(Operator.to_csv_dict(op), hlo_module) -# else: -# I, op_dict = parse_tensor_shapes_for_node_cost_default(Operator.to_csv_dict(op), hlo_module) - -# return I, Operator.from_csv_dict(op_dict) - - def separate_axes_by_type_for_matmul( lhs_axes: Sequence[hlo_struct.HLOAxis], rhs_axes: Sequence[hlo_struct.HLOAxis], diff --git a/neusim/npusim/backend/tests/test_util.py b/neusim/npusim/backend/tests/test_backend_util.py similarity index 100% rename from neusim/npusim/backend/tests/test_util.py rename to neusim/npusim/backend/tests/test_backend_util.py diff --git a/neusim/npusim/frontend/tests/test_frontend_util.py b/neusim/npusim/frontend/tests/test_frontend_util.py new file mode 100644 index 0000000..ff38fe1 --- /dev/null +++ b/neusim/npusim/frontend/tests/test_frontend_util.py @@ -0,0 +1,104 @@ +import unittest +from unittest.mock import MagicMock, patch +import zipfile + +import neusim.npusim.frontend.util as util_under_test +from neusim.configs.models.ModelConfig import ModelConfig +from neusim.npusim.frontend.Operator import Operator + +class 
TestNPUSimFrontendUtil(unittest.TestCase): + def test_get_factors(self): + self.assertEqual(util_under_test.get_factors(1), [1]) + self.assertEqual(util_under_test.get_factors(6), [1, 2, 3, 6]) + self.assertEqual(util_under_test.get_factors(7), [1, 7]) + self.assertEqual(util_under_test.get_factors(12), [1, 2, 3, 4, 6, 12]) + + def test_prime_factorize(self): + self.assertEqual(util_under_test.prime_factorize(1), []) + self.assertEqual(util_under_test.prime_factorize(2), [2]) + self.assertEqual(util_under_test.prime_factorize(3), [3]) + self.assertEqual(util_under_test.prime_factorize(4), [2, 2]) + self.assertEqual(util_under_test.prime_factorize(6), [2, 3]) + self.assertEqual(util_under_test.prime_factorize(12), [2, 2, 3]) + self.assertEqual(util_under_test.prime_factorize(30), [2, 3, 5]) + + def test_split_parallelism_degree(self): + # p_degree=1, n_axes=1 -> [1] + self.assertEqual(util_under_test.split_parallelism_degree(1, 1), [1]) + # p_degree=4, n_axes=2 -> [2, 2] + self.assertEqual(util_under_test.split_parallelism_degree(4, 2), [2, 2]) + # p_degree=8, n_axes=3 -> [2, 2, 2] + self.assertEqual(util_under_test.split_parallelism_degree(8, 3), [2, 2, 2]) + # p_degree=6, n_axes=2 -> [2, 3] or [3, 2] + self.assertEqual(sorted(util_under_test.split_parallelism_degree(6, 2)), [2, 3]) + + def test_get_ICI_topology_from_num_chips(self): + config = MagicMock(spec=ModelConfig) + + # 2D case + config.ICI_topology = "MESH_2D" + config.num_chips = 4 + # expected [2, 2] + self.assertEqual(sorted(util_under_test.get_ICI_topology_from_num_chips(config)), [2, 2]) + + # 3D case + config.ICI_topology = "TORUS_3D" + config.num_chips = 8 + self.assertEqual(sorted(util_under_test.get_ICI_topology_from_num_chips(config)), [2, 2, 2]) + + def test_get_bisection_bw_per_chip_GBps(self): + config = MagicMock(spec=ModelConfig) + config.ici_bw_GBps = 100.0 + + with patch('neusim.npusim.frontend.util.get_ICI_topology_from_num_chips') as mock_topo: + # 1D case (forced by return value) + 
mock_topo.return_value = [4] + config.num_chips = 4 + bw, topo = util_under_test.get_bisection_bw_per_chip_GBps(config) + self.assertEqual(bw, 25.0) + self.assertEqual(topo, [4]) + + # 2D case + mock_topo.return_value = [4, 4] + config.num_chips = 16 + bw, topo = util_under_test.get_bisection_bw_per_chip_GBps(config) + self.assertEqual(bw, 25.0) + + # 3D case + mock_topo.return_value = [4, 4, 4] + config.num_chips = 64 + bw, topo = util_under_test.get_bisection_bw_per_chip_GBps(config) + self.assertEqual(bw, 50.0) + + def test_compute_component_slack_for_op(self): + op = MagicMock() + op.stats.execution_time_ns = 100 + op.stats.sa_time_ns = 80 + op.stats.vu_time_ns = 50 + op.stats.memory_time_ns = 100 + op.stats.ici_time_ns = 0 + op.stats.vmem_time_ns = 120 + + extras, ratios = util_under_test.compute_component_slack_for_op(op) + + self.assertEqual(extras["sa"], 20) + self.assertEqual(ratios["sa"], 0.25) # 20/80 + self.assertEqual(extras["vu"], 50) + self.assertEqual(ratios["vu"], 1.0) # 50/50 + self.assertEqual(extras["hbm"], 0) + self.assertEqual(ratios["hbm"], 0.0) + self.assertEqual(extras["ici"], 0) + self.assertEqual(ratios["ici"], 0.0) + self.assertEqual(extras["vmem"], 0) + self.assertEqual(ratios["vmem"], 0.0) + + def test_open_zip(self): + with patch('zipfile.ZipFile') as mock_zip: + # Test existing zip + util_under_test.open_zip("test.zip") + mock_zip.assert_called_with(file="test.zip", mode="r", compression=zipfile.ZIP_DEFLATED) + + # Test non-existing file (adds .zip) + with patch('os.path.exists', return_value=False): + util_under_test.open_zip("new_file", mode="w") + mock_zip.assert_called_with(file="new_file.zip", mode="w", compression=zipfile.ZIP_DEFLATED) From 082d9e2cc77f9e3ac97fd59c8242aa734b36971a Mon Sep 17 00:00:00 2001 From: Yuqi Xue Date: Mon, 12 Jan 2026 14:18:48 -0600 Subject: [PATCH 04/12] add linting, format check, and github workflows --- .github/workflows/lint.yml | 19 +++++++++++++++++++ .github/workflows/test.yml | 19 
+++++++++++++++++++ lint.sh | 11 +++++++++++ pyproject.toml | 18 ++++++++++++++++-- 4 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/lint.yml create mode 100644 .github/workflows/test.yml create mode 100755 lint.sh diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..a7eb94d --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,19 @@ +name: Lint + +on: [push, pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12.2' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e .[dev] + - name: Run linting + run: ./lint.sh diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..9f27187 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,19 @@ +name: Test + +on: [push, pull_request] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12.2' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e .[dev] + - name: Run tests + run: pytest --cov=neusim diff --git a/lint.sh b/lint.sh new file mode 100755 index 0000000..4cc84ea --- /dev/null +++ b/lint.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -e + +echo "Running ruff check..." +ruff check . + +echo "Running ruff format check..." +ruff format --check . + +echo "Running mypy..." +mypy . 
diff --git a/pyproject.toml b/pyproject.toml index 8f18421..bb91909 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,5 +26,19 @@ dependencies = [ [project.optional-dependencies] dev = [ "pytest==8.4.2", - "pytest-cov==7.0.0" -] \ No newline at end of file + "pytest-cov==7.0.0", + "ruff==0.1.13", + "mypy==1.8.0" +] + +[tool.ruff] +target-version = "py312" +select = ["E", "F", "I", "B", "UP"] +ignore = ["E501"] + +[tool.mypy] +python_version = "3.12" +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = false +check_untyped_defs = true \ No newline at end of file From 19e1f38bfecb538f0ad5af7a7727970dac91f572 Mon Sep 17 00:00:00 2001 From: Yuqi Xue Date: Mon, 12 Jan 2026 18:15:40 -0600 Subject: [PATCH 05/12] add CI status display in readme --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 59e436c..76d409b 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # NeuSim: An Open-source Simulator Framework for NPUs +[![CI Lint Status](https://github.com/platformxlab/NeuSim/actions/workflows/lint.yml/badge.svg)](https://github.com/platformxlab/NeuSim/actions/workflows/lint.yml) [![CI Test Status](https://github.com/platformxlab/NeuSim/actions/workflows/test.yml/badge.svg)](https://github.com/platformxlab/NeuSim/actions/workflows/test.yml) + NeuSim is a simulator framework for modeling the performance and power behaviors of neural processing units (NPUs) when running machine learning workloads. 
### 📌 Neural Processing Unit 101 From 0f90d2f57ef8c6566e2d9e848ea5faf863e2ef33 Mon Sep 17 00:00:00 2001 From: Yuqi Xue Date: Mon, 12 Jan 2026 18:29:03 -0600 Subject: [PATCH 06/12] relax linting --- lint.sh | 42 +++++++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/lint.sh b/lint.sh index 4cc84ea..5a185c6 100755 --- a/lint.sh +++ b/lint.sh @@ -1,11 +1,39 @@ #!/bin/bash -set -e +# Note: set -e is not used globally to allow non-blocking checks to run without exiting immediately -echo "Running ruff check..." -ruff check . +FAILURE=0 -echo "Running ruff format check..." -ruff format --check . +echo "Running ruff check (Critical Errors)..." +# Critical errors verify correctness: +# E9: Syntax errors +# F821: Undefined name +# F822: Undefined export in __all__ +# F823: Local variable referenced before assignment +ruff check --select E9,F821,F822,F823 . +if [ $? -ne 0 ]; then + echo "Critical linting errors found!" + FAILURE=1 +fi -echo "Running mypy..." -mypy . +echo "Running ruff check (Warnings - Style/Complexity)..." +# Warnings (non-blocking): +# E: pycodestyle errors +# I: isort imports +# B: flake8-bugbear +# UP: pyupgrade +# F: All Pyflakes (including unused imports/variables) that are not in critical +ruff check --select E,I,B,UP,F --exit-zero . + +echo "Running ruff format check (Non-blocking)..." +ruff format --check . || echo "Ruff format check failed (warning only)" + +echo "Running mypy (Non-blocking)..." +mypy . || echo "Mypy type checking failed (warning only)" + +if [ $FAILURE -ne 0 ]; then + echo "Linting failed due to critical errors." + exit 1 +fi + +echo "Linting passed." 
+exit 0 From f0068de222ea0df74f6e77b24b2f4bb4ad3002d9 Mon Sep 17 00:00:00 2001 From: Yuqi Xue Date: Mon, 12 Jan 2026 18:34:48 -0600 Subject: [PATCH 07/12] fix readme badge url to point to current branch --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 76d409b..83fdd97 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # NeuSim: An Open-source Simulator Framework for NPUs -[![CI Lint Status](https://github.com/platformxlab/NeuSim/actions/workflows/lint.yml/badge.svg)](https://github.com/platformxlab/NeuSim/actions/workflows/lint.yml) [![CI Test Status](https://github.com/platformxlab/NeuSim/actions/workflows/test.yml/badge.svg)](https://github.com/platformxlab/NeuSim/actions/workflows/test.yml) +[![CI Lint Status](https://github.com/platformxlab/NeuSim/actions/workflows/lint.yml/badge.svg?branch=add_unit_tests)](https://github.com/platformxlab/NeuSim/actions/workflows/lint.yml) [![CI Test Status](https://github.com/platformxlab/NeuSim/actions/workflows/test.yml/badge.svg?branch=add_unit_tests)](https://github.com/platformxlab/NeuSim/actions/workflows/test.yml) NeuSim is a simulator framework for modeling the performance and power behaviors of neural processing units (NPUs) when running machine learning workloads. 
From c1a33ec7d5a1546c7cf2f6941f5061dcec943e9e Mon Sep 17 00:00:00 2001 From: Yuqi Xue Date: Wed, 14 Jan 2026 15:42:44 -0600 Subject: [PATCH 08/12] add unit tests for neusim.configs --- .github/workflows/test.yml | 2 +- neusim/configs/models/LLMConfig.py | 2 +- neusim/configs/tests/test_chip_config.py | 97 ++++++++++++++++++++++ neusim/configs/tests/test_dit_config.py | 17 ++++ neusim/configs/tests/test_dlrm_config.py | 23 +++++ neusim/configs/tests/test_gligen_config.py | 18 ++++ neusim/configs/tests/test_llm_config.py | 88 ++++++++++++++++++++ neusim/configs/tests/test_model_config.py | 35 ++++++++ neusim/configs/tests/test_system_config.py | 16 ++++ pyproject.toml | 7 +- 10 files changed, 302 insertions(+), 3 deletions(-) create mode 100644 neusim/configs/tests/test_chip_config.py create mode 100644 neusim/configs/tests/test_dit_config.py create mode 100644 neusim/configs/tests/test_dlrm_config.py create mode 100644 neusim/configs/tests/test_gligen_config.py create mode 100644 neusim/configs/tests/test_llm_config.py create mode 100644 neusim/configs/tests/test_model_config.py create mode 100644 neusim/configs/tests/test_system_config.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9f27187..7ddcbab 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,4 +16,4 @@ jobs: python -m pip install --upgrade pip pip install -e .[dev] - name: Run tests - run: pytest --cov=neusim + run: pytest diff --git a/neusim/configs/models/LLMConfig.py b/neusim/configs/models/LLMConfig.py index bc1ccc5..977639a 100644 --- a/neusim/configs/models/LLMConfig.py +++ b/neusim/configs/models/LLMConfig.py @@ -102,7 +102,7 @@ def num_experts_per_token(self) -> int: def __init__(self, **kwargs): if "moe_d_ff" not in kwargs: # If moe_d_ff is not provided, default to d_ff. 
- kwargs["moe_d_ff"] = kwargs["d_ff"] + kwargs["moe_d_ff"] = kwargs.get("d_ff", self.model_fields["d_ff"].default) super().__init__(**kwargs) def __hash__(self) -> int: diff --git a/neusim/configs/tests/test_chip_config.py b/neusim/configs/tests/test_chip_config.py new file mode 100644 index 0000000..73ac7c2 --- /dev/null +++ b/neusim/configs/tests/test_chip_config.py @@ -0,0 +1,97 @@ +import unittest +from neusim.configs.chips.ChipConfig import ChipConfig + +class TestChipConfig(unittest.TestCase): + def test_chip_config_properties(self): + # Create a default ChipConfig + config = ChipConfig() + + # Test vmem_bw_GBps + # Default: num_vu_ports=6, freq_GHz=1.75 + # Expected: 6 * 1.75 * 8 * 128 * 4 = 43008.0 + self.assertAlmostEqual(config.vmem_bw_GBps, 43008.0) + + # Test static_power_hbm_W + # Default: static_power_hbm_mc_W=10.264041296, static_power_hbm_phy_W=15.396061944 + expected_static_hbm = 10.264041296 + 15.396061944 + self.assertAlmostEqual(config.static_power_hbm_W, expected_static_hbm) + + # Test peak_SA_tflops_per_sec + # Default: num_sa=8, sa_dim=128, freq_GHz=1.75 + # Expected: 8 * (128^2) * 2 * 1.75 * 1e9 / 1e12 = 458.752 + expected_sa_tflops = 8 * (128 ** 2) * 2 * 1.75 / 1000 + self.assertAlmostEqual(config.peak_SA_tflops_per_sec, expected_sa_tflops) + + # Test peak_VU_tflops_per_sec + # Default: num_vu=6, freq_GHz=1.75 + # Expected: 6 * (8 * 128) * 1.75 * 1e9 / 1e12 = 10.752 + expected_vu_tflops = 6 * (8 * 128) * 1.75 / 1000 + self.assertAlmostEqual(config.peak_VU_tflops_per_sec, expected_vu_tflops) + + # Test peak_tflops_per_sec + self.assertAlmostEqual(config.peak_tflops_per_sec, expected_sa_tflops + expected_vu_tflops) + + # Test static_power_sa_W + # Default: static_power_W_per_sa=1.35868996, num_sa=8 + expected_static_sa = 1.35868996 * 8 + self.assertAlmostEqual(config.static_power_sa_W, expected_static_sa) + + # Test static_power_vu_W + # Default: static_power_W_per_vu=0.475076728, num_vu=6 + expected_static_vu = 0.475076728 * 6 +
self.assertAlmostEqual(config.static_power_vu_W, expected_static_vu) + + # Test static_power_vmem_W_per_MB + # Default: static_power_vmem_W=24.21353615, vmem_size_MB=128 + expected_static_vmem_per_mb = 24.21353615 / 128 + self.assertAlmostEqual(config.static_power_vmem_W_per_MB, expected_static_vmem_per_mb) + + # Test static_power_W + # Sum of components + expected_static_total = ( + expected_static_sa + + expected_static_vu + + config.static_power_vmem_W + + config.static_power_ici_W + + expected_static_hbm + + config.static_power_other_W + ) + self.assertAlmostEqual(config.static_power_W, expected_static_total) + + # Test idle_power_W + self.assertAlmostEqual(config.idle_power_W, expected_static_total) + + # Test dynamic_power_sa_W + # Default: dynamic_power_W_per_SA=28.19413333, num_sa=8 + expected_dynamic_sa = 28.19413333 * 8 + self.assertAlmostEqual(config.dynamic_power_sa_W, expected_dynamic_sa) + + # Test dynamic_power_vu_W + # Default: dynamic_power_W_per_VU=2.65216, num_vu=6 + expected_dynamic_vu = 2.65216 * 6 + self.assertAlmostEqual(config.dynamic_power_vu_W, expected_dynamic_vu) + + # Test dynamic_power_hbm_W + # Default: hbm_bw_GBps=2765, dynamic_power_hbm_W_per_GBps=0.01261538462 + expected_dynamic_hbm = 2765 * 0.01261538462 + self.assertAlmostEqual(config.dynamic_power_hbm_W, expected_dynamic_hbm) + + # Test dynamic_power_ici_W + # Default: ici_bw_GBps=200, dynamic_power_ici_W_per_GBps=0.01767315271 + expected_dynamic_ici = 200 * 0.01767315271 + self.assertAlmostEqual(config.dynamic_power_ici_W, expected_dynamic_ici) + + # Test dynamic_power_peak_W + # Sum of components + expected_dynamic_total = ( + expected_dynamic_sa + + expected_dynamic_vu + + config.dynamic_power_vmem_W + + expected_dynamic_ici + + expected_dynamic_hbm + + config.dynamic_power_other_W + ) + self.assertAlmostEqual(config.dynamic_power_peak_W, expected_dynamic_total) + + # Test total_power_peak_W + self.assertAlmostEqual(config.total_power_peak_W, expected_static_total + 
expected_dynamic_total) diff --git a/neusim/configs/tests/test_dit_config.py b/neusim/configs/tests/test_dit_config.py new file mode 100644 index 0000000..7dc3097 --- /dev/null +++ b/neusim/configs/tests/test_dit_config.py @@ -0,0 +1,17 @@ +import unittest +from neusim.configs.models.DiTConfig import DiTConfig + +class TestDiTConfig(unittest.TestCase): + def test_dit_config_instantiation(self): + # Create DiTConfig + config = DiTConfig( + image_width=256, + num_channels=3, + patch_size=16, + num_diffusion_steps=1000 + ) + + self.assertEqual(config.model_type, "dit") + self.assertEqual(config.image_width, 256) + self.assertEqual(config.num_channels, 3) + self.assertEqual(config.input_seqlen, 0) diff --git a/neusim/configs/tests/test_dlrm_config.py b/neusim/configs/tests/test_dlrm_config.py new file mode 100644 index 0000000..3b15384 --- /dev/null +++ b/neusim/configs/tests/test_dlrm_config.py @@ -0,0 +1,23 @@ +import unittest +from neusim.configs.models.DLRMConfig import DLRMConfig, MLPLayerConfig + +class TestDLRMConfig(unittest.TestCase): + def test_dlrm_config_instantiation(self): + # Create DLRMConfig + bottom_mlp = [MLPLayerConfig(in_features=10, out_features=20)] + top_mlp = [MLPLayerConfig(in_features=20, out_features=1)] + + config = DLRMConfig( + model_type="dlrm", + embedding_dim=64, + num_indices_per_lookup=[100, 200], + embedding_table_sizes=[1000, 2000], + num_dense_features=10, + bottom_mlp_config=bottom_mlp, + top_mlp_config=top_mlp + ) + + self.assertEqual(config.model_type, "dlrm") + self.assertEqual(config.embedding_dim, 64) + self.assertEqual(len(config.bottom_mlp_config), 1) + self.assertEqual(config.bottom_mlp_config[0].in_features, 10) diff --git a/neusim/configs/tests/test_gligen_config.py b/neusim/configs/tests/test_gligen_config.py new file mode 100644 index 0000000..b133953 --- /dev/null +++ b/neusim/configs/tests/test_gligen_config.py @@ -0,0 +1,18 @@ +import unittest +from neusim.configs.models.GLIGENConfig import GLIGENConfig + +class 
TestGLIGENConfig(unittest.TestCase): + def test_gligen_config_instantiation(self): + # Create GLIGENConfig + config = GLIGENConfig() + + self.assertEqual(config.model_type, "gligen") + self.assertEqual(config.num_diffusion_steps, 1) + + # Test nested configs default values + self.assertEqual(config.fourier_embedder_config.num_freqs, 64) + self.assertEqual(config.text_embedder_config.d_model, 512) + self.assertEqual(config.image_embedder_config.d_model, 1024) + self.assertEqual(config.spatial_condition_embedder_config.stem.in_channels, 3) + self.assertEqual(config.grounding_input_config.text.input_seqlen, 512) + self.assertEqual(config.unet_config.model_channels, 320) diff --git a/neusim/configs/tests/test_llm_config.py b/neusim/configs/tests/test_llm_config.py new file mode 100644 index 0000000..9d5f6f2 --- /dev/null +++ b/neusim/configs/tests/test_llm_config.py @@ -0,0 +1,88 @@ +import unittest +from neusim.configs.models.LLMConfig import LLMConfig, MoELLMConfig, DeepSeekConfig + +class TestLLMConfig(unittest.TestCase): + def test_llm_config(self): + # Test default init + config = LLMConfig() + # Default: num_heads=64, num_kv_heads should default to num_heads if not provided + self.assertEqual(config.num_heads, 64) + self.assertEqual(config.num_kv_heads, 64) + + # Test valid init with num_kv_heads + config_mqa = LLMConfig(num_kv_heads=1) + self.assertEqual(config_mqa.num_kv_heads, 1) + + # Test hash + config1 = LLMConfig() + config2 = LLMConfig() + self.assertEqual(hash(config1), hash(config2)) + +class TestMoELLMConfig(unittest.TestCase): + def test_moe_llm_config(self): + # Test default init + config = MoELLMConfig() + # Default: d_ff=11008, moe_d_ff should default to d_ff + self.assertEqual(config.d_ff, 11008) + self.assertEqual(config.moe_d_ff, 11008) + + # Test custom moe_d_ff + config_custom = MoELLMConfig(moe_d_ff=2048) + self.assertEqual(config_custom.moe_d_ff, 2048) + + # Test expert_tensor_parallelism_degree + # dp=1, tp=1, ep=1 -> 1*1 // 1 = 1 + 
self.assertEqual(config.expert_tensor_parallelism_degree, 1) + + # dp=2, tp=2, ep=2 -> 2*2 // 2 = 2 + config_parallel = MoELLMConfig( + data_parallelism_degree=2, + tensor_parallelism_degree=2, + expert_parallelism_degree=2 + ) + self.assertEqual(config_parallel.expert_tensor_parallelism_degree, 2) + + # Test num_expert_tensor_parallel_axes + # ndp=1, ntp=1, nep=1 -> 1+1-1 = 1 + self.assertEqual(config.num_expert_tensor_parallel_axes, 1) + + # ndp=2, ntp=2, nep=2 -> 2+2-2 = 2 + config_axes = MoELLMConfig( + num_data_parallel_axes=2, + num_tensor_parallel_axes=2, + num_expert_parallel_axes=2 + ) + self.assertEqual(config_axes.num_expert_tensor_parallel_axes, 2) + + # Test num_experts_per_token + # shared=1, routed=8 -> 9 + self.assertEqual(config.num_experts_per_token, 9) + + # Test hash + config1 = MoELLMConfig() + config2 = MoELLMConfig() + self.assertEqual(hash(config1), hash(config2)) + +class TestDeepSeekConfig(unittest.TestCase): + def test_deepseek_config(self): + config = DeepSeekConfig( + kv_lora_rank=16, + q_lora_rank=32, + qk_rope_head_dim=64, + qk_nope_head_dim=64, + v_head_dim=128 + ) + + # Test qk_head_dim + # 64 + 64 = 128 + self.assertEqual(config.qk_head_dim, 128) + + # Test hash + config2 = DeepSeekConfig( + kv_lora_rank=16, + q_lora_rank=32, + qk_rope_head_dim=64, + qk_nope_head_dim=64, + v_head_dim=128 + ) + self.assertEqual(hash(config), hash(config2)) diff --git a/neusim/configs/tests/test_model_config.py b/neusim/configs/tests/test_model_config.py new file mode 100644 index 0000000..2bcfa90 --- /dev/null +++ b/neusim/configs/tests/test_model_config.py @@ -0,0 +1,35 @@ +import unittest +from neusim.configs.models.ModelConfig import ModelConfig + +class TestModelConfig(unittest.TestCase): + def test_model_config_hash(self): + # Create two identical ModelConfigs + config1 = ModelConfig( + model_type="llm", + model_name="test_model", + name="test_chip", + global_batch_size=8, + num_chips=4 + ) + config2 = ModelConfig( + model_type="llm", + 
model_name="test_model", + name="test_chip", + global_batch_size=8, + num_chips=4 + ) + + # Check if their hash is the same + self.assertEqual(hash(config1), hash(config2)) + + # Create a different ModelConfig + config3 = ModelConfig( + model_type="llm", + model_name="test_model_diff", + name="test_chip", + global_batch_size=8, + num_chips=4 + ) + + # Check if their hash is different (likely) + self.assertNotEqual(hash(config1), hash(config3)) diff --git a/neusim/configs/tests/test_system_config.py b/neusim/configs/tests/test_system_config.py new file mode 100644 index 0000000..07c06e6 --- /dev/null +++ b/neusim/configs/tests/test_system_config.py @@ -0,0 +1,16 @@ +import unittest +from neusim.configs.systems.SystemConfig import SystemConfig + +class TestSystemConfig(unittest.TestCase): + def test_system_config_instantiation(self): + # Create a default SystemConfig + config = SystemConfig() + + # Test default values + self.assertEqual(config.PUE, 1.1) + self.assertEqual(config.carbon_intensity_kgCO2_per_kWh, 0.5) + + # Test custom values + config_custom = SystemConfig(PUE=1.2, carbon_intensity_kgCO2_per_kWh=0.6) + self.assertEqual(config_custom.PUE, 1.2) + self.assertEqual(config_custom.carbon_intensity_kgCO2_per_kWh, 0.6) diff --git a/pyproject.toml b/pyproject.toml index bb91909..13896a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,4 +41,9 @@ python_version = "3.12" warn_return_any = true warn_unused_configs = true disallow_untyped_defs = false -check_untyped_defs = true \ No newline at end of file +check_untyped_defs = true + +[tool.pytest.ini_options] +norecursedirs = ["results", "dist", "build", ".git", "*.egg-info"] +testpaths = ["neusim"] +addopts = "--cov=neusim" From 11dc2a626d8934638a525f2960825575f628bcd7 Mon Sep 17 00:00:00 2001 From: Yuqi Xue Date: Wed, 14 Jan 2026 16:21:42 -0600 Subject: [PATCH 09/12] add unit tests for xla_hlo_structures --- .../tests/test_xla_hlo_structures.py | 227 ++++++++++++++++++ 1 file changed, 227 insertions(+) 
create mode 100644 neusim/xla_hlo_parser/tests/test_xla_hlo_structures.py diff --git a/neusim/xla_hlo_parser/tests/test_xla_hlo_structures.py b/neusim/xla_hlo_parser/tests/test_xla_hlo_structures.py new file mode 100644 index 0000000..7ce490a --- /dev/null +++ b/neusim/xla_hlo_parser/tests/test_xla_hlo_structures.py @@ -0,0 +1,227 @@ +import unittest +from neusim.xla_hlo_parser.xla_hlo_structures import ( + HLOAxis, HLOType, HLOTuple, HLOTensorType, HLOValue, + HLOInstruction, HLOFunction, HLOModule, HLOModel, + HLOFusedOpInstruction, isFusedOp +) + +class TestXLAHLOStructures(unittest.TestCase): + def test_hlo_axis(self): + axis = HLOAxis(name="batch", index=0, size=32) + self.assertEqual(axis.name, "batch") + self.assertEqual(axis.index, 0) + self.assertEqual(axis.size, 32) + self.assertEqual(axis.tile_size, 32) + self.assertEqual(axis.data_type, "DT_FLOAT") + + def test_hlo_type(self): + t = HLOType(type="tensor", raw_string="f32[32,128]") + self.assertEqual(t.type, "tensor") + self.assertEqual(t.raw_string, "f32[32,128]") + + with self.assertRaises(NotImplementedError): + t.is_scalar() + + def test_hlo_tuple(self): + t1 = HLOType(type="tensor") + t2 = HLOType(type="tensor") + tuple_type = HLOTuple(type_list=[t1, t2], type_str="(f32[], f32[])") + self.assertEqual(tuple_type.type, "tuple") + self.assertFalse(tuple_type.is_scalar()) + self.assertEqual(len(tuple_type.type_list), 2) + + def test_hlo_tensor_type(self): + # Scalar + scalar = HLOTensorType(shape=[1]) + self.assertTrue(scalar.is_scalar()) + + # Not scalar + tensor = HLOTensorType(shape=[32, 128]) + self.assertFalse(tensor.is_scalar()) + self.assertEqual(tensor.shape, [32, 128]) + + def test_hlo_value(self): + t = HLOTensorType(shape=[1]) + val = HLOValue(type=t, name="val1", value=3.14) + self.assertEqual(val.name, "val1") + self.assertEqual(val.value, 3.14) + self.assertTrue(val.is_scalar) + + def test_hlo_instruction_dim_labels(self): + # Conv/Einsum case + res = HLOValue(name="res") + # Standard 
Conv2D pattern: batch, feature_in, spatial... -> batch, feature_out, spatial... + metadata = {"dim_labels": "b01f_01io->b01f"} + instr = HLOInstruction(result=res, opcode="convolution", metadata=metadata) + + # Check parsed axes + self.assertTrue(hasattr(instr, "input_axes")) + self.assertTrue(hasattr(instr, "output_axes")) + + # lhs: b, 0, 1, f -> batch, spatial0, spatial1, input_channel + lhs_axes = instr.input_axes[0] + self.assertEqual(lhs_axes[0].name, "batch") + self.assertEqual(lhs_axes[1].name, "spatial0") + self.assertEqual(lhs_axes[2].name, "spatial1") + self.assertEqual(lhs_axes[3].name, "input_channel") + + # rhs: 0, 1, i, o -> spatial0, spatial1, input_channel, output_channel + rhs_axes = instr.input_axes[1] + self.assertEqual(rhs_axes[0].name, "spatial0") + self.assertEqual(rhs_axes[2].name, "input_channel") + self.assertEqual(rhs_axes[3].name, "output_channel") + + # out: b, 0, 1, f -> batch, spatial0, spatial1, output_channel + out_axes = instr.output_axes + self.assertEqual(out_axes[0].name, "batch") + self.assertEqual(out_axes[3].name, "output_channel") + + def test_hlo_instruction_conv_config(self): + res = HLOValue(name="res") + metadata = {"window": "{size=3x3 stride=2x2 pad=1_1x1_1}"} + instr = HLOInstruction(result=res, opcode="convolution", metadata=metadata) + + self.assertTrue(hasattr(instr, "convolution_window")) + self.assertEqual(instr.convolution_window["size"], [3, 3]) + self.assertEqual(instr.convolution_window["stride"], [2, 2]) + self.assertEqual(instr.convolution_window["pad"], "1_1x1_1") + self.assertTrue(instr.isConvolution()) + + def test_hlo_instruction_conv_config_complex(self): + # Complex window with dilations + res = HLOValue(name="res") + metadata = {"window": "{size=3x3 stride=2x2 pad=0_1x0_1 lhs_dilation=2x2 rhs_dilation=1x1}"} + instr = HLOInstruction(result=res, opcode="convolution", metadata=metadata) + + self.assertTrue(hasattr(instr, "convolution_window")) + self.assertEqual(instr.convolution_window["size"], [3, 
3]) + self.assertEqual(instr.convolution_window["stride"], [2, 2]) + self.assertEqual(instr.convolution_window["pad"], "0_1x0_1") + # Ensure it doesn't crash on extra fields like dilation + self.assertTrue(instr.isConvolution()) + + def test_hlo_function(self): + # Instructions + val1 = HLOValue(name="v1") + instr1 = HLOInstruction(result=val1, opcode="add") + + val2 = HLOValue(name="v2") + instr2 = HLOInstruction(result=val2, opcode="multiply", is_root=True) + + func = HLOFunction(name="main_func", instructions=[instr1, instr2]) + + self.assertEqual(func.name, "main_func") + self.assertEqual(func.ROOT_instruction, instr2) + self.assertEqual(func.ROOT_value, val2) + + # Query by name + self.assertEqual(func.getInstructionByName("v1"), instr1) + self.assertIsNone(func.getInstructionByName("v_not_exist")) + + # Contains opcode + has_add, i_add = func.containsOpcode("add") + self.assertTrue(has_add) + self.assertEqual(i_add, instr1) + + has_conv, _ = func.containsOpcode("convolution") + self.assertFalse(has_conv) + + def test_hlo_module(self): + func1 = HLOFunction(name="func1") + func_entry = HLOFunction(name="main", is_entry=True) + + module = HLOModule(name="module1", functions=[func1, func_entry]) + + self.assertEqual(module.ENTRY, func_entry) + self.assertEqual(len(module.getHLOFunctions()), 2) + + self.assertEqual(module.getFunctionByName("func1"), func1) + self.assertIsNone(module.getFunctionByName("func_missing")) + + def test_hlo_model(self): + module1 = HLOModule(name="module1") + model = HLOModel(name="model", modules=[module1]) + + # Search module + matches = model.searchModuleByName("module1") + self.assertEqual(len(matches), 1) + self.assertEqual(matches[0], module1) + + self.assertEqual(len(model.searchModuleByName("missing")), 0) + + def test_hlo_fused_op_instruction(self): + target_func = HLOFunction(name="fused_computation.1") + # To make it a convolution fusion, it needs "fused_computation" in name AND contain "convolution" op + # Let's add a conv 
op to target func + mock_conv_res = HLOValue(name="conv_res") + conv_op = HLOInstruction(result=mock_conv_res, opcode="convolution") + target_func.instructions.append(conv_op) + + res = HLOValue(name="res") + fused_op = HLOFusedOpInstruction( + result=res, opcode="fusion", target_name="fused_computation.1", target_func=target_func + ) + + self.assertEqual(fused_op.fusion_type(), "fusion") + self.assertTrue(fused_op.isConvolutionFusion()) + self.assertTrue(fused_op.isConvolution()) + + def test_hlo_tuple_init_none(self): + t = HLOTuple(type_list=None) + self.assertEqual(len(t.type_list), 0) + + def test_hlo_instruction_parse_dim_labels_non_conv(self): + res = HLOValue(name="res") + # Should return early and not raise error or set input_axes + instr = HLOInstruction(result=res, opcode="add", metadata={"dim_labels": "..."}) + self.assertFalse(hasattr(instr, "input_axes")) + + def test_hlo_function_init_default(self): + func = HLOFunction(name="f") + self.assertEqual(len(func.parameters), 0) + self.assertEqual(len(func.instructions), 0) + + def test_hlo_function_root_not_found(self): + func = HLOFunction(name="f", instructions=[ + HLOInstruction(result=HLOValue(name="v"), opcode="add") + ]) + with self.assertRaises(AssertionError): + _ = func.ROOT_instruction + + def test_hlo_module_add_get(self): + module = HLOModule(name="mod", properties="props") + func = HLOFunction(name="f") + module.addHLOFunction(func) + + self.assertEqual(module.getHLOFunctions(), [func]) + + # getInstructionByName + res = HLOValue(name="res") + instr = HLOInstruction(result=res, opcode="add") + func.instructions.append(instr) + + self.assertEqual(module.getInstructionByName("res"), instr) + self.assertIsNone(module.getInstructionByName("missing")) + + def test_hlo_model_get_func(self): + module = HLOModule(name="mod") + func = HLOFunction(name="f") + module.addHLOFunction(func) + model = HLOModel(name="model", modules=[module]) + + matches = model.getFunctionByName("f") + self.assertIn(module, 
matches) + self.assertEqual(matches[module], func) + + self.assertIsNone(model.getFunctionByName("missing")) + + def test_is_fused_op(self): + res = HLOValue(name="res") + + # Not fused + instr = HLOInstruction(result=res, opcode="add") + self.assertFalse(isFusedOp(instr)) + + # Fused + fused = HLOFusedOpInstruction(result=res, opcode="fusion") + self.assertTrue(isFusedOp(fused)) From 54b67c901d43615c851da429c031cb18628499c8 Mon Sep 17 00:00:00 2001 From: Yuqi Xue Date: Wed, 14 Jan 2026 16:43:22 -0600 Subject: [PATCH 10/12] add unit tests for neusim.npusim.Operator --- neusim/npusim/backend/util.py | 1 - neusim/npusim/frontend/tests/test_operator.py | 128 ++++++++++++++++++ 2 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 neusim/npusim/frontend/tests/test_operator.py diff --git a/neusim/npusim/backend/util.py b/neusim/npusim/backend/util.py index 537d269..170fb94 100644 --- a/neusim/npusim/backend/util.py +++ b/neusim/npusim/backend/util.py @@ -7,7 +7,6 @@ from typing import Any, Sequence import neusim.npusim.frontend.Operator as Operator -import neusim.xla_hlo_parser.xla_hlo_trace_parser as hlo_parser import neusim.xla_hlo_parser.xla_hlo_structures as hlo_struct diff --git a/neusim/npusim/frontend/tests/test_operator.py b/neusim/npusim/frontend/tests/test_operator.py new file mode 100644 index 0000000..b5e62e4 --- /dev/null +++ b/neusim/npusim/frontend/tests/test_operator.py @@ -0,0 +1,128 @@ +import unittest +from neusim.npusim.frontend.Operator import ( + Operator, EinsumOperator, Conv2DOperator, FlashAttentionOperator, + OperatorStatistics, EinsumStatistics, FlashAttentionStatistics, + Axis, Tensor, DVFSPolicy, OpcodeType, OpType, + from_csv_dict, to_csv_dict +) + +class TestOperator(unittest.TestCase): + def test_enums(self): + # DVFSPolicy + self.assertEqual(DVFSPolicy.from_str("Ideal"), DVFSPolicy.IDEAL) + self.assertEqual(DVFSPolicy.from_str(None), DVFSPolicy.NONE) + # Verify ValueError is raised for unknown strings, as per implicit 
behavior of Enum(value) + with self.assertRaises(ValueError): + DVFSPolicy.from_str("Unknown") + + # OpcodeType + self.assertEqual(OpcodeType.from_opcode("Conv2D"), OpcodeType.CONV2D) + self.assertEqual(OpcodeType.from_opcode("Einsum"), OpcodeType.EINSUM) + self.assertEqual(OpcodeType.from_opcode("UnknownOp"), OpcodeType.OTHER) + + # OpType + self.assertEqual(OpType.from_string("MXU"), OpType.MXU) + self.assertEqual(OpType.from_string("Unknown"), OpType.OTHER) + + def test_axis(self): + axis = Axis(name="ax", size=1024, parallelism=[2, 4], tile_size=64) + self.assertEqual(axis.num_shards, 8) + self.assertEqual(axis.shard_size, 128) # 1024 / 8 + self.assertEqual(axis.num_tiles, 2) # 128 / 64 + + # Default + axis_def = Axis(size=100) + self.assertEqual(axis_def.num_shards, 1) + self.assertEqual(axis_def.tile_size, 100) + + def test_operator_statistics(self): + stats = OperatorStatistics() + stats.execution_time_ns = 1000 + stats.sa_time_ns = 500 + stats.vu_time_ns = 600 + + self.assertEqual(stats.compute_time_ns, 600) + + stats.memory_traffic_bytes = 1024**3 # 1 GB + # 1 GB / 1000 ns = 1 GB / 1e-6 s = 1e6 GB/s. + # (1000 ns is 1 us.) + # hbm_bw_GBps = bytes / 1024^3 / time(ns) * 1e9 + # = 1 * 1e9 / 1000 = 1e6 GBps.
+ self.assertAlmostEqual(stats.hbm_bw_GBps, 1000000.0) + + stats.static_energy_sa_J = 1.0 + stats.dynamic_energy_sa_J = 0.5 + self.assertEqual(stats.static_energy_J, 1.0) # others 0 + self.assertEqual(stats.dynamic_energy_J, 0.5) + self.assertEqual(stats.total_energy_J, 1.5) + # Power = Energy / Time = 1.5 / 1000ns * 1e9 = 1.5 * 1e6 W + self.assertAlmostEqual(stats.total_power_W, 1.5e6) + + def test_operator_csv(self): + op = Operator(name="test_op", opcode="Add") + op.stats.execution_time_ns = 100 + op.stats.count = 5 + op.op_type = OpType.VPU + + csv_dict = op.to_csv_dict() + self.assertEqual(csv_dict["Name"], "test_op") + self.assertEqual(csv_dict["Op Code"], "Add") + self.assertEqual(csv_dict["Execution time"], 100) + self.assertEqual(csv_dict["OpType"], "VPU") + + # Round trip + new_op = from_csv_dict(csv_dict) + self.assertEqual(new_op.name, "test_op") + self.assertEqual(new_op.stats.execution_time_ns, 100) + self.assertEqual(new_op.op_type, OpType.VPU) + + def test_einsum_operator_csv(self): + op = EinsumOperator(name="matmul", opcode="Einsum") + op.stats.dim_labels_str = "mk,kn->mn" + op.stats.parsed_op_type = "Einsum" # Helper to ensure correct type identification + + csv_dict = op.to_csv_dict() + self.assertEqual(csv_dict["dim_labels"], "mk,kn->mn") + + # Round trip via factory + # Factory relies on "parsed_op_type" or "Op Code" + csv_dict["Op Code"] = "Einsum" + new_op = from_csv_dict(csv_dict) + self.assertIsInstance(new_op, EinsumOperator) + self.assertEqual(new_op.stats.dim_labels_str, "mk,kn->mn") + + def test_conv2d_operator_csv(self): + op = Conv2DOperator(name="conv", opcode="Conv2D") + op.stats.num_sa_ops = 100 + op.stats.parsed_op_type = "Conv2D" + + csv_dict = op.to_csv_dict() + self.assertEqual(csv_dict["num_mxu_ops"], 100) + + csv_dict["Op Code"] = "Conv2D" + new_op = from_csv_dict(csv_dict) + self.assertIsInstance(new_op, Conv2DOperator) + self.assertEqual(new_op.stats.num_sa_ops, 100) + + def test_flash_attention_operator_csv(self): + op = 
FlashAttentionOperator(name="fa", opcode="FlashAttention") + op.stats.vu_softmax_time_ns = 50 + op.stats.parsed_op_type = "FlashAttention" + + csv_dict = op.to_csv_dict() + self.assertEqual(csv_dict["vu_softmax_time_ns"], 50) + + csv_dict["Op Code"] = "FlashAttention" + new_op = from_csv_dict(csv_dict) + self.assertIsInstance(new_op, FlashAttentionOperator) + self.assertEqual(new_op.stats.vu_softmax_time_ns, 50) + + def test_dvfs_csv_parsing(self): + # Test that DVFS fields are parsed correctly + op_dict = Operator().to_csv_dict() + op_dict["DVFS SA Policy"] = "Ideal" + op_dict["DVFS SA Voltage (V)"] = 0.8 + + op = from_csv_dict(op_dict) + self.assertEqual(op.dvfs_sa.policy, DVFSPolicy.IDEAL) + self.assertEqual(op.dvfs_sa.voltage_V, 0.8) From 926605cfe8741a2e227ba3e1046616783cdcdd54 Mon Sep 17 00:00:00 2001 From: Yuqi Xue Date: Wed, 14 Jan 2026 17:29:08 -0600 Subject: [PATCH 11/12] add unit tests for neusim.npusim.query_results_helper_lib --- .../inference-v2.csv | 18 + .../inference-v2.json | 139 ++++ .../inference-v2_chip0.csv | 18 + .../inference-v2_chip0.json | 137 ++++ .../inference-v2_config_stats.json | 102 +++ .../inference-v3.csv | 18 + .../inference-v3.json | 139 ++++ .../inference-v3_chip0.csv | 18 + .../inference-v3_chip0.json | 137 ++++ .../inference-v3_config_stats.json | 102 +++ .../inference-v4.csv | 18 + .../inference-v4.json | 139 ++++ .../inference-v4_chip0.csv | 18 + .../inference-v4_chip0.json | 137 ++++ .../inference-v4_config_stats.json | 102 +++ .../inference-v5p.csv | 18 + .../inference-v5p.json | 139 ++++ .../inference-v5p_chip0.csv | 18 + .../inference-v5p_chip0.json | 137 ++++ .../inference-v5p_config_stats.json | 102 +++ .../inference-v6p.csv | 18 + .../inference-v6p.json | 139 ++++ .../inference-v6p_chip0.csv | 18 + .../inference-v6p_chip0.json | 137 ++++ .../inference-v6p_config_stats.json | 102 +++ .../inference-v2.csv | 635 ++++++++++++++++++ .../inference-v2.json | 184 +++++ .../inference-v3.csv | 635 ++++++++++++++++++ 
.../inference-v3.json | 184 +++++ .../inference-v4.csv | 635 ++++++++++++++++++ .../inference-v4.json | 184 +++++ .../inference-v5p.csv | 635 ++++++++++++++++++ .../inference-v5p.json | 184 +++++ .../inference-v6p.csv | 635 ++++++++++++++++++ .../inference-v6p.json | 184 +++++ .../inference-v2.csv | 29 + .../inference-v2_decode.csv | 17 + .../inference-v2_decode.json | 98 +++ .../inference-v2_prefill.csv | 13 + .../inference-v2_prefill.json | 97 +++ .../inference-v3.csv | 29 + .../inference-v3_decode.csv | 17 + .../inference-v3_decode.json | 98 +++ .../inference-v3_prefill.csv | 13 + .../inference-v3_prefill.json | 97 +++ .../inference-v4.csv | 29 + .../inference-v4_decode.csv | 17 + .../inference-v4_decode.json | 98 +++ .../inference-v4_prefill.csv | 13 + .../inference-v4_prefill.json | 97 +++ .../inference-v5p.csv | 29 + .../inference-v5p_decode.csv | 17 + .../inference-v5p_decode.json | 98 +++ .../inference-v5p_prefill.csv | 13 + .../inference-v5p_prefill.json | 97 +++ .../inference-v6p.csv | 29 + .../inference-v6p_decode.csv | 17 + .../inference-v6p_decode.json | 98 +++ .../inference-v6p_prefill.csv | 13 + .../inference-v6p_prefill.json | 97 +++ .../training-v5p.csv | 46 ++ .../training-v5p.json | 101 +++ .../training-v6p.csv | 46 ++ .../training-v6p.json | 101 +++ .../inference-v6p_decode.json | 98 +++ .../inference-v2.csv | 39 ++ .../inference-v2_decode.csv | 22 + .../inference-v2_decode.json | 98 +++ .../inference-v2_prefill.csv | 18 + .../inference-v2_prefill.json | 97 +++ .../inference-v3.csv | 39 ++ .../inference-v3_decode.csv | 22 + .../inference-v3_decode.json | 98 +++ .../inference-v3_prefill.csv | 18 + .../inference-v3_prefill.json | 97 +++ .../inference-v4.csv | 39 ++ .../inference-v4_decode.csv | 22 + .../inference-v4_decode.json | 98 +++ .../inference-v4_prefill.csv | 18 + .../inference-v4_prefill.json | 97 +++ .../inference-v5p.csv | 39 ++ .../inference-v5p_decode.csv | 22 + .../inference-v5p_decode.json | 98 +++ .../inference-v5p_prefill.csv | 18 
+ .../inference-v5p_prefill.json | 97 +++ .../inference-v6p.csv | 39 ++ .../inference-v6p_decode.csv | 22 + .../inference-v6p_decode.json | 98 +++ .../inference-v6p_prefill.csv | 18 + .../inference-v6p_prefill.json | 97 +++ .../training-v3.csv | 54 ++ .../training-v3.json | 101 +++ .../training-v4.csv | 54 ++ .../training-v4.json | 101 +++ .../training-v5p.csv | 54 ++ .../training-v5p.json | 101 +++ .../training-v6p.csv | 54 ++ .../training-v6p.json | 101 +++ .../tests/test_query_results_helper.py | 389 +++++++++++ 99 files changed, 10206 insertions(+) create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_chip0.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_chip0.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_config_stats.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_chip0.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_chip0.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_config_stats.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv create mode 100644 
neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_chip0.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_chip0.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_config_stats.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_chip0.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_chip0.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_config_stats.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_chip0.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_chip0.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_config_stats.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.json 
create mode 100644 neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_decode.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_decode.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_prefill.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_prefill.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_decode.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_decode.json 
create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_prefill.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_prefill.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_decode.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_decode.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_prefill.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_prefill.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_decode.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_decode.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_prefill.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_prefill.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_decode.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_decode.json create mode 100644 
neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_prefill.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_prefill.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v5p.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v5p.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v6p.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v6p.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs32/inference-v6p_decode.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_decode.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_decode.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_prefill.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_prefill.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_decode.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_decode.json create mode 100644 
neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_prefill.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_prefill.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_decode.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_decode.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_prefill.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_prefill.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_decode.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_decode.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_prefill.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_prefill.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_decode.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_decode.json create mode 100644 
neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_prefill.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_prefill.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v3.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v3.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v4.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v4.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v5p.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v5p.json create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v6p.csv create mode 100644 neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v6p.json create mode 100644 neusim/npusim/frontend/tests/test_query_results_helper.py diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv new file mode 100644 index 0000000..355b00a --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv @@ -0,0 +1,18 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose 
time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,BottomMLP_0_einsum,"XlaEinsum(a=1x512,b=512x256,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",BottomMLP0einsumbottommlp0einsum,MXU,1,Compute,915,915,500,0,0,0,0,0,0,0,0,915,45,0,0,263680,"DT_BFLOAT16:[1,512],DT_BFLOAT16:[512,256]","[DT_BFLOAT16:(1,256)]",262144,BottomMLP0einsumbottommlp0einsum,Einsum,262144,[],Einsum,"BI,IO->BO","[[1, 512], [512, 256], [1, 
256]]",1,263680,8,1,1,256,512,0,915,263680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.286496174863388,268.3837557099556,0.0058777609404417515,0.4473062595165927,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,BottomMLP_1_einsum,"XlaEinsum(a=1x256,b=256x64,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",BottomMLP1einsumbottommlp1einsum,MXU,1,Memory,500,366,500,0,0,0,0,0,0,0,0,366,11,0,0,33408,"DT_BFLOAT16:[1,256],DT_BFLOAT16:[256,64]","[DT_BFLOAT16:(1,64)]",32768,BottomMLP1einsumbottommlp1einsum,Einsum,32768,[],Einsum,"BI,IO->BO","[[1, 256], [256, 64], [1, 64]]",1,33408,2,1,1,64,256,0,366,33408,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,98,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,62.22724914550781,0.0013445378151260505,0.10371208190917969,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,EmbeddingIndicesAllToAll-6,AllToAll(1x8x100->1x8x100),EmbeddingIndicesAllToAll6EmbeddingIndicesAllToAll,ICINoCompute,1,ICI/NVLink,23310,0,500,23310,3200,3200,0,0,0,0,0,0,0,0,0,6400,"DT_BFLOAT16:[1,8,100]","[DT_BFLOAT16:(1,8,100)]",0,EmbeddingIndicesAllToAll6EmbeddingIndicesAllToAll,AllToAll,0,[],AllToAll,,,,,0,,,,,0,0,6400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.25570418179060755,0.0,0.0004261736363176793,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,EmbeddingBag7,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 
0],type=DT_FLOAT)",embedding_bag_7,VPU,1,Memory,500,6,500,0,0,0,0,0,0,0,0,0,6,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_7,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,6,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.00892857142857143,0.08275111516316731,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,EmbeddingBag14,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_14,VPU,1,Memory,500,6,500,0,0,0,0,0,0,0,0,0,6,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_14,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,6,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.00892857142857143,0.08275111516316731,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,EmbeddingBag28,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_28,VPU,1,Memory,500,6,500,0,0,0,0,0,0,0,0,0,6,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_28,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,6,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.00892857142857143,0.08275111516316731,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +56,EmbeddingBag56,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 
0],type=DT_FLOAT)",embedding_bag_56,VPU,1,Memory,500,6,500,0,0,0,0,0,0,0,0,0,6,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_56,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,6,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.00892857142857143,0.08275111516316731,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +112,EmbeddingBag112,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_112,VPU,1,Memory,500,6,500,0,0,0,0,0,0,0,0,0,6,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_112,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,6,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.00892857142857143,0.08275111516316731,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +224,EmbeddingBag224,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_224,VPU,1,Memory,500,6,500,0,0,0,0,0,0,0,0,0,6,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_224,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,6,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.00892857142857143,0.08275111516316731,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +448,EmbeddingBag448,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 
0],type=DT_FLOAT)",embedding_bag_448,VPU,1,Memory,500,6,500,0,0,0,0,0,0,0,0,0,6,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_448,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,6,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.00892857142857143,0.08275111516316731,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +896,EmbeddingBag896,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_896,VPU,1,Memory,500,6,500,0,0,0,0,0,0,0,0,0,6,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_896,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,6,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.00892857142857143,0.08275111516316731,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +903,EmbeddingAllToAll-903,AllToAll(1x8x100x64->1x8x100x64),EmbeddingAllToAll903EmbeddingAllToAll,ICINoCompute,1,ICI/NVLink,23310,0,636,23310,204800,204800,0,0,0,0,0,0,0,0,0,409600,"DT_BFLOAT16:[1,8,100,64]","[DT_BFLOAT16:(1,8,100,64)]",0,EmbeddingAllToAll903EmbeddingAllToAll,AllToAll,0,[],AllToAll,,,,,0,,,,,0,0,409600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,16.365067634598883,0.0,0.027275112724331474,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+904,DotInteraction_einsum,"XlaEinsum(a=1x9x64,b=1x64x9,eq=BNE;BEN->BNN,memory_placements=0_0_0,type=DT_BFLOAT16)",DotInteractioneinsumDotInteractioneinsum,MXU,1,Compute,1006,1006,500,0,0,0,0,0,0,0,0,1006,51,0,0,2322,"DT_BFLOAT16:[1,9,64],DT_BFLOAT16:[1,64,9]","[DT_BFLOAT16:(1,9,9)]",1152,DotInteractioneinsumDotInteractioneinsum,Einsum,1152,[],Einsum,"BNE,BEN->BNN","[[1, 9, 64], [1, 64, 9], [1, 9, 9]]",1,258,9,9,1,1,64,0,1006,2322,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,178,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0011451292246520875,2.1496332189434804,2.3493492824565215e-05,0.0035827220315724673,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +1808,TopMLP_init_einsum,"XlaEinsum(a=1x128,b=128x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLPiniteinsumtopmlpiniteinsum,MXU,1,Compute,915,915,500,0,0,0,0,0,0,0,0,915,45,0,0,264448,"DT_BFLOAT16:[1,128],DT_BFLOAT16:[128,1024]","[DT_BFLOAT16:(1,1024)]",262144,TopMLPiniteinsumtopmlpiniteinsum,Einsum,262144,[],Einsum,"BI,IO->BO","[[1, 128], [128, 1024], [1, 1024]]",1,264448,8,1,1,1024,128,0,915,264448,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.286496174863388,269.165455969305,0.0058777609404417515,0.448609093282175,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3616,TopMLP_0_einsum,"XlaEinsum(a=1x1024,b=1024x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP0einsumtopmlp0einsum,MXU,1,Compute,6035,6035,3262,0,0,0,0,0,0,0,0,6035,365,0,0,2101248,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1024]","[DT_BFLOAT16:(1,1024)]",2097152,TopMLP0einsumtopmlp0einsum,Einsum,2097152,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1024], [1, 
1024]]",1,2101248,64,1,1,1024,1024,0,6035,2101248,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1095,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.34749826014913004,324.26506996944903,0.007129280875564807,0.5404417832824151,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7232,TopMLP_1_einsum,"XlaEinsum(a=1x1024,b=1024x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP1einsumtopmlp1einsum,MXU,1,Compute,6035,6035,3262,0,0,0,0,0,0,0,0,6035,365,0,0,2101248,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1024]","[DT_BFLOAT16:(1,1024)]",2097152,TopMLP1einsumtopmlp1einsum,Einsum,2097152,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1024], [1, 1024]]",1,2101248,64,1,1,1024,1024,0,6035,2101248,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1095,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.34749826014913004,324.26506996944903,0.007129280875564807,0.5404417832824151,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14464,TopMLP_2_einsum,"XlaEinsum(a=1x1024,b=1024x1,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP2einsumtopmlp2einsum,MXU,1,Compute,915,915,500,0,0,0,0,0,0,0,0,915,45,0,0,4098,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1]","[DT_BFLOAT16:(1,1)]",2048,TopMLP2einsumtopmlp2einsum,Einsum,2048,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1], [1, 1]]",1,4098,8,1,1,1,1024,0,915,4098,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.002238251366120219,4.171103727622111,4.5920007347201183e-05,0.006951839546036851,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.json b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.json new file mode 100644 index 0000000..2839564 --- /dev/null +++ 
b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.json @@ -0,0 +1,139 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 66941, + "overlapped_compute_time_non_pp_ns": 8938, + "compute_only_time_non_pp_ns": 7297, + "memory_only_time_non_pp_ns": 4086, + "ici_bound_time_non_pp_ns": 46620, + "total_execution_time_chip_ns": 66941, + "overlapped_compute_time_chip_ns": 8938, + "compute_only_time_chip_ns": 7297, + "memory_only_time_chip_ns": 4086, + "ici_bound_time_chip_ns": 46620, + "bounded_by_pp_chip": false, + "throughput_requests_per_sec": 14938.527957455073, + "latency_ns": 66941, + "mem_footprint_GB": 127.99999904632568, + "out_of_memory": true, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "2", + "num_sa": 2, + "num_vu": 4, + "num_vu_ports": 2, + "hbm_bw_GBps": 600.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 32, + "freq_GHz": 0.7, + "sa_dim": 128, + "hbm_size_GB": 16, + "ici_bw_GBps": 125.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 280.0, + "min_power_W": 1.0, + "avg_power_W": 229.0, + "max_power_W": 280.0, + "HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 28.869, + "ICI_topology": "TORUS_2D", + "embodied_carbon_kgCO2": 296.2083333, + "use_vu_for_small_matmul": false, + "static_power_W_per_sa": 2.12, + "static_power_W_per_vu": 0.74127482825, + "static_power_vmem_W": 12.93490069, + "static_power_ici_W": 6.36, + "static_power_hbm_mc_W": 1.908, + "static_power_hbm_phy_W": 2.862, + "static_power_other_W": 21.73, + "dynamic_power_W_per_SA": 22.55530667, + "dynamic_power_W_per_VU": 2.121728, + "dynamic_power_vmem_W": 22.2144, + "dynamic_power_ici_W_per_GBps": 0.0247047779, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": 
"dlrm-s", + "model_type": "dlrm", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 2, + "num_tensor_parallel_axes": 2, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv", + "embedding_dim": 64, + "num_indices_per_lookup": [ + 100, + 100, + 100, + 100, + 100, + 100, + 100, + 100 + ], + "embedding_table_sizes": [ + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000 + ], + "num_dense_features": 1, + "bottom_mlp_config": [ + { + "in_features": 512, + "out_features": 256, + "bias": true, + "activation": "relu" + }, + { + "in_features": 256, + "out_features": 64, + "bias": true, + "activation": "relu" + } + ], + "top_mlp_config": [ + { + "in_features": 1024, + "out_features": 1024, + "bias": true, + "activation": "relu" + }, + { + "in_features": 1024, + "out_features": 1024, + "bias": true, + "activation": "relu" + }, + { + "in_features": 1024, + "out_features": 1, + "bias": true, + "activation": "relu" + } + ], + "interaction": "dot", + "num_pods": 1, + "batch_size_per_pod": 1 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_chip0.csv b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_chip0.csv new file mode 100644 index 0000000..355b00a --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_chip0.csv @@ -0,0 +1,18 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink 
time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor 
+2,BottomMLP_0_einsum,"XlaEinsum(a=1x512,b=512x256,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",BottomMLP0einsumbottommlp0einsum,MXU,1,Compute,915,915,500,0,0,0,0,0,0,0,0,915,45,0,0,263680,"DT_BFLOAT16:[1,512],DT_BFLOAT16:[512,256]","[DT_BFLOAT16:(1,256)]",262144,BottomMLP0einsumbottommlp0einsum,Einsum,262144,[],Einsum,"BI,IO->BO","[[1, 512], [512, 256], [1, 256]]",1,263680,8,1,1,256,512,0,915,263680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.286496174863388,268.3837557099556,0.0058777609404417515,0.4473062595165927,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,BottomMLP_1_einsum,"XlaEinsum(a=1x256,b=256x64,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",BottomMLP1einsumbottommlp1einsum,MXU,1,Memory,500,366,500,0,0,0,0,0,0,0,0,366,11,0,0,33408,"DT_BFLOAT16:[1,256],DT_BFLOAT16:[256,64]","[DT_BFLOAT16:(1,64)]",32768,BottomMLP1einsumbottommlp1einsum,Einsum,32768,[],Einsum,"BI,IO->BO","[[1, 256], [256, 64], [1, 64]]",1,33408,2,1,1,64,256,0,366,33408,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,98,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,62.22724914550781,0.0013445378151260505,0.10371208190917969,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,EmbeddingIndicesAllToAll-6,AllToAll(1x8x100->1x8x100),EmbeddingIndicesAllToAll6EmbeddingIndicesAllToAll,ICINoCompute,1,ICI/NVLink,23310,0,500,23310,3200,3200,0,0,0,0,0,0,0,0,0,6400,"DT_BFLOAT16:[1,8,100]","[DT_BFLOAT16:(1,8,100)]",0,EmbeddingIndicesAllToAll6EmbeddingIndicesAllToAll,AllToAll,0,[],AllToAll,,,,,0,,,,,0,0,6400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.25570418179060755,0.0,0.0004261736363176793,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+7,EmbeddingBag7,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_7,VPU,1,Memory,500,6,500,0,0,0,0,0,0,0,0,0,6,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_7,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,6,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.00892857142857143,0.08275111516316731,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,EmbeddingBag14,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_14,VPU,1,Memory,500,6,500,0,0,0,0,0,0,0,0,0,6,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_14,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,6,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.00892857142857143,0.08275111516316731,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,EmbeddingBag28,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_28,VPU,1,Memory,500,6,500,0,0,0,0,0,0,0,0,0,6,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_28,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,6,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.00892857142857143,0.08275111516316731,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +56,EmbeddingBag56,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 
0],type=DT_FLOAT)",embedding_bag_56,VPU,1,Memory,500,6,500,0,0,0,0,0,0,0,0,0,6,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_56,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,6,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.00892857142857143,0.08275111516316731,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +112,EmbeddingBag112,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_112,VPU,1,Memory,500,6,500,0,0,0,0,0,0,0,0,0,6,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_112,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,6,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.00892857142857143,0.08275111516316731,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +224,EmbeddingBag224,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_224,VPU,1,Memory,500,6,500,0,0,0,0,0,0,0,0,0,6,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_224,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,6,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.00892857142857143,0.08275111516316731,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +448,EmbeddingBag448,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 
0],type=DT_FLOAT)",embedding_bag_448,VPU,1,Memory,500,6,500,0,0,0,0,0,0,0,0,0,6,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_448,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,6,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.00892857142857143,0.08275111516316731,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +896,EmbeddingBag896,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_896,VPU,1,Memory,500,6,500,0,0,0,0,0,0,0,0,0,6,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_896,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,6,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.00892857142857143,0.08275111516316731,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +903,EmbeddingAllToAll-903,AllToAll(1x8x100x64->1x8x100x64),EmbeddingAllToAll903EmbeddingAllToAll,ICINoCompute,1,ICI/NVLink,23310,0,636,23310,204800,204800,0,0,0,0,0,0,0,0,0,409600,"DT_BFLOAT16:[1,8,100,64]","[DT_BFLOAT16:(1,8,100,64)]",0,EmbeddingAllToAll903EmbeddingAllToAll,AllToAll,0,[],AllToAll,,,,,0,,,,,0,0,409600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,16.365067634598883,0.0,0.027275112724331474,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+904,DotInteraction_einsum,"XlaEinsum(a=1x9x64,b=1x64x9,eq=BNE;BEN->BNN,memory_placements=0_0_0,type=DT_BFLOAT16)",DotInteractioneinsumDotInteractioneinsum,MXU,1,Compute,1006,1006,500,0,0,0,0,0,0,0,0,1006,51,0,0,2322,"DT_BFLOAT16:[1,9,64],DT_BFLOAT16:[1,64,9]","[DT_BFLOAT16:(1,9,9)]",1152,DotInteractioneinsumDotInteractioneinsum,Einsum,1152,[],Einsum,"BNE,BEN->BNN","[[1, 9, 64], [1, 64, 9], [1, 9, 9]]",1,258,9,9,1,1,64,0,1006,2322,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,178,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0011451292246520875,2.1496332189434804,2.3493492824565215e-05,0.0035827220315724673,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +1808,TopMLP_init_einsum,"XlaEinsum(a=1x128,b=128x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLPiniteinsumtopmlpiniteinsum,MXU,1,Compute,915,915,500,0,0,0,0,0,0,0,0,915,45,0,0,264448,"DT_BFLOAT16:[1,128],DT_BFLOAT16:[128,1024]","[DT_BFLOAT16:(1,1024)]",262144,TopMLPiniteinsumtopmlpiniteinsum,Einsum,262144,[],Einsum,"BI,IO->BO","[[1, 128], [128, 1024], [1, 1024]]",1,264448,8,1,1,1024,128,0,915,264448,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.286496174863388,269.165455969305,0.0058777609404417515,0.448609093282175,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3616,TopMLP_0_einsum,"XlaEinsum(a=1x1024,b=1024x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP0einsumtopmlp0einsum,MXU,1,Compute,6035,6035,3262,0,0,0,0,0,0,0,0,6035,365,0,0,2101248,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1024]","[DT_BFLOAT16:(1,1024)]",2097152,TopMLP0einsumtopmlp0einsum,Einsum,2097152,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1024], [1, 
1024]]",1,2101248,64,1,1,1024,1024,0,6035,2101248,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1095,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.34749826014913004,324.26506996944903,0.007129280875564807,0.5404417832824151,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7232,TopMLP_1_einsum,"XlaEinsum(a=1x1024,b=1024x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP1einsumtopmlp1einsum,MXU,1,Compute,6035,6035,3262,0,0,0,0,0,0,0,0,6035,365,0,0,2101248,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1024]","[DT_BFLOAT16:(1,1024)]",2097152,TopMLP1einsumtopmlp1einsum,Einsum,2097152,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1024], [1, 1024]]",1,2101248,64,1,1,1024,1024,0,6035,2101248,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1095,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.34749826014913004,324.26506996944903,0.007129280875564807,0.5404417832824151,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14464,TopMLP_2_einsum,"XlaEinsum(a=1x1024,b=1024x1,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP2einsumtopmlp2einsum,MXU,1,Compute,915,915,500,0,0,0,0,0,0,0,0,915,45,0,0,4098,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1]","[DT_BFLOAT16:(1,1)]",2048,TopMLP2einsumtopmlp2einsum,Einsum,2048,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1], [1, 1]]",1,4098,8,1,1,1,1024,0,915,4098,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.002238251366120219,4.171103727622111,4.5920007347201183e-05,0.006951839546036851,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_chip0.json b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_chip0.json new file mode 100644 index 0000000..492e79c --- /dev/null +++ 
b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_chip0.json @@ -0,0 +1,137 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 66941, + "overlapped_compute_time_non_pp_ns": 8938, + "compute_only_time_non_pp_ns": 7297, + "memory_only_time_non_pp_ns": 4086, + "ici_bound_time_non_pp_ns": 46620, + "total_execution_time_chip_ns": 66941, + "overlapped_compute_time_chip_ns": 8938, + "compute_only_time_chip_ns": 7297, + "memory_only_time_chip_ns": 4086, + "ici_bound_time_chip_ns": 46620, + "bounded_by_pp_chip": false, + "throughput_requests_per_sec": 14938.527957455073, + "latency_ns": 66941, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "2", + "num_sa": 2, + "num_vu": 4, + "num_vu_ports": 2, + "hbm_bw_GBps": 600.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 32, + "freq_GHz": 0.7, + "sa_dim": 128, + "hbm_size_GB": 16, + "ici_bw_GBps": 125.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 280.0, + "min_power_W": 1.0, + "avg_power_W": 229.0, + "max_power_W": 280.0, + "HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 28.869, + "ICI_topology": "TORUS_2D", + "embodied_carbon_kgCO2": 296.2083333, + "use_vu_for_small_matmul": false, + "static_power_W_per_sa": 2.12, + "static_power_W_per_vu": 0.74127482825, + "static_power_vmem_W": 12.93490069, + "static_power_ici_W": 6.36, + "static_power_hbm_mc_W": 1.908, + "static_power_hbm_phy_W": 2.862, + "static_power_other_W": 21.73, + "dynamic_power_W_per_SA": 22.55530667, + "dynamic_power_W_per_VU": 2.121728, + "dynamic_power_vmem_W": 22.2144, + "dynamic_power_ici_W_per_GBps": 0.0247047779, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "dlrm-s", + "model_type": "dlrm", + "global_batch_size": 1, + 
"num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 2, + "num_tensor_parallel_axes": 2, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv", + "embedding_dim": 64, + "num_indices_per_lookup": [ + 100, + 100, + 100, + 100, + 100, + 100, + 100, + 100 + ], + "embedding_table_sizes": [ + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000 + ], + "num_dense_features": 1, + "bottom_mlp_config": [ + { + "in_features": 512, + "out_features": 256, + "bias": true, + "activation": "relu" + }, + { + "in_features": 256, + "out_features": 64, + "bias": true, + "activation": "relu" + } + ], + "top_mlp_config": [ + { + "in_features": 1024, + "out_features": 1024, + "bias": true, + "activation": "relu" + }, + { + "in_features": 1024, + "out_features": 1024, + "bias": true, + "activation": "relu" + }, + { + "in_features": 1024, + "out_features": 1, + "bias": true, + "activation": "relu" + } + ], + "interaction": "dot", + "num_pods": 1, + "batch_size_per_pod": 1 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_config_stats.json b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_config_stats.json new file mode 100644 index 0000000..c3bfabb --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_config_stats.json @@ -0,0 +1,102 @@ +{ + "ICI_bisection_BW_per_chip_GBps": -125.0, + "topology": [], + "table_sharding": [ + { + "embed_dim": 64, + "num_entries": 10000000, + 
"num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + } + ] +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv 
b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv new file mode 100644 index 0000000..7c72566 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv @@ -0,0 +1,18 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity 
Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,BottomMLP_0_einsum,"XlaEinsum(a=1x512,b=512x256,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",BottomMLP0einsumbottommlp0einsum,MXU,1,Memory,500,341,500,0,0,0,0,0,0,0,0,341,34,0,0,263680,"DT_BFLOAT16:[1,512],DT_BFLOAT16:[512,256]","[DT_BFLOAT16:(1,256)]",262144,BottomMLP0einsumbottommlp0einsum,Einsum,262144,[],Einsum,"BI,IO->BO","[[1, 512], [512, 256], [1, 256]]",1,263680,8,1,1,256,512,0,341,263680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,143,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.524288,491.14227294921875,0.0041263700838168925,0.5457136366102431,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,BottomMLP_1_einsum,"XlaEinsum(a=1x256,b=256x64,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",BottomMLP1einsumbottommlp1einsum,MXU,1,Memory,500,137,500,0,0,0,0,0,0,0,0,137,8,0,0,33408,"DT_BFLOAT16:[1,256],DT_BFLOAT16:[256,64]","[DT_BFLOAT16:(1,64)]",32768,BottomMLP1einsumbottommlp1einsum,Einsum,32768,[],Einsum,"BI,IO->BO","[[1, 256], [256, 64], [1, 64]]",1,33408,2,1,1,64,256,0,137,33408,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,92,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,62.22724914550781,0.0005157962604771116,0.06914138793945312,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,EmbeddingIndicesAllToAll-6,AllToAll(1x8x100->1x8x100),EmbeddingIndicesAllToAll6EmbeddingIndicesAllToAll,ICINoCompute,1,ICI/NVLink,23310,0,500,23310,3200,3200,0,0,0,0,0,0,0,0,0,6400,"DT_BFLOAT16:[1,8,100]","[DT_BFLOAT16:(1,8,100)]",0,EmbeddingIndicesAllToAll6EmbeddingIndicesAllToAll,AllToAll,0,[],AllToAll,,,,,0,,,,,0,0,6400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.25570418179060755,0.0,0.0002841157575451195,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+7,EmbeddingBag7,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_7,VPU,1,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_7,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,5,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.006648936170212767,0.05516741010877821,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,EmbeddingBag14,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_14,VPU,1,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_14,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,5,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.006648936170212767,0.05516741010877821,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,EmbeddingBag28,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_28,VPU,1,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_28,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,5,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.006648936170212767,0.05516741010877821,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +56,EmbeddingBag56,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 
0],type=DT_FLOAT)",embedding_bag_56,VPU,1,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_56,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,5,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.006648936170212767,0.05516741010877821,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +112,EmbeddingBag112,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_112,VPU,1,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_112,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,5,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.006648936170212767,0.05516741010877821,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +224,EmbeddingBag224,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_224,VPU,1,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_224,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,5,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.006648936170212767,0.05516741010877821,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +448,EmbeddingBag448,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 
0],type=DT_FLOAT)",embedding_bag_448,VPU,1,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_448,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,5,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.006648936170212767,0.05516741010877821,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +896,EmbeddingBag896,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_896,VPU,1,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_896,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,5,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.006648936170212767,0.05516741010877821,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +903,EmbeddingAllToAll-903,AllToAll(1x8x100x64->1x8x100x64),EmbeddingAllToAll903EmbeddingAllToAll,ICINoCompute,1,ICI/NVLink,23310,0,500,23310,204800,204800,0,0,0,0,0,0,0,0,0,409600,"DT_BFLOAT16:[1,8,100,64]","[DT_BFLOAT16:(1,8,100,64)]",0,EmbeddingAllToAll903EmbeddingAllToAll,AllToAll,0,[],AllToAll,,,,,0,,,,,0,0,409600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,16.365067634598883,0.0,0.018183408482887648,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+904,DotInteraction_einsum,"XlaEinsum(a=1x9x64,b=1x64x9,eq=BNE;BEN->BNN,memory_placements=0_0_0,type=DT_BFLOAT16)",DotInteractioneinsumDotInteractioneinsum,MXU,1,Memory,500,375,500,0,0,0,0,0,0,0,0,375,38,0,0,2322,"DT_BFLOAT16:[1,9,64],DT_BFLOAT16:[1,64,9]","[DT_BFLOAT16:(1,9,9)]",1152,DotInteractioneinsumDotInteractioneinsum,Einsum,1152,[],Einsum,"BNE,BEN->BNN","[[1, 9, 64], [1, 64, 9], [1, 9, 9]]",1,258,9,9,1,1,64,0,375,2322,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,152,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.002304,4.325062036514282,1.8133462282398455e-05,0.004805624485015869,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +1808,TopMLP_init_einsum,"XlaEinsum(a=1x128,b=128x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLPiniteinsumtopmlpiniteinsum,MXU,1,Memory,500,341,500,0,0,0,0,0,0,0,0,341,34,0,0,264448,"DT_BFLOAT16:[1,128],DT_BFLOAT16:[128,1024]","[DT_BFLOAT16:(1,1024)]",262144,TopMLPiniteinsumtopmlpiniteinsum,Einsum,262144,[],Einsum,"BI,IO->BO","[[1, 128], [128, 1024], [1, 1024]]",1,264448,8,1,1,1024,128,0,341,264448,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,143,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.524288,492.5727844238281,0.0041263700838168925,0.5473030938042535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3616,TopMLP_0_einsum,"XlaEinsum(a=1x1024,b=1024x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP0einsumtopmlp0einsum,MXU,1,Compute,2247,2247,2175,0,0,0,0,0,0,0,0,2247,272,0,0,2101248,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1024]","[DT_BFLOAT16:(1,1024)]",2097152,TopMLP0einsumtopmlp0einsum,Einsum,2097152,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1024], [1, 
1024]]",1,2101248,64,1,1,1024,1024,0,2247,2101248,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,815,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9333119715175789,870.9121928195927,0.0073455631220594435,0.967680214243992,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7232,TopMLP_1_einsum,"XlaEinsum(a=1x1024,b=1024x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP1einsumtopmlp1einsum,MXU,1,Compute,2247,2247,2175,0,0,0,0,0,0,0,0,2247,272,0,0,2101248,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1024]","[DT_BFLOAT16:(1,1024)]",2097152,TopMLP1einsumtopmlp1einsum,Einsum,2097152,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1024], [1, 1024]]",1,2101248,64,1,1,1024,1024,0,2247,2101248,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,815,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9333119715175789,870.9121928195927,0.0073455631220594435,0.967680214243992,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14464,TopMLP_2_einsum,"XlaEinsum(a=1x1024,b=1024x1,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP2einsumtopmlp2einsum,MXU,1,Memory,500,341,500,0,0,0,0,0,0,0,0,341,34,0,0,4098,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1]","[DT_BFLOAT16:(1,1)]",2048,TopMLP2einsumtopmlp2einsum,Einsum,2048,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1], [1, 1]]",1,4098,8,1,1,1,1024,0,341,4098,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,143,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,7.633119821548462,3.223726627981947e-05,0.008481244246164957,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.json b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.json new file mode 100644 index 0000000..741de57 --- /dev/null +++ 
b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.json @@ -0,0 +1,139 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 57614, + "overlapped_compute_time_non_pp_ns": 5925, + "compute_only_time_non_pp_ns": 144, + "memory_only_time_non_pp_ns": 4925, + "ici_bound_time_non_pp_ns": 46620, + "total_execution_time_chip_ns": 57614, + "overlapped_compute_time_chip_ns": 5925, + "compute_only_time_chip_ns": 144, + "memory_only_time_chip_ns": 4925, + "ici_bound_time_chip_ns": 46620, + "bounded_by_pp_chip": false, + "throughput_requests_per_sec": 17356.892421980767, + "latency_ns": 57614, + "mem_footprint_GB": 255.99999904632568, + "out_of_memory": true, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "3", + "num_sa": 4, + "num_vu": 4, + "num_vu_ports": 2, + "hbm_bw_GBps": 900.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 32, + "freq_GHz": 0.94, + "sa_dim": 128, + "hbm_size_GB": 32, + "ici_bw_GBps": 164.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 450.0, + "min_power_W": 175.0, + "avg_power_W": 220.0, + "max_power_W": 262.0, + "HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 40.478, + "ICI_topology": "TORUS_2D", + "embodied_carbon_kgCO2": 311.8333333, + "use_vu_for_small_matmul": false, + "static_power_W_per_sa": 2.9866666675, + "static_power_W_per_vu": 0.74127482825, + "static_power_vmem_W": 12.93490069, + "static_power_ici_W": 8.96, + "static_power_hbm_mc_W": 4.032, + "static_power_hbm_phy_W": 6.048, + "static_power_other_W": 37.11333333, + "dynamic_power_W_per_SA": 30.28855467, + "dynamic_power_W_per_VU": 2.8491776, + "dynamic_power_vmem_W": 29.830784, + "dynamic_power_ici_W_per_GBps": 0.0247047779, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + 
"model_name": "dlrm-s", + "model_type": "dlrm", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 2, + "num_tensor_parallel_axes": 2, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv", + "embedding_dim": 64, + "num_indices_per_lookup": [ + 100, + 100, + 100, + 100, + 100, + 100, + 100, + 100 + ], + "embedding_table_sizes": [ + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000 + ], + "num_dense_features": 1, + "bottom_mlp_config": [ + { + "in_features": 512, + "out_features": 256, + "bias": true, + "activation": "relu" + }, + { + "in_features": 256, + "out_features": 64, + "bias": true, + "activation": "relu" + } + ], + "top_mlp_config": [ + { + "in_features": 1024, + "out_features": 1024, + "bias": true, + "activation": "relu" + }, + { + "in_features": 1024, + "out_features": 1024, + "bias": true, + "activation": "relu" + }, + { + "in_features": 1024, + "out_features": 1, + "bias": true, + "activation": "relu" + } + ], + "interaction": "dot", + "num_pods": 1, + "batch_size_per_pod": 1 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_chip0.csv b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_chip0.csv new file mode 100644 index 0000000..7c72566 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_chip0.csv @@ -0,0 +1,18 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory 
time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor 
+2,BottomMLP_0_einsum,"XlaEinsum(a=1x512,b=512x256,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",BottomMLP0einsumbottommlp0einsum,MXU,1,Memory,500,341,500,0,0,0,0,0,0,0,0,341,34,0,0,263680,"DT_BFLOAT16:[1,512],DT_BFLOAT16:[512,256]","[DT_BFLOAT16:(1,256)]",262144,BottomMLP0einsumbottommlp0einsum,Einsum,262144,[],Einsum,"BI,IO->BO","[[1, 512], [512, 256], [1, 256]]",1,263680,8,1,1,256,512,0,341,263680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,143,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.524288,491.14227294921875,0.0041263700838168925,0.5457136366102431,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,BottomMLP_1_einsum,"XlaEinsum(a=1x256,b=256x64,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",BottomMLP1einsumbottommlp1einsum,MXU,1,Memory,500,137,500,0,0,0,0,0,0,0,0,137,8,0,0,33408,"DT_BFLOAT16:[1,256],DT_BFLOAT16:[256,64]","[DT_BFLOAT16:(1,64)]",32768,BottomMLP1einsumbottommlp1einsum,Einsum,32768,[],Einsum,"BI,IO->BO","[[1, 256], [256, 64], [1, 64]]",1,33408,2,1,1,64,256,0,137,33408,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,92,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,62.22724914550781,0.0005157962604771116,0.06914138793945312,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,EmbeddingIndicesAllToAll-6,AllToAll(1x8x100->1x8x100),EmbeddingIndicesAllToAll6EmbeddingIndicesAllToAll,ICINoCompute,1,ICI/NVLink,23310,0,500,23310,3200,3200,0,0,0,0,0,0,0,0,0,6400,"DT_BFLOAT16:[1,8,100]","[DT_BFLOAT16:(1,8,100)]",0,EmbeddingIndicesAllToAll6EmbeddingIndicesAllToAll,AllToAll,0,[],AllToAll,,,,,0,,,,,0,0,6400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.25570418179060755,0.0,0.0002841157575451195,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+7,EmbeddingBag7,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_7,VPU,1,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_7,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,5,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.006648936170212767,0.05516741010877821,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,EmbeddingBag14,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_14,VPU,1,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_14,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,5,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.006648936170212767,0.05516741010877821,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,EmbeddingBag28,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_28,VPU,1,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_28,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,5,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.006648936170212767,0.05516741010877821,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +56,EmbeddingBag56,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 
0],type=DT_FLOAT)",embedding_bag_56,VPU,1,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_56,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,5,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.006648936170212767,0.05516741010877821,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +112,EmbeddingBag112,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_112,VPU,1,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_112,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,5,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.006648936170212767,0.05516741010877821,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +224,EmbeddingBag224,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_224,VPU,1,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_224,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,5,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.006648936170212767,0.05516741010877821,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +448,EmbeddingBag448,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 
0],type=DT_FLOAT)",embedding_bag_448,VPU,1,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_448,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,5,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.006648936170212767,0.05516741010877821,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +896,EmbeddingBag896,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_896,VPU,1,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_896,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,5,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.006648936170212767,0.05516741010877821,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +903,EmbeddingAllToAll-903,AllToAll(1x8x100x64->1x8x100x64),EmbeddingAllToAll903EmbeddingAllToAll,ICINoCompute,1,ICI/NVLink,23310,0,500,23310,204800,204800,0,0,0,0,0,0,0,0,0,409600,"DT_BFLOAT16:[1,8,100,64]","[DT_BFLOAT16:(1,8,100,64)]",0,EmbeddingAllToAll903EmbeddingAllToAll,AllToAll,0,[],AllToAll,,,,,0,,,,,0,0,409600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,16.365067634598883,0.0,0.018183408482887648,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+904,DotInteraction_einsum,"XlaEinsum(a=1x9x64,b=1x64x9,eq=BNE;BEN->BNN,memory_placements=0_0_0,type=DT_BFLOAT16)",DotInteractioneinsumDotInteractioneinsum,MXU,1,Memory,500,375,500,0,0,0,0,0,0,0,0,375,38,0,0,2322,"DT_BFLOAT16:[1,9,64],DT_BFLOAT16:[1,64,9]","[DT_BFLOAT16:(1,9,9)]",1152,DotInteractioneinsumDotInteractioneinsum,Einsum,1152,[],Einsum,"BNE,BEN->BNN","[[1, 9, 64], [1, 64, 9], [1, 9, 9]]",1,258,9,9,1,1,64,0,375,2322,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,152,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.002304,4.325062036514282,1.8133462282398455e-05,0.004805624485015869,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +1808,TopMLP_init_einsum,"XlaEinsum(a=1x128,b=128x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLPiniteinsumtopmlpiniteinsum,MXU,1,Memory,500,341,500,0,0,0,0,0,0,0,0,341,34,0,0,264448,"DT_BFLOAT16:[1,128],DT_BFLOAT16:[128,1024]","[DT_BFLOAT16:(1,1024)]",262144,TopMLPiniteinsumtopmlpiniteinsum,Einsum,262144,[],Einsum,"BI,IO->BO","[[1, 128], [128, 1024], [1, 1024]]",1,264448,8,1,1,1024,128,0,341,264448,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,143,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.524288,492.5727844238281,0.0041263700838168925,0.5473030938042535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3616,TopMLP_0_einsum,"XlaEinsum(a=1x1024,b=1024x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP0einsumtopmlp0einsum,MXU,1,Compute,2247,2247,2175,0,0,0,0,0,0,0,0,2247,272,0,0,2101248,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1024]","[DT_BFLOAT16:(1,1024)]",2097152,TopMLP0einsumtopmlp0einsum,Einsum,2097152,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1024], [1, 
1024]]",1,2101248,64,1,1,1024,1024,0,2247,2101248,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,815,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9333119715175789,870.9121928195927,0.0073455631220594435,0.967680214243992,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7232,TopMLP_1_einsum,"XlaEinsum(a=1x1024,b=1024x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP1einsumtopmlp1einsum,MXU,1,Compute,2247,2247,2175,0,0,0,0,0,0,0,0,2247,272,0,0,2101248,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1024]","[DT_BFLOAT16:(1,1024)]",2097152,TopMLP1einsumtopmlp1einsum,Einsum,2097152,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1024], [1, 1024]]",1,2101248,64,1,1,1024,1024,0,2247,2101248,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,815,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9333119715175789,870.9121928195927,0.0073455631220594435,0.967680214243992,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14464,TopMLP_2_einsum,"XlaEinsum(a=1x1024,b=1024x1,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP2einsumtopmlp2einsum,MXU,1,Memory,500,341,500,0,0,0,0,0,0,0,0,341,34,0,0,4098,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1]","[DT_BFLOAT16:(1,1)]",2048,TopMLP2einsumtopmlp2einsum,Einsum,2048,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1], [1, 1]]",1,4098,8,1,1,1,1024,0,341,4098,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,143,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,7.633119821548462,3.223726627981947e-05,0.008481244246164957,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_chip0.json b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_chip0.json new file mode 100644 index 0000000..4aeb2db --- /dev/null +++ 
b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_chip0.json @@ -0,0 +1,137 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 57614, + "overlapped_compute_time_non_pp_ns": 5925, + "compute_only_time_non_pp_ns": 144, + "memory_only_time_non_pp_ns": 4925, + "ici_bound_time_non_pp_ns": 46620, + "total_execution_time_chip_ns": 57614, + "overlapped_compute_time_chip_ns": 5925, + "compute_only_time_chip_ns": 144, + "memory_only_time_chip_ns": 4925, + "ici_bound_time_chip_ns": 46620, + "bounded_by_pp_chip": false, + "throughput_requests_per_sec": 17356.892421980767, + "latency_ns": 57614, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "3", + "num_sa": 4, + "num_vu": 4, + "num_vu_ports": 2, + "hbm_bw_GBps": 900.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 32, + "freq_GHz": 0.94, + "sa_dim": 128, + "hbm_size_GB": 32, + "ici_bw_GBps": 164.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 450.0, + "min_power_W": 175.0, + "avg_power_W": 220.0, + "max_power_W": 262.0, + "HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 40.478, + "ICI_topology": "TORUS_2D", + "embodied_carbon_kgCO2": 311.8333333, + "use_vu_for_small_matmul": false, + "static_power_W_per_sa": 2.9866666675, + "static_power_W_per_vu": 0.74127482825, + "static_power_vmem_W": 12.93490069, + "static_power_ici_W": 8.96, + "static_power_hbm_mc_W": 4.032, + "static_power_hbm_phy_W": 6.048, + "static_power_other_W": 37.11333333, + "dynamic_power_W_per_SA": 30.28855467, + "dynamic_power_W_per_VU": 2.8491776, + "dynamic_power_vmem_W": 29.830784, + "dynamic_power_ici_W_per_GBps": 0.0247047779, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "dlrm-s", + "model_type": "dlrm", + 
"global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 2, + "num_tensor_parallel_axes": 2, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv", + "embedding_dim": 64, + "num_indices_per_lookup": [ + 100, + 100, + 100, + 100, + 100, + 100, + 100, + 100 + ], + "embedding_table_sizes": [ + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000 + ], + "num_dense_features": 1, + "bottom_mlp_config": [ + { + "in_features": 512, + "out_features": 256, + "bias": true, + "activation": "relu" + }, + { + "in_features": 256, + "out_features": 64, + "bias": true, + "activation": "relu" + } + ], + "top_mlp_config": [ + { + "in_features": 1024, + "out_features": 1024, + "bias": true, + "activation": "relu" + }, + { + "in_features": 1024, + "out_features": 1024, + "bias": true, + "activation": "relu" + }, + { + "in_features": 1024, + "out_features": 1, + "bias": true, + "activation": "relu" + } + ], + "interaction": "dot", + "num_pods": 1, + "batch_size_per_pod": 1 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_config_stats.json b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_config_stats.json new file mode 100644 index 0000000..77aaf6c --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_config_stats.json @@ -0,0 +1,102 @@ +{ + "ICI_bisection_BW_per_chip_GBps": -164.0, + "topology": [], + "table_sharding": [ + { + "embed_dim": 64, + 
"num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + } + ] +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv 
b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv new file mode 100644 index 0000000..3d51e7f --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv @@ -0,0 +1,18 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity 
Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,BottomMLP_0_einsum,"XlaEinsum(a=1x512,b=512x256,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",BottomMLP0einsumbottommlp0einsum,MXU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,153,30,0,0,263680,"DT_BFLOAT16:[1,512],DT_BFLOAT16:[512,256]","[DT_BFLOAT16:(1,256)]",262144,BottomMLP0einsumbottommlp0einsum,Einsum,262144,[],Einsum,"BI,IO->BO","[[1, 512], [512, 256], [1, 256]]",1,263680,8,1,1,256,512,0,153,263680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.524288,491.14227294921875,0.0018754578754578755,0.4092852274576823,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,BottomMLP_1_einsum,"XlaEinsum(a=1x256,b=256x64,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",BottomMLP1einsumbottommlp1einsum,MXU,1,Memory,500,96,500,0,0,0,0,0,0,0,0,96,7,0,0,33408,"DT_BFLOAT16:[1,256],DT_BFLOAT16:[256,64]","[DT_BFLOAT16:(1,64)]",32768,BottomMLP1einsumbottommlp1einsum,Einsum,32768,[],Einsum,"BI,IO->BO","[[1, 256], [256, 64], [1, 64]]",1,33408,2,1,1,64,256,0,96,33408,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,58,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,62.22724914550781,0.00023443223443223444,0.051856040954589844,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,EmbeddingIndicesAllToAll-6,AllToAll(1x8x100->1x8x100),EmbeddingIndicesAllToAll6EmbeddingIndicesAllToAll,ICINoCompute,1,ICI/NVLink,23310,0,500,23310,3200,3200,0,0,0,0,0,0,0,0,0,6400,"DT_BFLOAT16:[1,8,100]","[DT_BFLOAT16:(1,8,100)]",0,EmbeddingIndicesAllToAll6EmbeddingIndicesAllToAll,AllToAll,0,[],AllToAll,,,,,0,,,,,0,0,6400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.25570418179060755,0.0,0.00021308681815883964,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+7,EmbeddingBag7,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_7,VPU,1,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_7,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,4,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.005952380952380953,0.04137555758158366,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,EmbeddingBag14,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_14,VPU,1,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_14,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,4,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.005952380952380953,0.04137555758158366,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,EmbeddingBag28,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_28,VPU,1,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_28,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,4,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.005952380952380953,0.04137555758158366,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +56,EmbeddingBag56,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 
0],type=DT_FLOAT)",embedding_bag_56,VPU,1,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_56,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,4,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.005952380952380953,0.04137555758158366,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +112,EmbeddingBag112,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_112,VPU,1,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_112,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,4,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.005952380952380953,0.04137555758158366,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +224,EmbeddingBag224,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_224,VPU,1,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_224,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,4,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.005952380952380953,0.04137555758158366,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +448,EmbeddingBag448,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 
0],type=DT_FLOAT)",embedding_bag_448,VPU,1,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_448,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,4,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.005952380952380953,0.04137555758158366,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +896,EmbeddingBag896,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_896,VPU,1,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_896,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,4,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.005952380952380953,0.04137555758158366,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +903,EmbeddingAllToAll-903,AllToAll(1x8x100x64->1x8x100x64),EmbeddingAllToAll903EmbeddingAllToAll,ICINoCompute,1,ICI/NVLink,23310,0,500,23310,204800,204800,0,0,0,0,0,0,0,0,0,409600,"DT_BFLOAT16:[1,8,100,64]","[DT_BFLOAT16:(1,8,100,64)]",0,EmbeddingAllToAll903EmbeddingAllToAll,AllToAll,0,[],AllToAll,,,,,0,,,,,0,0,409600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,16.365067634598883,0.0,0.013637556362165737,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+904,DotInteraction_einsum,"XlaEinsum(a=1x9x64,b=1x64x9,eq=BNE;BEN->BNN,memory_placements=0_0_0,type=DT_BFLOAT16)",DotInteractioneinsumDotInteractioneinsum,MXU,1,Memory,500,168,500,0,0,0,0,0,0,0,0,168,34,0,0,2322,"DT_BFLOAT16:[1,9,64],DT_BFLOAT16:[1,64,9]","[DT_BFLOAT16:(1,9,9)]",1152,DotInteractioneinsumDotInteractioneinsum,Einsum,1152,[],Einsum,"BNE,BEN->BNN","[[1, 9, 64], [1, 64, 9], [1, 9, 9]]",1,258,9,9,1,1,64,0,168,2322,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,76,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.002304,4.325062036514282,8.241758241758243e-06,0.003604218363761902,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +1808,TopMLP_init_einsum,"XlaEinsum(a=1x128,b=128x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLPiniteinsumtopmlpiniteinsum,MXU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,153,30,0,0,264448,"DT_BFLOAT16:[1,128],DT_BFLOAT16:[128,1024]","[DT_BFLOAT16:(1,1024)]",262144,TopMLPiniteinsumtopmlpiniteinsum,Einsum,262144,[],Einsum,"BI,IO->BO","[[1, 128], [128, 1024], [1, 1024]]",1,264448,8,1,1,1024,128,0,153,264448,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.524288,492.5727844238281,0.0018754578754578755,0.4104773203531901,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3616,TopMLP_0_einsum,"XlaEinsum(a=1x1024,b=1024x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP0einsumtopmlp0einsum,MXU,1,Memory,1631,1006,1631,0,0,0,0,0,0,0,0,1006,243,0,0,2101248,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1024]","[DT_BFLOAT16:(1,1024)]",2097152,TopMLP0einsumtopmlp0einsum,Einsum,2097152,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1024], [1, 
1024]]",1,2101248,64,1,1,1024,1024,0,1006,2101248,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,365,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2858074800735744,1199.8404029832159,0.004599528817799818,0.9998670024860132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7232,TopMLP_1_einsum,"XlaEinsum(a=1x1024,b=1024x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP1einsumtopmlp1einsum,MXU,1,Memory,1631,1006,1631,0,0,0,0,0,0,0,0,1006,243,0,0,2101248,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1024]","[DT_BFLOAT16:(1,1024)]",2097152,TopMLP1einsumtopmlp1einsum,Einsum,2097152,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1024], [1, 1024]]",1,2101248,64,1,1,1024,1024,0,1006,2101248,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,365,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2858074800735744,1199.8404029832159,0.004599528817799818,0.9998670024860132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14464,TopMLP_2_einsum,"XlaEinsum(a=1x1024,b=1024x1,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP2einsumtopmlp2einsum,MXU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,153,30,0,0,4098,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1]","[DT_BFLOAT16:(1,1)]",2048,TopMLP2einsumtopmlp2einsum,Einsum,2048,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1], [1, 1]]",1,4098,8,1,1,1,1024,0,153,4098,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,7.633119821548462,1.4652014652014653e-05,0.006360933184623718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.json b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.json new file mode 100644 index 0000000..4f53c2d --- /dev/null +++ 
b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.json @@ -0,0 +1,139 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 56382, + "overlapped_compute_time_non_pp_ns": 2767, + "compute_only_time_non_pp_ns": 0, + "memory_only_time_non_pp_ns": 6995, + "ici_bound_time_non_pp_ns": 46620, + "total_execution_time_chip_ns": 56382, + "overlapped_compute_time_chip_ns": 2767, + "compute_only_time_chip_ns": 0, + "memory_only_time_chip_ns": 6995, + "ici_bound_time_chip_ns": 46620, + "bounded_by_pp_chip": false, + "throughput_requests_per_sec": 17736.156929516514, + "latency_ns": 56382, + "mem_footprint_GB": 255.99999904632568, + "out_of_memory": true, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "4", + "num_sa": 8, + "num_vu": 4, + "num_vu_ports": 4, + "hbm_bw_GBps": 1200.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 128, + "freq_GHz": 1.05, + "sa_dim": 128, + "hbm_size_GB": 32, + "ici_bw_GBps": 112.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 300.0, + "min_power_W": 121.0, + "avg_power_W": 170.0, + "max_power_W": 192.0, + "HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 366.0, + "use_vu_for_small_matmul": false, + "static_power_W_per_sa": 1.222, + "static_power_W_per_vu": 0.427282, + "static_power_vmem_W": 21.777552, + "static_power_ici_W": 5.499, + "static_power_hbm_mc_W": 4.006409544, + "static_power_hbm_phy_W": 6.009614316, + "static_power_other_W": 41.22229614, + "dynamic_power_W_per_SA": 16.91648, + "dynamic_power_W_per_VU": 1.591296, + "dynamic_power_vmem_W": 30.110208, + "dynamic_power_ici_W_per_GBps": 0.01767315271, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": 
"dlrm-s", + "model_type": "dlrm", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 3, + "num_tensor_parallel_axes": 3, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv", + "embedding_dim": 64, + "num_indices_per_lookup": [ + 100, + 100, + 100, + 100, + 100, + 100, + 100, + 100 + ], + "embedding_table_sizes": [ + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000 + ], + "num_dense_features": 1, + "bottom_mlp_config": [ + { + "in_features": 512, + "out_features": 256, + "bias": true, + "activation": "relu" + }, + { + "in_features": 256, + "out_features": 64, + "bias": true, + "activation": "relu" + } + ], + "top_mlp_config": [ + { + "in_features": 1024, + "out_features": 1024, + "bias": true, + "activation": "relu" + }, + { + "in_features": 1024, + "out_features": 1024, + "bias": true, + "activation": "relu" + }, + { + "in_features": 1024, + "out_features": 1, + "bias": true, + "activation": "relu" + } + ], + "interaction": "dot", + "num_pods": 1, + "batch_size_per_pod": 1 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_chip0.csv b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_chip0.csv new file mode 100644 index 0000000..3d51e7f --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_chip0.csv @@ -0,0 +1,18 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink 
time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor 
+2,BottomMLP_0_einsum,"XlaEinsum(a=1x512,b=512x256,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",BottomMLP0einsumbottommlp0einsum,MXU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,153,30,0,0,263680,"DT_BFLOAT16:[1,512],DT_BFLOAT16:[512,256]","[DT_BFLOAT16:(1,256)]",262144,BottomMLP0einsumbottommlp0einsum,Einsum,262144,[],Einsum,"BI,IO->BO","[[1, 512], [512, 256], [1, 256]]",1,263680,8,1,1,256,512,0,153,263680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.524288,491.14227294921875,0.0018754578754578755,0.4092852274576823,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,BottomMLP_1_einsum,"XlaEinsum(a=1x256,b=256x64,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",BottomMLP1einsumbottommlp1einsum,MXU,1,Memory,500,96,500,0,0,0,0,0,0,0,0,96,7,0,0,33408,"DT_BFLOAT16:[1,256],DT_BFLOAT16:[256,64]","[DT_BFLOAT16:(1,64)]",32768,BottomMLP1einsumbottommlp1einsum,Einsum,32768,[],Einsum,"BI,IO->BO","[[1, 256], [256, 64], [1, 64]]",1,33408,2,1,1,64,256,0,96,33408,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,58,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,62.22724914550781,0.00023443223443223444,0.051856040954589844,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,EmbeddingIndicesAllToAll-6,AllToAll(1x8x100->1x8x100),EmbeddingIndicesAllToAll6EmbeddingIndicesAllToAll,ICINoCompute,1,ICI/NVLink,23310,0,500,23310,3200,3200,0,0,0,0,0,0,0,0,0,6400,"DT_BFLOAT16:[1,8,100]","[DT_BFLOAT16:(1,8,100)]",0,EmbeddingIndicesAllToAll6EmbeddingIndicesAllToAll,AllToAll,0,[],AllToAll,,,,,0,,,,,0,0,6400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.25570418179060755,0.0,0.00021308681815883964,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+7,EmbeddingBag7,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_7,VPU,1,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_7,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,4,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.005952380952380953,0.04137555758158366,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,EmbeddingBag14,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_14,VPU,1,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_14,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,4,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.005952380952380953,0.04137555758158366,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,EmbeddingBag28,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_28,VPU,1,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_28,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,4,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.005952380952380953,0.04137555758158366,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +56,EmbeddingBag56,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 
0],type=DT_FLOAT)",embedding_bag_56,VPU,1,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_56,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,4,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.005952380952380953,0.04137555758158366,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +112,EmbeddingBag112,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_112,VPU,1,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_112,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,4,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.005952380952380953,0.04137555758158366,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +224,EmbeddingBag224,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_224,VPU,1,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_224,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,4,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.005952380952380953,0.04137555758158366,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +448,EmbeddingBag448,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 
0],type=DT_FLOAT)",embedding_bag_448,VPU,1,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_448,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,4,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.005952380952380953,0.04137555758158366,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +896,EmbeddingBag896,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_896,VPU,1,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_896,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,4,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.005952380952380953,0.04137555758158366,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +903,EmbeddingAllToAll-903,AllToAll(1x8x100x64->1x8x100x64),EmbeddingAllToAll903EmbeddingAllToAll,ICINoCompute,1,ICI/NVLink,23310,0,500,23310,204800,204800,0,0,0,0,0,0,0,0,0,409600,"DT_BFLOAT16:[1,8,100,64]","[DT_BFLOAT16:(1,8,100,64)]",0,EmbeddingAllToAll903EmbeddingAllToAll,AllToAll,0,[],AllToAll,,,,,0,,,,,0,0,409600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,16.365067634598883,0.0,0.013637556362165737,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+904,DotInteraction_einsum,"XlaEinsum(a=1x9x64,b=1x64x9,eq=BNE;BEN->BNN,memory_placements=0_0_0,type=DT_BFLOAT16)",DotInteractioneinsumDotInteractioneinsum,MXU,1,Memory,500,168,500,0,0,0,0,0,0,0,0,168,34,0,0,2322,"DT_BFLOAT16:[1,9,64],DT_BFLOAT16:[1,64,9]","[DT_BFLOAT16:(1,9,9)]",1152,DotInteractioneinsumDotInteractioneinsum,Einsum,1152,[],Einsum,"BNE,BEN->BNN","[[1, 9, 64], [1, 64, 9], [1, 9, 9]]",1,258,9,9,1,1,64,0,168,2322,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,76,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.002304,4.325062036514282,8.241758241758243e-06,0.003604218363761902,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +1808,TopMLP_init_einsum,"XlaEinsum(a=1x128,b=128x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLPiniteinsumtopmlpiniteinsum,MXU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,153,30,0,0,264448,"DT_BFLOAT16:[1,128],DT_BFLOAT16:[128,1024]","[DT_BFLOAT16:(1,1024)]",262144,TopMLPiniteinsumtopmlpiniteinsum,Einsum,262144,[],Einsum,"BI,IO->BO","[[1, 128], [128, 1024], [1, 1024]]",1,264448,8,1,1,1024,128,0,153,264448,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.524288,492.5727844238281,0.0018754578754578755,0.4104773203531901,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3616,TopMLP_0_einsum,"XlaEinsum(a=1x1024,b=1024x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP0einsumtopmlp0einsum,MXU,1,Memory,1631,1006,1631,0,0,0,0,0,0,0,0,1006,243,0,0,2101248,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1024]","[DT_BFLOAT16:(1,1024)]",2097152,TopMLP0einsumtopmlp0einsum,Einsum,2097152,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1024], [1, 
1024]]",1,2101248,64,1,1,1024,1024,0,1006,2101248,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,365,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2858074800735744,1199.8404029832159,0.004599528817799818,0.9998670024860132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7232,TopMLP_1_einsum,"XlaEinsum(a=1x1024,b=1024x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP1einsumtopmlp1einsum,MXU,1,Memory,1631,1006,1631,0,0,0,0,0,0,0,0,1006,243,0,0,2101248,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1024]","[DT_BFLOAT16:(1,1024)]",2097152,TopMLP1einsumtopmlp1einsum,Einsum,2097152,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1024], [1, 1024]]",1,2101248,64,1,1,1024,1024,0,1006,2101248,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,365,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2858074800735744,1199.8404029832159,0.004599528817799818,0.9998670024860132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14464,TopMLP_2_einsum,"XlaEinsum(a=1x1024,b=1024x1,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP2einsumtopmlp2einsum,MXU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,153,30,0,0,4098,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1]","[DT_BFLOAT16:(1,1)]",2048,TopMLP2einsumtopmlp2einsum,Einsum,2048,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1], [1, 1]]",1,4098,8,1,1,1,1024,0,153,4098,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,7.633119821548462,1.4652014652014653e-05,0.006360933184623718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_chip0.json b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_chip0.json new file mode 100644 index 0000000..5787f89 --- /dev/null +++ 
b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_chip0.json @@ -0,0 +1,137 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 56382, + "overlapped_compute_time_non_pp_ns": 2767, + "compute_only_time_non_pp_ns": 0, + "memory_only_time_non_pp_ns": 6995, + "ici_bound_time_non_pp_ns": 46620, + "total_execution_time_chip_ns": 56382, + "overlapped_compute_time_chip_ns": 2767, + "compute_only_time_chip_ns": 0, + "memory_only_time_chip_ns": 6995, + "ici_bound_time_chip_ns": 46620, + "bounded_by_pp_chip": false, + "throughput_requests_per_sec": 17736.156929516514, + "latency_ns": 56382, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "4", + "num_sa": 8, + "num_vu": 4, + "num_vu_ports": 4, + "hbm_bw_GBps": 1200.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 128, + "freq_GHz": 1.05, + "sa_dim": 128, + "hbm_size_GB": 32, + "ici_bw_GBps": 112.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 300.0, + "min_power_W": 121.0, + "avg_power_W": 170.0, + "max_power_W": 192.0, + "HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 366.0, + "use_vu_for_small_matmul": false, + "static_power_W_per_sa": 1.222, + "static_power_W_per_vu": 0.427282, + "static_power_vmem_W": 21.777552, + "static_power_ici_W": 5.499, + "static_power_hbm_mc_W": 4.006409544, + "static_power_hbm_phy_W": 6.009614316, + "static_power_other_W": 41.22229614, + "dynamic_power_W_per_SA": 16.91648, + "dynamic_power_W_per_VU": 1.591296, + "dynamic_power_vmem_W": 30.110208, + "dynamic_power_ici_W_per_GBps": 0.01767315271, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "dlrm-s", + "model_type": "dlrm", + "global_batch_size": 1, + 
"num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 3, + "num_tensor_parallel_axes": 3, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv", + "embedding_dim": 64, + "num_indices_per_lookup": [ + 100, + 100, + 100, + 100, + 100, + 100, + 100, + 100 + ], + "embedding_table_sizes": [ + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000 + ], + "num_dense_features": 1, + "bottom_mlp_config": [ + { + "in_features": 512, + "out_features": 256, + "bias": true, + "activation": "relu" + }, + { + "in_features": 256, + "out_features": 64, + "bias": true, + "activation": "relu" + } + ], + "top_mlp_config": [ + { + "in_features": 1024, + "out_features": 1024, + "bias": true, + "activation": "relu" + }, + { + "in_features": 1024, + "out_features": 1024, + "bias": true, + "activation": "relu" + }, + { + "in_features": 1024, + "out_features": 1, + "bias": true, + "activation": "relu" + } + ], + "interaction": "dot", + "num_pods": 1, + "batch_size_per_pod": 1 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_config_stats.json b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_config_stats.json new file mode 100644 index 0000000..79e780a --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_config_stats.json @@ -0,0 +1,102 @@ +{ + "ICI_bisection_BW_per_chip_GBps": -112.0, + "topology": [], + "table_sharding": [ + { + "embed_dim": 64, + "num_entries": 10000000, + 
"num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + } + ] +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv 
b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv new file mode 100644 index 0000000..8cf4717 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv @@ -0,0 +1,18 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity 
Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,BottomMLP_0_einsum,"XlaEinsum(a=1x512,b=512x256,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",BottomMLP0einsumbottommlp0einsum,MXU,1,Memory,500,95,500,0,0,0,0,0,0,0,0,95,12,0,0,263680,"DT_BFLOAT16:[1,512],DT_BFLOAT16:[512,256]","[DT_BFLOAT16:(1,256)]",262144,BottomMLP0einsumbottommlp0einsum,Einsum,262144,[],Einsum,"BI,IO->BO","[[1, 512], [512, 256], [1, 256]]",1,263680,8,1,1,256,512,0,95,263680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.524288,491.14227294921875,0.0011495285136955545,0.1776283084807301,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,BottomMLP_1_einsum,"XlaEinsum(a=1x256,b=256x64,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",BottomMLP1einsumbottommlp1einsum,MXU,1,Memory,500,59,500,0,0,0,0,0,0,0,0,59,3,0,0,33408,"DT_BFLOAT16:[1,256],DT_BFLOAT16:[256,64]","[DT_BFLOAT16:(1,64)]",32768,BottomMLP1einsumbottommlp1einsum,Einsum,32768,[],Einsum,"BI,IO->BO","[[1, 256], [256, 64], [1, 64]]",1,33408,2,1,1,64,256,0,59,33408,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,42,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,62.22724914550781,0.00014369106421194432,0.022505334229840076,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,EmbeddingIndicesAllToAll-6,AllToAll(1x8x100->1x8x100),EmbeddingIndicesAllToAll6EmbeddingIndicesAllToAll,ICINoCompute,1,ICI/NVLink,23310,0,500,23310,3200,3200,0,0,0,0,0,0,0,0,0,6400,"DT_BFLOAT16:[1,8,100]","[DT_BFLOAT16:(1,8,100)]",0,EmbeddingIndicesAllToAll6EmbeddingIndicesAllToAll,AllToAll,0,[],AllToAll,,,,,0,,,,,0,0,6400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.25570418179060755,0.0,9.24789084233662e-05,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+7,EmbeddingBag7,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_7,VPU,1,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_7,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,2,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0024509803921568627,0.017956842350054392,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,EmbeddingBag14,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_14,VPU,1,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_14,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,2,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0024509803921568627,0.017956842350054392,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,EmbeddingBag28,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_28,VPU,1,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_28,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,2,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0024509803921568627,0.017956842350054392,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +56,EmbeddingBag56,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 
0],type=DT_FLOAT)",embedding_bag_56,VPU,1,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_56,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,2,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0024509803921568627,0.017956842350054392,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +112,EmbeddingBag112,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_112,VPU,1,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_112,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,2,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0024509803921568627,0.017956842350054392,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +224,EmbeddingBag224,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_224,VPU,1,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_224,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,2,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0024509803921568627,0.017956842350054392,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +448,EmbeddingBag448,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 
0],type=DT_FLOAT)",embedding_bag_448,VPU,1,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_448,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,2,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0024509803921568627,0.017956842350054392,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +896,EmbeddingBag896,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_896,VPU,1,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_896,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,2,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0024509803921568627,0.017956842350054392,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +903,EmbeddingAllToAll-903,AllToAll(1x8x100x64->1x8x100x64),EmbeddingAllToAll903EmbeddingAllToAll,ICINoCompute,1,ICI/NVLink,23310,0,500,23310,204800,204800,0,0,0,0,0,0,0,0,0,409600,"DT_BFLOAT16:[1,8,100,64]","[DT_BFLOAT16:(1,8,100,64)]",0,EmbeddingAllToAll903EmbeddingAllToAll,AllToAll,0,[],AllToAll,,,,,0,,,,,0,0,409600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,16.365067634598883,0.0,0.005918650139095437,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+904,DotInteraction_einsum,"XlaEinsum(a=1x9x64,b=1x64x9,eq=BNE;BEN->BNN,memory_placements=0_0_0,type=DT_BFLOAT16)",DotInteractioneinsumDotInteractioneinsum,MXU,1,Memory,500,104,500,0,0,0,0,0,0,0,0,104,14,0,0,2322,"DT_BFLOAT16:[1,9,64],DT_BFLOAT16:[1,64,9]","[DT_BFLOAT16:(1,9,9)]",1152,DotInteractioneinsumDotInteractioneinsum,Einsum,1152,[],Einsum,"BNE,BEN->BNN","[[1, 9, 64], [1, 64, 9], [1, 9, 9]]",1,258,9,9,1,1,64,0,104,2322,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,50,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.002304,4.325062036514282,5.051638976201168e-06,0.0015642177347248761,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +1808,TopMLP_init_einsum,"XlaEinsum(a=1x128,b=128x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLPiniteinsumtopmlpiniteinsum,MXU,1,Memory,500,95,500,0,0,0,0,0,0,0,0,95,12,0,0,264448,"DT_BFLOAT16:[1,128],DT_BFLOAT16:[128,1024]","[DT_BFLOAT16:(1,1024)]",262144,TopMLPiniteinsumtopmlpiniteinsum,Einsum,262144,[],Einsum,"BI,IO->BO","[[1, 128], [128, 1024], [1, 1024]]",1,264448,8,1,1,1024,128,0,95,264448,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.524288,492.5727844238281,0.0011495285136955545,0.1781456724860138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3616,TopMLP_0_einsum,"XlaEinsum(a=1x1024,b=1024x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP0einsumtopmlp0einsum,MXU,1,Memory,708,622,708,0,0,0,0,0,0,0,0,622,100,0,0,2101248,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1024]","[DT_BFLOAT16:(1,1024)]",2097152,TopMLP0einsumtopmlp0einsum,Einsum,2097152,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1024], [1, 
1024]]",1,2101248,64,1,1,1024,1024,0,622,2101248,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,150,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9620790960451977,2764.039120431674,0.006494511376811043,0.9996524847854155,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7232,TopMLP_1_einsum,"XlaEinsum(a=1x1024,b=1024x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP1einsumtopmlp1einsum,MXU,1,Memory,708,622,708,0,0,0,0,0,0,0,0,622,100,0,0,2101248,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1024]","[DT_BFLOAT16:(1,1024)]",2097152,TopMLP1einsumtopmlp1einsum,Einsum,2097152,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1024], [1, 1024]]",1,2101248,64,1,1,1024,1024,0,622,2101248,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,150,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9620790960451977,2764.039120431674,0.006494511376811043,0.9996524847854155,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14464,TopMLP_2_einsum,"XlaEinsum(a=1x1024,b=1024x1,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP2einsumtopmlp2einsum,MXU,1,Memory,500,95,500,0,0,0,0,0,0,0,0,95,12,0,0,4098,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1]","[DT_BFLOAT16:(1,1)]",2048,TopMLP2einsumtopmlp2einsum,Einsum,2048,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1], [1, 1]]",1,4098,8,1,1,1,1024,0,95,4098,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,7.633119821548462,8.98069151324652e-06,0.002760621996943386,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.json b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.json new file mode 100644 index 0000000..829a60a --- /dev/null +++ 
b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.json @@ -0,0 +1,139 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 54536, + "overlapped_compute_time_non_pp_ns": 1708, + "compute_only_time_non_pp_ns": 0, + "memory_only_time_non_pp_ns": 6208, + "ici_bound_time_non_pp_ns": 46620, + "total_execution_time_chip_ns": 54536, + "overlapped_compute_time_chip_ns": 1708, + "compute_only_time_chip_ns": 0, + "memory_only_time_chip_ns": 6208, + "ici_bound_time_chip_ns": 46620, + "bounded_by_pp_chip": false, + "throughput_requests_per_sec": 18336.511662021414, + "latency_ns": 54536, + "mem_footprint_GB": 759.9999990463257, + "out_of_memory": true, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "5p", + "num_sa": 8, + "num_vu": 6, + "num_vu_ports": 6, + "hbm_bw_GBps": 2765.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 128, + "freq_GHz": 1.7, + "sa_dim": 128, + "hbm_size_GB": 95, + "ici_bw_GBps": 200.0, + "dcn_bw_GBps": 25.0, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 350.0, + "min_power_W": 1.0, + "avg_power_W": 1.0, + "max_power_W": 331.0, + "HBM_GBps_per_W": 123.5, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 585.0, + "use_vu_for_small_matmul": false, + "static_power_W_per_sa": 1.35868996, + "static_power_W_per_vu": 0.475076728, + "static_power_vmem_W": 24.21353615, + "static_power_ici_W": 6.114104803, + "static_power_hbm_mc_W": 10.264041296, + "static_power_hbm_phy_W": 15.396061944, + "static_power_other_W": 44.82811018, + "dynamic_power_W_per_SA": 28.19413333, + "dynamic_power_W_per_VU": 2.65216, + "dynamic_power_vmem_W": 50.18368, + "dynamic_power_ici_W_per_GBps": 0.01767315271, + "dynamic_power_hbm_W_per_GBps": 0.01261538462, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + 
"model_name": "dlrm-s", + "model_type": "dlrm", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 3, + "num_tensor_parallel_axes": 3, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv", + "embedding_dim": 64, + "num_indices_per_lookup": [ + 100, + 100, + 100, + 100, + 100, + 100, + 100, + 100 + ], + "embedding_table_sizes": [ + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000 + ], + "num_dense_features": 1, + "bottom_mlp_config": [ + { + "in_features": 512, + "out_features": 256, + "bias": true, + "activation": "relu" + }, + { + "in_features": 256, + "out_features": 64, + "bias": true, + "activation": "relu" + } + ], + "top_mlp_config": [ + { + "in_features": 1024, + "out_features": 1024, + "bias": true, + "activation": "relu" + }, + { + "in_features": 1024, + "out_features": 1024, + "bias": true, + "activation": "relu" + }, + { + "in_features": 1024, + "out_features": 1, + "bias": true, + "activation": "relu" + } + ], + "interaction": "dot", + "num_pods": 1, + "batch_size_per_pod": 1 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_chip0.csv b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_chip0.csv new file mode 100644 index 0000000..8cf4717 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_chip0.csv @@ -0,0 +1,18 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute 
time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor 
+2,BottomMLP_0_einsum,"XlaEinsum(a=1x512,b=512x256,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",BottomMLP0einsumbottommlp0einsum,MXU,1,Memory,500,95,500,0,0,0,0,0,0,0,0,95,12,0,0,263680,"DT_BFLOAT16:[1,512],DT_BFLOAT16:[512,256]","[DT_BFLOAT16:(1,256)]",262144,BottomMLP0einsumbottommlp0einsum,Einsum,262144,[],Einsum,"BI,IO->BO","[[1, 512], [512, 256], [1, 256]]",1,263680,8,1,1,256,512,0,95,263680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.524288,491.14227294921875,0.0011495285136955545,0.1776283084807301,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,BottomMLP_1_einsum,"XlaEinsum(a=1x256,b=256x64,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",BottomMLP1einsumbottommlp1einsum,MXU,1,Memory,500,59,500,0,0,0,0,0,0,0,0,59,3,0,0,33408,"DT_BFLOAT16:[1,256],DT_BFLOAT16:[256,64]","[DT_BFLOAT16:(1,64)]",32768,BottomMLP1einsumbottommlp1einsum,Einsum,32768,[],Einsum,"BI,IO->BO","[[1, 256], [256, 64], [1, 64]]",1,33408,2,1,1,64,256,0,59,33408,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,42,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,62.22724914550781,0.00014369106421194432,0.022505334229840076,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,EmbeddingIndicesAllToAll-6,AllToAll(1x8x100->1x8x100),EmbeddingIndicesAllToAll6EmbeddingIndicesAllToAll,ICINoCompute,1,ICI/NVLink,23310,0,500,23310,3200,3200,0,0,0,0,0,0,0,0,0,6400,"DT_BFLOAT16:[1,8,100]","[DT_BFLOAT16:(1,8,100)]",0,EmbeddingIndicesAllToAll6EmbeddingIndicesAllToAll,AllToAll,0,[],AllToAll,,,,,0,,,,,0,0,6400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.25570418179060755,0.0,9.24789084233662e-05,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+7,EmbeddingBag7,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_7,VPU,1,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_7,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,2,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0024509803921568627,0.017956842350054392,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,EmbeddingBag14,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_14,VPU,1,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_14,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,2,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0024509803921568627,0.017956842350054392,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,EmbeddingBag28,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_28,VPU,1,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_28,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,2,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0024509803921568627,0.017956842350054392,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +56,EmbeddingBag56,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 
0],type=DT_FLOAT)",embedding_bag_56,VPU,1,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_56,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,2,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0024509803921568627,0.017956842350054392,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +112,EmbeddingBag112,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_112,VPU,1,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_112,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,2,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0024509803921568627,0.017956842350054392,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +224,EmbeddingBag224,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_224,VPU,1,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_224,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,2,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0024509803921568627,0.017956842350054392,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +448,EmbeddingBag448,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 
0],type=DT_FLOAT)",embedding_bag_448,VPU,1,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_448,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,2,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0024509803921568627,0.017956842350054392,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +896,EmbeddingBag896,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_896,VPU,1,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_896,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,2,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0024509803921568627,0.017956842350054392,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +903,EmbeddingAllToAll-903,AllToAll(1x8x100x64->1x8x100x64),EmbeddingAllToAll903EmbeddingAllToAll,ICINoCompute,1,ICI/NVLink,23310,0,500,23310,204800,204800,0,0,0,0,0,0,0,0,0,409600,"DT_BFLOAT16:[1,8,100,64]","[DT_BFLOAT16:(1,8,100,64)]",0,EmbeddingAllToAll903EmbeddingAllToAll,AllToAll,0,[],AllToAll,,,,,0,,,,,0,0,409600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,16.365067634598883,0.0,0.005918650139095437,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+904,DotInteraction_einsum,"XlaEinsum(a=1x9x64,b=1x64x9,eq=BNE;BEN->BNN,memory_placements=0_0_0,type=DT_BFLOAT16)",DotInteractioneinsumDotInteractioneinsum,MXU,1,Memory,500,104,500,0,0,0,0,0,0,0,0,104,14,0,0,2322,"DT_BFLOAT16:[1,9,64],DT_BFLOAT16:[1,64,9]","[DT_BFLOAT16:(1,9,9)]",1152,DotInteractioneinsumDotInteractioneinsum,Einsum,1152,[],Einsum,"BNE,BEN->BNN","[[1, 9, 64], [1, 64, 9], [1, 9, 9]]",1,258,9,9,1,1,64,0,104,2322,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,50,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.002304,4.325062036514282,5.051638976201168e-06,0.0015642177347248761,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +1808,TopMLP_init_einsum,"XlaEinsum(a=1x128,b=128x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLPiniteinsumtopmlpiniteinsum,MXU,1,Memory,500,95,500,0,0,0,0,0,0,0,0,95,12,0,0,264448,"DT_BFLOAT16:[1,128],DT_BFLOAT16:[128,1024]","[DT_BFLOAT16:(1,1024)]",262144,TopMLPiniteinsumtopmlpiniteinsum,Einsum,262144,[],Einsum,"BI,IO->BO","[[1, 128], [128, 1024], [1, 1024]]",1,264448,8,1,1,1024,128,0,95,264448,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.524288,492.5727844238281,0.0011495285136955545,0.1781456724860138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3616,TopMLP_0_einsum,"XlaEinsum(a=1x1024,b=1024x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP0einsumtopmlp0einsum,MXU,1,Memory,708,622,708,0,0,0,0,0,0,0,0,622,100,0,0,2101248,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1024]","[DT_BFLOAT16:(1,1024)]",2097152,TopMLP0einsumtopmlp0einsum,Einsum,2097152,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1024], [1, 
1024]]",1,2101248,64,1,1,1024,1024,0,622,2101248,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,150,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9620790960451977,2764.039120431674,0.006494511376811043,0.9996524847854155,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7232,TopMLP_1_einsum,"XlaEinsum(a=1x1024,b=1024x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP1einsumtopmlp1einsum,MXU,1,Memory,708,622,708,0,0,0,0,0,0,0,0,622,100,0,0,2101248,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1024]","[DT_BFLOAT16:(1,1024)]",2097152,TopMLP1einsumtopmlp1einsum,Einsum,2097152,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1024], [1, 1024]]",1,2101248,64,1,1,1024,1024,0,622,2101248,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,150,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9620790960451977,2764.039120431674,0.006494511376811043,0.9996524847854155,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14464,TopMLP_2_einsum,"XlaEinsum(a=1x1024,b=1024x1,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP2einsumtopmlp2einsum,MXU,1,Memory,500,95,500,0,0,0,0,0,0,0,0,95,12,0,0,4098,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1]","[DT_BFLOAT16:(1,1)]",2048,TopMLP2einsumtopmlp2einsum,Einsum,2048,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1], [1, 1]]",1,4098,8,1,1,1,1024,0,95,4098,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,7.633119821548462,8.98069151324652e-06,0.002760621996943386,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_chip0.json b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_chip0.json new file mode 100644 index 0000000..634982b --- /dev/null +++ 
b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_chip0.json @@ -0,0 +1,137 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 54536, + "overlapped_compute_time_non_pp_ns": 1708, + "compute_only_time_non_pp_ns": 0, + "memory_only_time_non_pp_ns": 6208, + "ici_bound_time_non_pp_ns": 46620, + "total_execution_time_chip_ns": 54536, + "overlapped_compute_time_chip_ns": 1708, + "compute_only_time_chip_ns": 0, + "memory_only_time_chip_ns": 6208, + "ici_bound_time_chip_ns": 46620, + "bounded_by_pp_chip": false, + "throughput_requests_per_sec": 18336.511662021414, + "latency_ns": 54536, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "5p", + "num_sa": 8, + "num_vu": 6, + "num_vu_ports": 6, + "hbm_bw_GBps": 2765.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 128, + "freq_GHz": 1.7, + "sa_dim": 128, + "hbm_size_GB": 95, + "ici_bw_GBps": 200.0, + "dcn_bw_GBps": 25.0, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 350.0, + "min_power_W": 1.0, + "avg_power_W": 1.0, + "max_power_W": 331.0, + "HBM_GBps_per_W": 123.5, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 585.0, + "use_vu_for_small_matmul": false, + "static_power_W_per_sa": 1.35868996, + "static_power_W_per_vu": 0.475076728, + "static_power_vmem_W": 24.21353615, + "static_power_ici_W": 6.114104803, + "static_power_hbm_mc_W": 10.264041296, + "static_power_hbm_phy_W": 15.396061944, + "static_power_other_W": 44.82811018, + "dynamic_power_W_per_SA": 28.19413333, + "dynamic_power_W_per_VU": 2.65216, + "dynamic_power_vmem_W": 50.18368, + "dynamic_power_ici_W_per_GBps": 0.01767315271, + "dynamic_power_hbm_W_per_GBps": 0.01261538462, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "dlrm-s", + "model_type": "dlrm", + 
"global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 3, + "num_tensor_parallel_axes": 3, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv", + "embedding_dim": 64, + "num_indices_per_lookup": [ + 100, + 100, + 100, + 100, + 100, + 100, + 100, + 100 + ], + "embedding_table_sizes": [ + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000 + ], + "num_dense_features": 1, + "bottom_mlp_config": [ + { + "in_features": 512, + "out_features": 256, + "bias": true, + "activation": "relu" + }, + { + "in_features": 256, + "out_features": 64, + "bias": true, + "activation": "relu" + } + ], + "top_mlp_config": [ + { + "in_features": 1024, + "out_features": 1024, + "bias": true, + "activation": "relu" + }, + { + "in_features": 1024, + "out_features": 1024, + "bias": true, + "activation": "relu" + }, + { + "in_features": 1024, + "out_features": 1, + "bias": true, + "activation": "relu" + } + ], + "interaction": "dot", + "num_pods": 1, + "batch_size_per_pod": 1 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_config_stats.json b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_config_stats.json new file mode 100644 index 0000000..a09d9de --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_config_stats.json @@ -0,0 +1,102 @@ +{ + "ICI_bisection_BW_per_chip_GBps": -200.0, + "topology": [], + "table_sharding": [ + { + "embed_dim": 64, + 
"num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + } + ] +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv 
b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv new file mode 100644 index 0000000..7e94d89 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv @@ -0,0 +1,18 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity 
Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,BottomMLP_0_einsum,"XlaEinsum(a=1x512,b=512x256,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",BottomMLP0einsumbottommlp0einsum,MXU,1,Memory,500,100,500,0,0,0,0,0,0,0,0,100,4,0,0,263680,"DT_BFLOAT16:[1,512],DT_BFLOAT16:[512,256]","[DT_BFLOAT16:(1,256)]",262144,BottomMLP0einsumbottommlp0einsum,Einsum,262144,[],Einsum,"BI,IO->BO","[[1, 512], [512, 256], [1, 256]]",1,263680,2,1,1,256,512,0,100,263680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,81,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.524288,491.14227294921875,0.00024806201550387597,0.06637057742557011,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,BottomMLP_1_einsum,"XlaEinsum(a=1x256,b=256x64,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",BottomMLP1einsumbottommlp1einsum,MXU,1,Memory,500,100,500,0,0,0,0,0,0,0,0,100,2,0,0,33408,"DT_BFLOAT16:[1,256],DT_BFLOAT16:[256,64]","[DT_BFLOAT16:(1,64)]",32768,BottomMLP1einsumbottommlp1einsum,Einsum,32768,[],Einsum,"BI,IO->BO","[[1, 256], [256, 64], [1, 64]]",1,33408,1,1,1,64,256,0,100,33408,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,81,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,62.22724914550781,3.1007751937984497e-05,0.008409087722365921,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,EmbeddingIndicesAllToAll-6,AllToAll(1x8x100->1x8x100),EmbeddingIndicesAllToAll6EmbeddingIndicesAllToAll,ICINoCompute,1,ICI/NVLink,23310,0,500,23310,3200,3200,0,0,0,0,0,0,0,0,0,6400,"DT_BFLOAT16:[1,8,100]","[DT_BFLOAT16:(1,8,100)]",0,EmbeddingIndicesAllToAll6EmbeddingIndicesAllToAll,AllToAll,0,[],AllToAll,,,,,0,,,,,0,0,6400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.25570418179060755,0.0,3.4554619160892914e-05,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+7,EmbeddingBag7,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_7,VPU,1,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_7,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,1,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0015625,0.006709549878094648,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,EmbeddingBag14,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_14,VPU,1,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_14,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,1,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0015625,0.006709549878094648,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,EmbeddingBag28,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_28,VPU,1,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_28,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,1,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0015625,0.006709549878094648,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +56,EmbeddingBag56,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 
0],type=DT_FLOAT)",embedding_bag_56,VPU,1,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_56,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,1,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0015625,0.006709549878094648,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +112,EmbeddingBag112,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_112,VPU,1,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_112,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,1,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0015625,0.006709549878094648,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +224,EmbeddingBag224,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_224,VPU,1,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_224,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,1,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0015625,0.006709549878094648,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +448,EmbeddingBag448,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 
0],type=DT_FLOAT)",embedding_bag_448,VPU,1,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_448,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,1,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0015625,0.006709549878094648,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +896,EmbeddingBag896,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_896,VPU,1,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_896,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,1,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0015625,0.006709549878094648,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +903,EmbeddingAllToAll-903,AllToAll(1x8x100x64->1x8x100x64),EmbeddingAllToAll903EmbeddingAllToAll,ICINoCompute,1,ICI/NVLink,23310,0,500,23310,204800,204800,0,0,0,0,0,0,0,0,0,409600,"DT_BFLOAT16:[1,8,100,64]","[DT_BFLOAT16:(1,8,100,64)]",0,EmbeddingAllToAll903EmbeddingAllToAll,AllToAll,0,[],AllToAll,,,,,0,,,,,0,0,409600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,16.365067634598883,0.0,0.0022114956262971465,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+904,DotInteraction_einsum,"XlaEinsum(a=1x9x64,b=1x64x9,eq=BNE;BEN->BNN,memory_placements=0_0_0,type=DT_BFLOAT16)",DotInteractioneinsumDotInteractioneinsum,MXU,1,Memory,500,176,500,0,0,0,0,0,0,0,0,176,18,0,0,2322,"DT_BFLOAT16:[1,9,64],DT_BFLOAT16:[1,64,9]","[DT_BFLOAT16:(1,9,9)]",1152,DotInteractioneinsumDotInteractioneinsum,Einsum,1152,[],Einsum,"BNE,BEN->BNN","[[1, 9, 64], [1, 64, 9], [1, 9, 9]]",1,258,9,9,1,1,64,0,176,2322,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.002304,4.325062036514282,1.0901162790697674e-06,0.0005844678427722003,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +1808,TopMLP_init_einsum,"XlaEinsum(a=1x128,b=128x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLPiniteinsumtopmlpiniteinsum,MXU,1,Memory,500,100,500,0,0,0,0,0,0,0,0,100,8,0,0,264448,"DT_BFLOAT16:[1,128],DT_BFLOAT16:[128,1024]","[DT_BFLOAT16:(1,1024)]",262144,TopMLPiniteinsumtopmlpiniteinsum,Einsum,262144,[],Einsum,"BI,IO->BO","[[1, 128], [128, 1024], [1, 1024]]",1,264448,4,1,1,1024,128,0,100,264448,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,81,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.524288,492.5727844238281,0.00024806201550387597,0.0665638897870038,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3616,TopMLP_0_einsum,"XlaEinsum(a=1x1024,b=1024x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP0einsumtopmlp0einsum,MXU,1,Memory,500,288,500,0,0,0,0,0,0,0,0,288,32,0,0,2101248,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1024]","[DT_BFLOAT16:(1,1024)]",2097152,TopMLP0einsumtopmlp0einsum,Einsum,2097152,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1024], [1, 
1024]]",1,2101248,16,1,1,1024,1024,0,288,2101248,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.194304,3913.87939453125,0.0019844961240310078,0.5289026208826013,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7232,TopMLP_1_einsum,"XlaEinsum(a=1x1024,b=1024x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP1einsumtopmlp1einsum,MXU,1,Memory,500,288,500,0,0,0,0,0,0,0,0,288,32,0,0,2101248,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1024]","[DT_BFLOAT16:(1,1024)]",2097152,TopMLP1einsumtopmlp1einsum,Einsum,2097152,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1024], [1, 1024]]",1,2101248,16,1,1,1024,1024,0,288,2101248,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.194304,3913.87939453125,0.0019844961240310078,0.5289026208826013,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14464,TopMLP_2_einsum,"XlaEinsum(a=1x1024,b=1024x1,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP2einsumtopmlp2einsum,MXU,1,Memory,500,100,500,0,0,0,0,0,0,0,0,100,8,0,0,4098,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1]","[DT_BFLOAT16:(1,1)]",2048,TopMLP2einsumtopmlp2einsum,Einsum,2048,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1], [1, 1]]",1,4098,4,1,1,1,1024,0,100,4098,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,81,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,7.633119821548462,1.937984496124031e-06,0.00103150267858763,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.json b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.json new file mode 100644 index 0000000..d19d344 --- /dev/null +++ 
b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.json @@ -0,0 +1,139 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 54120, + "overlapped_compute_time_non_pp_ns": 1160, + "compute_only_time_non_pp_ns": 0, + "memory_only_time_non_pp_ns": 6340, + "ici_bound_time_non_pp_ns": 46620, + "total_execution_time_chip_ns": 54120, + "overlapped_compute_time_chip_ns": 1160, + "compute_only_time_chip_ns": 0, + "memory_only_time_chip_ns": 6340, + "ici_bound_time_chip_ns": 46620, + "bounded_by_pp_chip": false, + "throughput_requests_per_sec": 18477.457501847744, + "latency_ns": 54120, + "mem_footprint_GB": 1535.9999990463257, + "out_of_memory": true, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "6p", + "num_sa": 8, + "num_vu": 8, + "num_vu_ports": 8, + "hbm_bw_GBps": 7400.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 256, + "freq_GHz": 2.0, + "sa_dim": 256, + "hbm_size_GB": 192, + "ici_bw_GBps": 300.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 350.0, + "min_power_W": 1.0, + "avg_power_W": 1.0, + "max_power_W": 331.0, + "HBM_GBps_per_W": 123.5, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 1384.0, + "use_vu_for_small_matmul": false, + "static_power_W_per_sa": 1.777942858, + "static_power_W_per_vu": 0.1554179582, + "static_power_vmem_W": 37.07309859, + "static_power_ici_W": 3.000278571, + "static_power_hbm_mc_W": 7.10422264, + "static_power_hbm_phy_W": 10.65633396, + "static_power_other_W": 41.27610279, + "dynamic_power_W_per_SA": 31.57742933, + "dynamic_power_W_per_VU": 0.7426048, + "dynamic_power_vmem_W": 28.1028608, + "dynamic_power_ici_W_per_GBps": 0.01262060716, + "dynamic_power_hbm_W_per_GBps": 0.008830769231, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": 
false, + "model_name": "dlrm-s", + "model_type": "dlrm", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 3, + "num_tensor_parallel_axes": 3, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv", + "embedding_dim": 64, + "num_indices_per_lookup": [ + 100, + 100, + 100, + 100, + 100, + 100, + 100, + 100 + ], + "embedding_table_sizes": [ + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000 + ], + "num_dense_features": 1, + "bottom_mlp_config": [ + { + "in_features": 512, + "out_features": 256, + "bias": true, + "activation": "relu" + }, + { + "in_features": 256, + "out_features": 64, + "bias": true, + "activation": "relu" + } + ], + "top_mlp_config": [ + { + "in_features": 1024, + "out_features": 1024, + "bias": true, + "activation": "relu" + }, + { + "in_features": 1024, + "out_features": 1024, + "bias": true, + "activation": "relu" + }, + { + "in_features": 1024, + "out_features": 1, + "bias": true, + "activation": "relu" + } + ], + "interaction": "dot", + "num_pods": 1, + "batch_size_per_pod": 1 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_chip0.csv b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_chip0.csv new file mode 100644 index 0000000..7e94d89 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_chip0.csv @@ -0,0 +1,18 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute 
time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor 
+2,BottomMLP_0_einsum,"XlaEinsum(a=1x512,b=512x256,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",BottomMLP0einsumbottommlp0einsum,MXU,1,Memory,500,100,500,0,0,0,0,0,0,0,0,100,4,0,0,263680,"DT_BFLOAT16:[1,512],DT_BFLOAT16:[512,256]","[DT_BFLOAT16:(1,256)]",262144,BottomMLP0einsumbottommlp0einsum,Einsum,262144,[],Einsum,"BI,IO->BO","[[1, 512], [512, 256], [1, 256]]",1,263680,2,1,1,256,512,0,100,263680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,81,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.524288,491.14227294921875,0.00024806201550387597,0.06637057742557011,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,BottomMLP_1_einsum,"XlaEinsum(a=1x256,b=256x64,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",BottomMLP1einsumbottommlp1einsum,MXU,1,Memory,500,100,500,0,0,0,0,0,0,0,0,100,2,0,0,33408,"DT_BFLOAT16:[1,256],DT_BFLOAT16:[256,64]","[DT_BFLOAT16:(1,64)]",32768,BottomMLP1einsumbottommlp1einsum,Einsum,32768,[],Einsum,"BI,IO->BO","[[1, 256], [256, 64], [1, 64]]",1,33408,1,1,1,64,256,0,100,33408,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,81,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,62.22724914550781,3.1007751937984497e-05,0.008409087722365921,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,EmbeddingIndicesAllToAll-6,AllToAll(1x8x100->1x8x100),EmbeddingIndicesAllToAll6EmbeddingIndicesAllToAll,ICINoCompute,1,ICI/NVLink,23310,0,500,23310,3200,3200,0,0,0,0,0,0,0,0,0,6400,"DT_BFLOAT16:[1,8,100]","[DT_BFLOAT16:(1,8,100)]",0,EmbeddingIndicesAllToAll6EmbeddingIndicesAllToAll,AllToAll,0,[],AllToAll,,,,,0,,,,,0,0,6400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.25570418179060755,0.0,3.4554619160892914e-05,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+7,EmbeddingBag7,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_7,VPU,1,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_7,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,1,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0015625,0.006709549878094648,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,EmbeddingBag14,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_14,VPU,1,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_14,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,1,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0015625,0.006709549878094648,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,EmbeddingBag28,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_28,VPU,1,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_28,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,1,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0015625,0.006709549878094648,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +56,EmbeddingBag56,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 
0],type=DT_FLOAT)",embedding_bag_56,VPU,1,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_56,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,1,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0015625,0.006709549878094648,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +112,EmbeddingBag112,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_112,VPU,1,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_112,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,1,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0015625,0.006709549878094648,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +224,EmbeddingBag224,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_224,VPU,1,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_224,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,1,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0015625,0.006709549878094648,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +448,EmbeddingBag448,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 
0],type=DT_FLOAT)",embedding_bag_448,VPU,1,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_448,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,1,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0015625,0.006709549878094648,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +896,EmbeddingBag896,"EmbeddingBag(batch=1,table_sizes=[10000000],num_indices=[100],embedding_dim=64,reduction=sum,memory_placements=[0, 0],type=DT_FLOAT)",embedding_bag_896,VPU,1,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,26656,"DT_INT64:[1,1,100],DT_FLOAT:[10000000,64]","[DT_FLOAT:(1,1,64)]",12800,embedding_bag_896,EmbeddingBag,0,[],EmbeddingBag,,,,,0,,,,,0,1,26656,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0256,49.65066909790039,0.0015625,0.006709549878094648,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +903,EmbeddingAllToAll-903,AllToAll(1x8x100x64->1x8x100x64),EmbeddingAllToAll903EmbeddingAllToAll,ICINoCompute,1,ICI/NVLink,23310,0,500,23310,204800,204800,0,0,0,0,0,0,0,0,0,409600,"DT_BFLOAT16:[1,8,100,64]","[DT_BFLOAT16:(1,8,100,64)]",0,EmbeddingAllToAll903EmbeddingAllToAll,AllToAll,0,[],AllToAll,,,,,0,,,,,0,0,409600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,16.365067634598883,0.0,0.0022114956262971465,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+904,DotInteraction_einsum,"XlaEinsum(a=1x9x64,b=1x64x9,eq=BNE;BEN->BNN,memory_placements=0_0_0,type=DT_BFLOAT16)",DotInteractioneinsumDotInteractioneinsum,MXU,1,Memory,500,176,500,0,0,0,0,0,0,0,0,176,18,0,0,2322,"DT_BFLOAT16:[1,9,64],DT_BFLOAT16:[1,64,9]","[DT_BFLOAT16:(1,9,9)]",1152,DotInteractioneinsumDotInteractioneinsum,Einsum,1152,[],Einsum,"BNE,BEN->BNN","[[1, 9, 64], [1, 64, 9], [1, 9, 9]]",1,258,9,9,1,1,64,0,176,2322,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.002304,4.325062036514282,1.0901162790697674e-06,0.0005844678427722003,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +1808,TopMLP_init_einsum,"XlaEinsum(a=1x128,b=128x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLPiniteinsumtopmlpiniteinsum,MXU,1,Memory,500,100,500,0,0,0,0,0,0,0,0,100,8,0,0,264448,"DT_BFLOAT16:[1,128],DT_BFLOAT16:[128,1024]","[DT_BFLOAT16:(1,1024)]",262144,TopMLPiniteinsumtopmlpiniteinsum,Einsum,262144,[],Einsum,"BI,IO->BO","[[1, 128], [128, 1024], [1, 1024]]",1,264448,4,1,1,1024,128,0,100,264448,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,81,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.524288,492.5727844238281,0.00024806201550387597,0.0665638897870038,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3616,TopMLP_0_einsum,"XlaEinsum(a=1x1024,b=1024x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP0einsumtopmlp0einsum,MXU,1,Memory,500,288,500,0,0,0,0,0,0,0,0,288,32,0,0,2101248,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1024]","[DT_BFLOAT16:(1,1024)]",2097152,TopMLP0einsumtopmlp0einsum,Einsum,2097152,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1024], [1, 
1024]]",1,2101248,16,1,1,1024,1024,0,288,2101248,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.194304,3913.87939453125,0.0019844961240310078,0.5289026208826013,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7232,TopMLP_1_einsum,"XlaEinsum(a=1x1024,b=1024x1024,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP1einsumtopmlp1einsum,MXU,1,Memory,500,288,500,0,0,0,0,0,0,0,0,288,32,0,0,2101248,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1024]","[DT_BFLOAT16:(1,1024)]",2097152,TopMLP1einsumtopmlp1einsum,Einsum,2097152,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1024], [1, 1024]]",1,2101248,16,1,1,1024,1024,0,288,2101248,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.194304,3913.87939453125,0.0019844961240310078,0.5289026208826013,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14464,TopMLP_2_einsum,"XlaEinsum(a=1x1024,b=1024x1,eq=BI;IO->BO,memory_placements=0_0_0,type=DT_BFLOAT16)",TopMLP2einsumtopmlp2einsum,MXU,1,Memory,500,100,500,0,0,0,0,0,0,0,0,100,8,0,0,4098,"DT_BFLOAT16:[1,1024],DT_BFLOAT16:[1024,1]","[DT_BFLOAT16:(1,1)]",2048,TopMLP2einsumtopmlp2einsum,Einsum,2048,[],Einsum,"BI,IO->BO","[[1, 1024], [1024, 1], [1, 1]]",1,4098,4,1,1,1,1024,0,100,4098,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,81,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,7.633119821548462,1.937984496124031e-06,0.00103150267858763,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_chip0.json b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_chip0.json new file mode 100644 index 0000000..6ef2ac7 --- /dev/null +++ 
b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_chip0.json @@ -0,0 +1,137 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 54120, + "overlapped_compute_time_non_pp_ns": 1160, + "compute_only_time_non_pp_ns": 0, + "memory_only_time_non_pp_ns": 6340, + "ici_bound_time_non_pp_ns": 46620, + "total_execution_time_chip_ns": 54120, + "overlapped_compute_time_chip_ns": 1160, + "compute_only_time_chip_ns": 0, + "memory_only_time_chip_ns": 6340, + "ici_bound_time_chip_ns": 46620, + "bounded_by_pp_chip": false, + "throughput_requests_per_sec": 18477.457501847744, + "latency_ns": 54120, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "6p", + "num_sa": 8, + "num_vu": 8, + "num_vu_ports": 8, + "hbm_bw_GBps": 7400.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 256, + "freq_GHz": 2.0, + "sa_dim": 256, + "hbm_size_GB": 192, + "ici_bw_GBps": 300.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 350.0, + "min_power_W": 1.0, + "avg_power_W": 1.0, + "max_power_W": 331.0, + "HBM_GBps_per_W": 123.5, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 1384.0, + "use_vu_for_small_matmul": false, + "static_power_W_per_sa": 1.777942858, + "static_power_W_per_vu": 0.1554179582, + "static_power_vmem_W": 37.07309859, + "static_power_ici_W": 3.000278571, + "static_power_hbm_mc_W": 7.10422264, + "static_power_hbm_phy_W": 10.65633396, + "static_power_other_W": 41.27610279, + "dynamic_power_W_per_SA": 31.57742933, + "dynamic_power_W_per_VU": 0.7426048, + "dynamic_power_vmem_W": 28.1028608, + "dynamic_power_ici_W_per_GBps": 0.01262060716, + "dynamic_power_hbm_W_per_GBps": 0.008830769231, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "dlrm-s", + "model_type": "dlrm", + 
"global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 3, + "num_tensor_parallel_axes": 3, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv", + "embedding_dim": 64, + "num_indices_per_lookup": [ + 100, + 100, + 100, + 100, + 100, + 100, + 100, + 100 + ], + "embedding_table_sizes": [ + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000, + 10000000 + ], + "num_dense_features": 1, + "bottom_mlp_config": [ + { + "in_features": 512, + "out_features": 256, + "bias": true, + "activation": "relu" + }, + { + "in_features": 256, + "out_features": 64, + "bias": true, + "activation": "relu" + } + ], + "top_mlp_config": [ + { + "in_features": 1024, + "out_features": 1024, + "bias": true, + "activation": "relu" + }, + { + "in_features": 1024, + "out_features": 1024, + "bias": true, + "activation": "relu" + }, + { + "in_features": 1024, + "out_features": 1, + "bias": true, + "activation": "relu" + } + ], + "interaction": "dot", + "num_pods": 1, + "batch_size_per_pod": 1 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_config_stats.json b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_config_stats.json new file mode 100644 index 0000000..af80271 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/dlrm-s/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_config_stats.json @@ -0,0 +1,102 @@ +{ + "ICI_bisection_BW_per_chip_GBps": -300.0, + "topology": [], + "table_sharding": [ + { + "embed_dim": 64, + 
"num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + }, + { + "embed_dim": 64, + "num_entries": 10000000, + "num_indices_per_lookup": 100, + "row_sharding_degree": 1, + "column_sharding_degree": 1, + "data_parallel_degree": 1, + "chip_assignment": [ + 0 + ], + "batch_size": 1 + } + ] +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv 
b/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv new file mode 100644 index 0000000..9e39ff8 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv @@ -0,0 +1,635 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity 
Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,Time-Embed-MLP-FFi2,"XlaEinsum(a=1x320,b=320x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPFFi2einsum,MXU,1,Memory,1277,285,1277,0,0,0,0,0,0,0,0,0,285,0,0,822400,"DT_BFLOAT16:[1,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,1280)]",819200,TimeEmbedMLPFFi2einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 320], [320, 1280], [1, 1280]]",1,822400,30,1,1,1280,320,0,285,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,133,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6415035238841034,599.7804897132103,0.013161098425274574,0.9996341495220171,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,Time-Embed-MLP-FFo2,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPFFo2einsum,MXU,1,Memory,5095,1142,5095,0,0,0,0,0,0,0,0,0,1142,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPFFo2einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,1142,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,533,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6431403336604514,599.9070037452466,0.013194679245594216,0.9998450062420776,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,Conv2d5Conv2d,"Conv2D(a=1x3x64x64,b=3x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d5Conv2dconv2d,MXU,1,Compute,8960,8960,4134,0,0,0,0,0,0,0,0,8960,548,0,0,2663296,"DT_BFLOAT16:[1,3,64,64],DT_BFLOAT16:[3,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",70778880,Conv2d5Conv2dconv2d,Conv2D,17280,[],Conv2D,bf01;io01->bf01,"[[1, 3, 64, 64], [3, 320, 3, 3], [1, 320, 64, 
64]]",1,2664856,96,1,320,4096,27,0,8960,2663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1552,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.8994285714285715,276.82898300034657,0.16206482593037216,0.46138163833391094,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,Time-Embed-MLP-Einsum6,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum6einsum,MXU,1,Memory,1277,285,1277,0,0,0,0,0,0,0,0,0,285,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum6einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 320]]",1,658048,30,1,1,320,1280,0,285,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,133,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6415035238841034,599.7804897132103,0.013161098425274574,0.9996341495220171,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Conv2d-GroupNorm7,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm7XnormGroupNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm7XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Conv2d7Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d7Conv2dconv2d,MXU,1,Compute,202058,202058,11000,0,0,0,0,0,0,0,0,202058,12617,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d7Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,2208,1,320,4096,2880,0,202058,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,26408,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,37.36425778736798,32.66104915188337,0.7665658192327006,0.05443508191980562,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,Conv2d-GroupNorm9,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm9XnormGroupNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm9XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,Conv2d9Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d9Conv2dconv2d,MXU,1,Compute,202058,202058,11000,0,0,0,0,0,0,0,0,202058,12617,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d9Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 
64]]",1,7252480,2208,1,320,4096,2880,0,202058,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,26408,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,37.36425778736798,32.66104915188337,0.7665658192327006,0.05443508191980562,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +11,SpatialTransformer-Input_GroupNorm11,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm11XnormGroupNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm11XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +12,SpatialTransformer-Proj_in12,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin12einsum,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin12einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+13,BasicTransformerBlock-Input_layernorm13,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm13XnormLayerNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm13XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,SelfAttention14-Q-14,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention14Q14MatMulQ,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention14Q14MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,SelfAttention14-K-14,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention14K14MatMulK,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention14K14MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,SelfAttention14-V-14,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention14V14MatMulV,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention14V14MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,SelfAttention14-FlashAttention-15,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention14FlashAttention15FlashAttention,MXU,1,Compute,1498332,1498332,16277,0,0,0,0,0,0,0,0,1498332,280867,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention14FlashAttention15FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,2762752,16384,8,4096,4096,40,187245,1498332,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,188994,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.224811462346129,6.517664309378696,0.06616029293481915,0.010862773848964493,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+16,SelfAttention14-Attention_output-16,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention14Attentionoutput16MatMulattnOutputattnAvgWo,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention14Attentionoutput16MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,SelfAttention14-Attention_layernorm-17,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention14Attentionlayernorm17YnormLayerNormy,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention14Attentionlayernorm17YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,GatedSelfAttention-Linear18,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear18XLinearcontext,MXU,1,Memory,790,171,790,0,0,0,0,0,0,0,0,0,171,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear18XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 
320]]",1,508928,18,1,8,320,768,0,171,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,82,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.977417721518988,599.9697914606409,0.10211679608552283,0.9999496524344015,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,GatedSelfAttention-Attn18-Q-19,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18Q19MatMulQ,MXU,1,Compute,27338,27338,8472,0,0,0,0,0,0,0,0,27338,1697,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn18Q19MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,27338,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,30.744721632891945,185.93474674245783,0.6307592903281731,0.30989124457076306,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,GatedSelfAttention-Attn18-K-19,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18K19MatMulK,MXU,1,Compute,27338,27338,8472,0,0,0,0,0,0,0,0,27338,1697,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn18K19MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,27338,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,30.744721632891945,185.93474674245783,0.6307592903281731,0.30989124457076306,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+19,GatedSelfAttention-Attn18-V-19,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18V19MatMulV,MXU,1,Compute,27338,27338,8472,0,0,0,0,0,0,0,0,27338,1697,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn18V19MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,27338,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,30.744721632891945,185.93474674245783,0.6307592903281731,0.30989124457076306,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,GatedSelfAttention-Attn18-FlashAttention-20,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn18FlashAttention20FlashAttention,MXU,1,Compute,1593418,1593418,16308,0,0,0,0,0,0,0,0,1593418,287542,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn18FlashAttention20FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 128]",,2768128,17424,8,4104,4104,40,187978,1593418,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,200883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.044230081497761,6.140697849734423,0.0624554819109802,0.010234496416224037,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+21,GatedSelfAttention-Attn18-Attention_output-21,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18Attentionoutput21MatMulattnOutputattnAvgWo,MXU,1,Compute,27338,27338,8472,0,0,0,0,0,0,0,0,27338,1697,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn18Attentionoutput21MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,297,1,4104,320,320,0,27338,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,30.744721632891945,185.93474674245783,0.6307592903281731,0.30989124457076306,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,GatedSelfAttention-Attn18-Attention_layernorm-22,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18Attentionlayernorm22YnormLayerNormy,VPU,1,Memory,8154,3665,8154,0,0,0,0,0,0,0,0,0,3665,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn18Attentionlayernorm22YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3665,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1769,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.288476821192053,599.9937752224752,0.44938505203405865,0.999989625370792,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,GatedSelfAttention-FFN18Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN18FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,87955,87955,21617,0,0,0,0,0,0,0,0,87955,5485,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN18FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 
4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,87955,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.149544653516,147.4614371340458,0.7826767794264543,0.24576906189007636,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,GatedSelfAttention-FFN18Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN18FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,87955,87955,21617,0,0,0,0,0,0,0,0,87955,5485,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN18FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,87955,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.149544653516,147.4614371340458,0.7826767794264543,0.24576906189007636,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +25,GatedSelfAttention-FFN18Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN18FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,8139,458,8139,0,0,0,0,0,0,0,0,0,458,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN18FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,458,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,966,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16104189703894828,599.9278166850964,0.05616695627753498,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+26,BasicTransformerBlock-Fuser_output_layernorm26,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm26XnormLayerNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm26XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,CrossAttention27-Q-27,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention27Q27MatMulQ,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention27Q27MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,CrossAttention27-K-27,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention27K27MatMulK,MXU,1,Compute,6766,6766,2493,0,0,0,0,0,0,0,0,6766,411,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention27K27MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 
40]]",1,1605632,72,1,512,320,768,0,6766,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1106,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,37.194537392846584,221.01113333210168,0.7630838324097006,0.3683518888868361,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,CrossAttention27-V-27,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention27V27MatMulV,MXU,1,Compute,6766,6766,2493,0,0,0,0,0,0,0,0,6766,411,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention27V27MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,6766,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1106,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,37.194537392846584,221.01113333210168,0.7630838324097006,0.3683518888868361,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,CrossAttention27-FlashAttention-28,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention27FlashAttention28FlashAttention,MXU,1,Compute,187612,187612,9156,0,0,0,0,0,0,0,0,187612,35107,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention27FlashAttention28FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,354304,2048,8,512,4096,40,23405,187612,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,24409,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.2193024753214083,29.279385447092938,0.06604727045285846,0.0487989757451549,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+29,CrossAttention27-Attention_output-29,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention27Attentionoutput29MatMulattnOutputattnAvgWo,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention27Attentionoutput29MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +30,CrossAttention27-Attention_layernorm-30,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention27Attentionlayernorm30YnormLayerNormy,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention27Attentionlayernorm30YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+31,BasicTransformerBlock-Attn_output_layernorm31,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm31XnormLayerNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm31XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +32,BasicTransformerBlock-FFN32Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN32FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,87955,87955,21617,0,0,0,0,0,0,0,0,87955,5485,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN32FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,87955,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.149544653516,147.4614371340458,0.7826767794264543,0.24576906189007636,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +33,BasicTransformerBlock-FFN32Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN32FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,87955,87955,21617,0,0,0,0,0,0,0,0,87955,5485,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN32FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 
1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,87955,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.149544653516,147.4614371340458,0.7826767794264543,0.24576906189007636,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +34,BasicTransformerBlock-FFN32Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN32FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,8139,458,8139,0,0,0,0,0,0,0,0,0,458,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN32FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,458,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,966,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16104189703894828,599.9278166850964,0.05616695627753498,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +35,SpatialTransformer-Proj_out35,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout35einsum,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout35einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+36,Time-Embed-MLP-Einsum36,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum36einsum,MXU,1,Memory,1277,285,1277,0,0,0,0,0,0,0,0,0,285,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum36einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 320]]",1,658048,30,1,1,320,1280,0,285,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,133,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6415035238841034,599.7804897132103,0.013161098425274574,0.9996341495220171,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +37,Conv2d-GroupNorm37,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm37XnormGroupNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm37XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +38,Conv2d37Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d37Conv2dconv2d,MXU,1,Compute,202058,202058,11000,0,0,0,0,0,0,0,0,202058,12617,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d37Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 
64]]",1,7252480,2208,1,320,4096,2880,0,202058,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,26408,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,37.36425778736798,32.66104915188337,0.7665658192327006,0.05443508191980562,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +39,Conv2d-GroupNorm39,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm39XnormGroupNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm39XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +40,Conv2d39Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d39Conv2dconv2d,MXU,1,Compute,202058,202058,11000,0,0,0,0,0,0,0,0,202058,12617,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d39Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,2208,1,320,4096,2880,0,202058,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,26408,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,37.36425778736798,32.66104915188337,0.7665658192327006,0.05443508191980562,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+41,SpatialTransformer-Input_GroupNorm41,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm41XnormGroupNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm41XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +42,SpatialTransformer-Proj_in42,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin42einsum,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin42einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+43,BasicTransformerBlock-Input_layernorm43,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm43XnormLayerNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm43XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +44,SelfAttention44-Q-44,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention44Q44MatMulQ,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention44Q44MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +44,SelfAttention44-K-44,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention44K44MatMulK,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention44K44MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +44,SelfAttention44-V-44,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention44V44MatMulV,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention44V44MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +45,SelfAttention44-FlashAttention-45,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention44FlashAttention45FlashAttention,MXU,1,Compute,1498332,1498332,16277,0,0,0,0,0,0,0,0,1498332,280867,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention44FlashAttention45FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,2762752,16384,8,4096,4096,40,187245,1498332,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,188994,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.224811462346129,6.517664309378696,0.06616029293481915,0.010862773848964493,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+46,SelfAttention44-Attention_output-46,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention44Attentionoutput46MatMulattnOutputattnAvgWo,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention44Attentionoutput46MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +47,SelfAttention44-Attention_layernorm-47,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention44Attentionlayernorm47YnormLayerNormy,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention44Attentionlayernorm47YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +48,GatedSelfAttention-Linear48,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear48XLinearcontext,MXU,1,Memory,790,171,790,0,0,0,0,0,0,0,0,0,171,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear48XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 
320]]",1,508928,18,1,8,320,768,0,171,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,82,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.977417721518988,599.9697914606409,0.10211679608552283,0.9999496524344015,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +49,GatedSelfAttention-Attn48-Q-49,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48Q49MatMulQ,MXU,1,Compute,27338,27338,8472,0,0,0,0,0,0,0,0,27338,1697,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn48Q49MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,27338,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,30.744721632891945,185.93474674245783,0.6307592903281731,0.30989124457076306,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +49,GatedSelfAttention-Attn48-K-49,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48K49MatMulK,MXU,1,Compute,27338,27338,8472,0,0,0,0,0,0,0,0,27338,1697,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn48K49MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,27338,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,30.744721632891945,185.93474674245783,0.6307592903281731,0.30989124457076306,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+49,GatedSelfAttention-Attn48-V-49,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48V49MatMulV,MXU,1,Compute,27338,27338,8472,0,0,0,0,0,0,0,0,27338,1697,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn48V49MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,27338,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,30.744721632891945,185.93474674245783,0.6307592903281731,0.30989124457076306,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +50,GatedSelfAttention-Attn48-FlashAttention-50,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn48FlashAttention50FlashAttention,MXU,1,Compute,1593418,1593418,16308,0,0,0,0,0,0,0,0,1593418,287542,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn48FlashAttention50FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 128]",,2768128,17424,8,4104,4104,40,187978,1593418,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,200883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.044230081497761,6.140697849734423,0.0624554819109802,0.010234496416224037,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+51,GatedSelfAttention-Attn48-Attention_output-51,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48Attentionoutput51MatMulattnOutputattnAvgWo,MXU,1,Compute,27338,27338,8472,0,0,0,0,0,0,0,0,27338,1697,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn48Attentionoutput51MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,297,1,4104,320,320,0,27338,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,30.744721632891945,185.93474674245783,0.6307592903281731,0.30989124457076306,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +52,GatedSelfAttention-Attn48-Attention_layernorm-52,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48Attentionlayernorm52YnormLayerNormy,VPU,1,Memory,8154,3665,8154,0,0,0,0,0,0,0,0,0,3665,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn48Attentionlayernorm52YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3665,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1769,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.288476821192053,599.9937752224752,0.44938505203405865,0.999989625370792,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +53,GatedSelfAttention-FFN48Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN48FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,87955,87955,21617,0,0,0,0,0,0,0,0,87955,5485,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN48FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 
4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,87955,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.149544653516,147.4614371340458,0.7826767794264543,0.24576906189007636,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +54,GatedSelfAttention-FFN48Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN48FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,87955,87955,21617,0,0,0,0,0,0,0,0,87955,5485,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN48FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,87955,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.149544653516,147.4614371340458,0.7826767794264543,0.24576906189007636,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +55,GatedSelfAttention-FFN48Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN48FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,8139,458,8139,0,0,0,0,0,0,0,0,0,458,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN48FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,458,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,966,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16104189703894828,599.9278166850964,0.05616695627753498,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+56,BasicTransformerBlock-Fuser_output_layernorm56,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm56XnormLayerNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm56XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +57,CrossAttention57-Q-57,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention57Q57MatMulQ,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention57Q57MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +57,CrossAttention57-K-57,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention57K57MatMulK,MXU,1,Compute,6766,6766,2493,0,0,0,0,0,0,0,0,6766,411,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention57K57MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 
40]]",1,1605632,72,1,512,320,768,0,6766,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1106,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,37.194537392846584,221.01113333210168,0.7630838324097006,0.3683518888868361,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +57,CrossAttention57-V-57,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention57V57MatMulV,MXU,1,Compute,6766,6766,2493,0,0,0,0,0,0,0,0,6766,411,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention57V57MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,6766,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1106,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,37.194537392846584,221.01113333210168,0.7630838324097006,0.3683518888868361,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +58,CrossAttention57-FlashAttention-58,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention57FlashAttention58FlashAttention,MXU,1,Compute,187612,187612,9156,0,0,0,0,0,0,0,0,187612,35107,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention57FlashAttention58FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,354304,2048,8,512,4096,40,23405,187612,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,24409,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.2193024753214083,29.279385447092938,0.06604727045285846,0.0487989757451549,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+59,CrossAttention57-Attention_output-59,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention57Attentionoutput59MatMulattnOutputattnAvgWo,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention57Attentionoutput59MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +60,CrossAttention57-Attention_layernorm-60,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention57Attentionlayernorm60YnormLayerNormy,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention57Attentionlayernorm60YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+61,BasicTransformerBlock-Attn_output_layernorm61,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm61XnormLayerNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm61XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +62,BasicTransformerBlock-FFN62Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN62FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,87955,87955,21617,0,0,0,0,0,0,0,0,87955,5485,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN62FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,87955,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.149544653516,147.4614371340458,0.7826767794264543,0.24576906189007636,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +63,BasicTransformerBlock-FFN62Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN62FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,87955,87955,21617,0,0,0,0,0,0,0,0,87955,5485,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN62FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 
1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,87955,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.149544653516,147.4614371340458,0.7826767794264543,0.24576906189007636,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +64,BasicTransformerBlock-FFN62Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN62FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,8139,458,8139,0,0,0,0,0,0,0,0,0,458,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN62FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,458,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,966,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16104189703894828,599.9278166850964,0.05616695627753498,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +65,SpatialTransformer-Proj_out65,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout65einsum,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout65einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +66,Downsample-Conv2d66Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=2x2 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",DownsampleConv2d66Conv2dconv2d,MXU,1,Compute,50652,50652,7948,0,0,0,0,0,0,0,0,50652,3154,0,0,5120000,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,32,32)]",1887436800,DownsampleConv2d66Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 32, 32]]",1,5286400,552,1,320,1024,2880,0,50652,5120000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7163,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,37.26282871357498,94.13984802241274,0.7644848984369869,0.15689974670402124,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +67,Time-Embed-MLP-Einsum67,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum67einsum,MXU,1,Memory,2550,571,2550,0,0,0,0,0,0,0,0,0,571,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum67einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1314048,50,1,1,640,1280,0,571,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,266,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6425098039215686,599.7863470339307,0.013181743285549516,0.9996439117232179,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +68,Conv2d-GroupNorm68,"GroupNorm(x=1x320x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm68XnormGroupNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,320,32,32]","[DT_BFLOAT16:(1,320,32,32)]",2621440,Conv2dGroupNorm68XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+69,Conv2d68Conv2d,"Conv2D(a=1x320x32x32,b=320x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d68Conv2dconv2d,MXU,1,Compute,84298,84298,8774,0,0,0,0,0,0,0,0,84298,5257,0,0,5652480,"DT_BFLOAT16:[1,320,32,32],DT_BFLOAT16:[320,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",3774873600,Conv2d68Conv2dconv2d,Conv2D,3686400,[],Conv2D,bf01;io01->bf01,"[[1, 320, 32, 32], [320, 640, 3, 3], [1, 640, 32, 32]]",1,5736960,920,1,640,1024,2880,0,84298,5652480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11455,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,44.78010866212722,62.44848307863176,0.9187095559949289,0.10408080513105293,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +70,Conv2d-GroupNorm70,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm70XnormGroupNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm70XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +71,Conv2d70Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d70Conv2dconv2d,MXU,1,Compute,164755,164755,15514,0,0,0,0,0,0,0,0,164755,10285,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d70Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 
32]]",1,10163200,1800,1,640,1024,5760,0,164755,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,22217,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.82408546022883,56.495167540438835,0.9401278037238386,0.09415861256739806,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +72,SkipConnection-Einsum67,"XlaEinsum(a=1x32x32x320,b=320x640,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum67einsum,MXU,1,Compute,11155,11155,3688,0,0,0,0,0,0,0,0,11155,685,0,0,2375680,"DT_BFLOAT16:[1,32,32,320],DT_BFLOAT16:[320,640]","[DT_BFLOAT16:(1,32,32,640)]",419430400,SkipConnectionEinsum67einsum,Einsum,409600,[],Einsum,"BHWC,CO->BHWO","[[1, 32, 32, 320], [320, 640], [1, 32, 32, 640]]",1,2375680,120,1,1024,640,320,0,11155,2375680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1780,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,37.600215150156885,198.34373949462125,0.7714067249490564,0.3305728991577021,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +73,SpatialTransformer-Input_GroupNorm73,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm73XnormGroupNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm73XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+74,SpatialTransformer-Proj_in74,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin74einsum,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin74einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +75,BasicTransformerBlock-Input_layernorm75,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm75XnormLayerNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm75XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +76,SelfAttention76-Q-76,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention76Q76MatMulQ,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention76Q76MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +76,SelfAttention76-K-76,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention76K76MatMulK,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention76K76MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +76,SelfAttention76-V-76,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention76V76MatMulV,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention76V76MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +77,SelfAttention76-FlashAttention-77,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",SelfAttention76FlashAttention77FlashAttention,MXU,1,Compute,93990,93990,8139,0,0,0,0,0,0,0,0,93990,17552,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention76FlashAttention77FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 128]",,872448,1024,8,1024,1024,80,11702,93990,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,12600,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.213000191509735,51.95034046175125,0.06591797267901735,0.08658390076958543,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +78,SelfAttention76-Attention_output-78,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention76Attentionoutput78MatMulattnOutputattnAvgWo,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention76Attentionoutput78MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+79,SelfAttention76-Attention_layernorm-79,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention76Attentionlayernorm79YnormLayerNormy,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention76Attentionlayernorm79YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +80,GatedSelfAttention-Linear80,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear80XLinearcontext,MXU,1,Memory,1561,342,1561,0,0,0,0,0,0,0,0,0,342,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear80XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 640]]",1,1005568,30,1,8,640,768,0,342,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,163,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.03800128122998,599.9411779057896,0.10335972954204102,0.999901963176316,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +81,GatedSelfAttention-Attn80-Q-81,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80Q81MatMulQ,MXU,1,Compute,20755,20755,5373,0,0,0,0,0,0,0,0,20755,1285,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn80Q81MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 
80]]",1,3461120,225,1,1032,640,640,0,20755,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,40.73304745844375,155.30807947256685,0.835679971820094,0.2588467991209447,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +81,GatedSelfAttention-Attn80-K-81,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80K81MatMulK,MXU,1,Compute,20755,20755,5373,0,0,0,0,0,0,0,0,20755,1285,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn80K81MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,20755,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,40.73304745844375,155.30807947256685,0.835679971820094,0.2588467991209447,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +81,GatedSelfAttention-Attn80-V-81,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80V81MatMulV,MXU,1,Compute,20755,20755,5373,0,0,0,0,0,0,0,0,20755,1285,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn80V81MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,20755,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,40.73304745844375,155.30807947256685,0.835679971820094,0.2588467991209447,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +82,GatedSelfAttention-Attn80-FlashAttention-82,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",GatedSelfAttentionAttn80FlashAttention82FlashAttention,MXU,1,Compute,118858,118858,8202,0,0,0,0,0,0,0,0,118858,19291,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn80FlashAttention82FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 128]",,879104,1296,8,1032,1032,80,11887,118858,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,15715,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.580616466708173,41.402004683372176,0.05294397622415337,0.06900334113895362,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +83,GatedSelfAttention-Attn80-Attention_output-83,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80Attentionoutput83MatMulattnOutputattnAvgWo,MXU,1,Compute,20755,20755,5373,0,0,0,0,0,0,0,0,20755,1285,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn80Attentionoutput83MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,225,1,1032,640,640,0,20755,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,40.73304745844375,155.30807947256685,0.835679971820094,0.2588467991209447,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+84,GatedSelfAttention-Attn80-Attention_layernorm-84,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80Attentionlayernorm84YnormLayerNormy,VPU,1,Memory,4101,1843,4101,0,0,0,0,0,0,0,0,0,1843,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn80Attentionlayernorm84YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1843,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2884272128749084,599.9706745496525,0.44936775002612594,0.9999511242494208,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +85,GatedSelfAttention-FFN80Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN80FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,73326,73326,15259,0,0,0,0,0,0,0,0,73326,4571,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN80FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,73326,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,124.85712349644055,0.9388257389528106,0.20809520582740093,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +86,GatedSelfAttention-FFN80Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN80FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,73326,73326,15259,0,0,0,0,0,0,0,0,73326,4571,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN80FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], 
[2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,73326,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,124.85712349644055,0.9388257389528106,0.20809520582740093,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +87,GatedSelfAttention-FFN80Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN80FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,4070,229,4070,0,0,0,0,0,0,0,0,0,229,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN80FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,229,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,483,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16102211302211303,599.8541154791155,0.05616005616005616,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +88,BasicTransformerBlock-Fuser_output_layernorm88,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm88XnormLayerNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm88XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+89,CrossAttention89-Q-89,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention89Q89MatMulQ,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention89Q89MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +89,CrossAttention89-K-89,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention89K89MatMulK,MXU,1,Compute,11155,11155,3764,0,0,0,0,0,0,0,0,11155,685,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention89K89MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,11155,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1788,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.12025818018826,202.44740307037202,0.9256880699388677,0.33741233845062,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +89,CrossAttention89-V-89,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention89V89MatMulV,MXU,1,Compute,11155,11155,3764,0,0,0,0,0,0,0,0,11155,685,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention89V89MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 
80]]",1,2424832,120,1,512,640,768,0,11155,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1788,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.12025818018826,202.44740307037202,0.9256880699388677,0.33741233845062,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +90,CrossAttention89-FlashAttention-90,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention89FlashAttention90FlashAttention,MXU,1,Compute,47178,47178,6104,0,0,0,0,0,0,0,0,47178,8775,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention89FlashAttention90FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,446464,512,8,512,1024,80,5851,47178,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6535,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.20053719954216,77.62324335495359,0.06566228170016578,0.12937207225825598,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +91,CrossAttention89-Attention_output-91,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention89Attentionoutput91MatMulattnOutputattnAvgWo,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention89Attentionoutput91MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+92,CrossAttention89-Attention_layernorm-92,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention89Attentionlayernorm92YnormLayerNormy,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention89Attentionlayernorm92YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +93,BasicTransformerBlock-Attn_output_layernorm93,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm93XnormLayerNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm93XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +94,BasicTransformerBlock-FFN94Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN94FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,73326,73326,15259,0,0,0,0,0,0,0,0,73326,4571,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN94FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 
2560]]",1,9830400,800,1,1024,2560,640,0,73326,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,124.85712349644055,0.9388257389528106,0.20809520582740093,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +95,BasicTransformerBlock-FFN94Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN94FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,73326,73326,15259,0,0,0,0,0,0,0,0,73326,4571,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN94FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,73326,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,124.85712349644055,0.9388257389528106,0.20809520582740093,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +96,BasicTransformerBlock-FFN94Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN94FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,4070,229,4070,0,0,0,0,0,0,0,0,0,229,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN94FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,229,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,483,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16102211302211303,599.8541154791155,0.05616005616005616,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+97,SpatialTransformer-Proj_out97,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout97einsum,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout97einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +98,Time-Embed-MLP-Einsum98,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum98einsum,MXU,1,Memory,2550,571,2550,0,0,0,0,0,0,0,0,0,571,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum98einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1314048,50,1,1,640,1280,0,571,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,266,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6425098039215686,599.7863470339307,0.013181743285549516,0.9996439117232179,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+99,Conv2d-GroupNorm99,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm99XnormGroupNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm99XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +100,Conv2d99Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d99Conv2dconv2d,MXU,1,Compute,164755,164755,15514,0,0,0,0,0,0,0,0,164755,10285,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d99Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,1800,1,640,1024,5760,0,164755,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,22217,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.82408546022883,56.495167540438835,0.9401278037238386,0.09415861256739806,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +101,Conv2d-GroupNorm101,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm101XnormGroupNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm101XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+102,Conv2d101Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d101Conv2dconv2d,MXU,1,Compute,164755,164755,15514,0,0,0,0,0,0,0,0,164755,10285,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d101Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,1800,1,640,1024,5760,0,164755,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,22217,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.82408546022883,56.495167540438835,0.9401278037238386,0.09415861256739806,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +103,SpatialTransformer-Input_GroupNorm103,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm103XnormGroupNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm103XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +104,SpatialTransformer-Proj_in104,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin104einsum,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin104einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 
640]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +105,BasicTransformerBlock-Input_layernorm105,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm105XnormLayerNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm105XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +106,SelfAttention106-Q-106,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention106Q106MatMulQ,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention106Q106MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+106,SelfAttention106-K-106,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention106K106MatMulK,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention106K106MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +106,SelfAttention106-V-106,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention106V106MatMulV,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention106V106MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +107,SelfAttention106-FlashAttention-107,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention106FlashAttention107FlashAttention,MXU,1,Compute,93990,93990,8139,0,0,0,0,0,0,0,0,93990,17552,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention106FlashAttention107FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 
128]",,872448,1024,8,1024,1024,80,11702,93990,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,12600,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.213000191509735,51.95034046175125,0.06591797267901735,0.08658390076958543,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +108,SelfAttention106-Attention_output-108,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention106Attentionoutput108MatMulattnOutputattnAvgWo,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention106Attentionoutput108MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +109,SelfAttention106-Attention_layernorm-109,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention106Attentionlayernorm109YnormLayerNormy,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention106Attentionlayernorm109YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+110,GatedSelfAttention-Linear110,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear110XLinearcontext,MXU,1,Memory,1561,342,1561,0,0,0,0,0,0,0,0,0,342,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear110XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 640]]",1,1005568,30,1,8,640,768,0,342,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,163,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.03800128122998,599.9411779057896,0.10335972954204102,0.999901963176316,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +111,GatedSelfAttention-Attn110-Q-111,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110Q111MatMulQ,MXU,1,Compute,20755,20755,5373,0,0,0,0,0,0,0,0,20755,1285,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn110Q111MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,20755,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,40.73304745844375,155.30807947256685,0.835679971820094,0.2588467991209447,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +111,GatedSelfAttention-Attn110-K-111,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110K111MatMulK,MXU,1,Compute,20755,20755,5373,0,0,0,0,0,0,0,0,20755,1285,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn110K111MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 
80]]",1,3461120,225,1,1032,640,640,0,20755,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,40.73304745844375,155.30807947256685,0.835679971820094,0.2588467991209447,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +111,GatedSelfAttention-Attn110-V-111,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110V111MatMulV,MXU,1,Compute,20755,20755,5373,0,0,0,0,0,0,0,0,20755,1285,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn110V111MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,20755,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,40.73304745844375,155.30807947256685,0.835679971820094,0.2588467991209447,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +112,GatedSelfAttention-Attn110-FlashAttention-112,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn110FlashAttention112FlashAttention,MXU,1,Compute,118858,118858,8202,0,0,0,0,0,0,0,0,118858,19291,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn110FlashAttention112FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 128]",,879104,1296,8,1032,1032,80,11887,118858,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,15715,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.580616466708173,41.402004683372176,0.05294397622415337,0.06900334113895362,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+113,GatedSelfAttention-Attn110-Attention_output-113,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110Attentionoutput113MatMulattnOutputattnAvgWo,MXU,1,Compute,20755,20755,5373,0,0,0,0,0,0,0,0,20755,1285,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn110Attentionoutput113MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,225,1,1032,640,640,0,20755,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,40.73304745844375,155.30807947256685,0.835679971820094,0.2588467991209447,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +114,GatedSelfAttention-Attn110-Attention_layernorm-114,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110Attentionlayernorm114YnormLayerNormy,VPU,1,Memory,4101,1843,4101,0,0,0,0,0,0,0,0,0,1843,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn110Attentionlayernorm114YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1843,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2884272128749084,599.9706745496525,0.44936775002612594,0.9999511242494208,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+115,GatedSelfAttention-FFN110Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN110FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,73326,73326,15259,0,0,0,0,0,0,0,0,73326,4571,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN110FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,73326,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,124.85712349644055,0.9388257389528106,0.20809520582740093,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +116,GatedSelfAttention-FFN110Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN110FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,73326,73326,15259,0,0,0,0,0,0,0,0,73326,4571,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN110FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,73326,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,124.85712349644055,0.9388257389528106,0.20809520582740093,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+117,GatedSelfAttention-FFN110Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN110FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,4070,229,4070,0,0,0,0,0,0,0,0,0,229,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN110FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,229,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,483,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16102211302211303,599.8541154791155,0.05616005616005616,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +118,BasicTransformerBlock-Fuser_output_layernorm118,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm118XnormLayerNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm118XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +119,CrossAttention119-Q-119,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention119Q119MatMulQ,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention119Q119MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +119,CrossAttention119-K-119,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention119K119MatMulK,MXU,1,Compute,11155,11155,3764,0,0,0,0,0,0,0,0,11155,685,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention119K119MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,11155,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1788,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.12025818018826,202.44740307037202,0.9256880699388677,0.33741233845062,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +119,CrossAttention119-V-119,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention119V119MatMulV,MXU,1,Compute,11155,11155,3764,0,0,0,0,0,0,0,0,11155,685,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention119V119MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,11155,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1788,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.12025818018826,202.44740307037202,0.9256880699388677,0.33741233845062,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +120,CrossAttention119-FlashAttention-120,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention119FlashAttention120FlashAttention,MXU,1,Compute,47178,47178,6104,0,0,0,0,0,0,0,0,47178,8775,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention119FlashAttention120FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,446464,512,8,512,1024,80,5851,47178,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6535,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.20053719954216,77.62324335495359,0.06566228170016578,0.12937207225825598,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +121,CrossAttention119-Attention_output-121,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention119Attentionoutput121MatMulattnOutputattnAvgWo,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention119Attentionoutput121MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+122,CrossAttention119-Attention_layernorm-122,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention119Attentionlayernorm122YnormLayerNormy,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention119Attentionlayernorm122YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +123,BasicTransformerBlock-Attn_output_layernorm123,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm123XnormLayerNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm123XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +124,BasicTransformerBlock-FFN124Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN124FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,73326,73326,15259,0,0,0,0,0,0,0,0,73326,4571,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN124FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 
2560]]",1,9830400,800,1,1024,2560,640,0,73326,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,124.85712349644055,0.9388257389528106,0.20809520582740093,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +125,BasicTransformerBlock-FFN124Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN124FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,73326,73326,15259,0,0,0,0,0,0,0,0,73326,4571,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN124FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,73326,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,124.85712349644055,0.9388257389528106,0.20809520582740093,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +126,BasicTransformerBlock-FFN124Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN124FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,4070,229,4070,0,0,0,0,0,0,0,0,0,229,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN124FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,229,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,483,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16102211302211303,599.8541154791155,0.05616005616005616,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+127,SpatialTransformer-Proj_out127,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout127einsum,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout127einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +128,Downsample-Conv2d128Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=2x2 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",DownsampleConv2d128Conv2dconv2d,MXU,1,Compute,41326,41326,13988,0,0,0,0,0,0,0,0,41326,2571,0,0,9011200,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,16,16)]",1887436800,DownsampleConv2d128Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 16, 16]]",1,9180160,450,1,640,256,5760,0,41326,9011200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6629,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.67189662682089,203.07636800984852,0.9370054947401214,0.33846061334974753,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +129,Time-Embed-MLP-Einsum129,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum129einsum,MXU,1,Memory,5095,1142,5095,0,0,0,0,0,0,0,0,0,1142,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum129einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 
1280]]",1,2626048,100,1,1,1280,1280,0,1142,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,533,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6431403336604514,599.9070037452466,0.013194679245594216,0.9998450062420776,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +130,Conv2d-GroupNorm130,"GroupNorm(x=1x640x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm130XnormGroupNormX,VPU,1,Memory,1018,458,1018,0,0,0,0,0,0,0,0,0,458,0,0,655360,"DT_BFLOAT16:[1,640,16,16]","[DT_BFLOAT16:(1,640,16,16)]",1310720,Conv2dGroupNorm130XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,458,655360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,221,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2875442043222005,599.5594916502947,0.44905978108335676,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +131,Conv2d130Conv2d,"Conv2D(a=1x640x16x16,b=640x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d130Conv2dconv2d,MXU,1,Compute,82469,82469,24415,0,0,0,0,0,0,0,0,82469,5142,0,0,15728640,"DT_BFLOAT16:[1,640,16,16],DT_BFLOAT16:[640,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",3774873600,Conv2d130Conv2dconv2d,Conv2D,14745600,[],Conv2D,bf01;io01->bf01,"[[1, 640, 16, 16], [640, 1280, 3, 3], [1, 1280, 16, 16]]",1,15815680,900,1,1280,256,5760,0,82469,15728640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,12863,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.77324327929283,177.62356158071518,0.9390847245784539,0.29603926930119195,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+132,Conv2d-GroupNorm132,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm132XnormGroupNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm132XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +133,Conv2d132Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d132Conv2dconv2d,MXU,1,Compute,164755,164755,47811,0,0,0,0,0,0,0,0,164755,10285,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d132Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,164755,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,25596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.82408546022883,174.11625405905738,0.9401278037238386,0.2901937567650956,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +134,SkipConnection-Einsum129,"XlaEinsum(a=1x16x16x640,b=640x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum129einsum,MXU,1,Compute,9326,9326,4070,0,0,0,0,0,0,0,0,9326,571,0,0,2621440,"DT_BFLOAT16:[1,16,16,640],DT_BFLOAT16:[640,1280]","[DT_BFLOAT16:(1,16,16,1280)]",419430400,SkipConnectionEinsum129einsum,Einsum,1638400,[],Einsum,"BHWC,CO->BHWO","[[1, 16, 16, 640], [640, 1280], [1, 16, 16, 
1280]]",1,2621440,100,1,256,1280,640,0,9326,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1591,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,44.974308385159766,261.78492923010936,0.9226937611845082,0.4363082153835156,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +135,SpatialTransformer-Input_GroupNorm135,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm135XnormGroupNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm135XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +136,SpatialTransformer-Proj_in136,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin136einsum,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin136einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+137,BasicTransformerBlock-Input_layernorm137,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm137XnormLayerNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm137XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +138,SelfAttention138-Q-138,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention138Q138MatMulQ,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention138Q138MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +138,SelfAttention138-K-138,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention138K138MatMulK,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention138K138MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +138,SelfAttention138-V-138,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention138V138MatMulV,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention138V138MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +139,SelfAttention138-FlashAttention-139,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention138FlashAttention139FlashAttention,MXU,1,Compute,12070,12070,4070,0,0,0,0,0,0,0,0,12070,1461,0,0,2621440,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention138FlashAttention139FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 128]",,335872,128,8,256,256,160,731,12070,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1934,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.5637421706710855,202.27060894780448,0.03208176394004164,0.33711768157967414,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+140,SelfAttention138-Attention_output-140,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention138Attentionoutput140MatMulattnOutputattnAvgWo,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention138Attentionoutput140MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +141,SelfAttention138-Attention_layernorm-141,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention138Attentionlayernorm141YnormLayerNormy,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention138Attentionlayernorm141YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +142,GatedSelfAttention-Linear142,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear142XLinearcontext,MXU,1,Memory,3103,685,3103,0,0,0,0,0,0,0,0,0,685,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear142XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 
1280]]",1,1998848,60,1,8,1280,768,0,685,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,324,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.068849500483403,599.926608322591,0.1039926121915089,0.9998776805376518,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +143,GatedSelfAttention-Attn142-Q-143,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142Q143MatMulQ,MXU,1,Compute,27612,27612,7185,0,0,0,0,0,0,0,0,27612,1714,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn142Q143MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,27612,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.3296827466319,156.11357055469543,0.6427603635978512,0.2601892842578257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +143,GatedSelfAttention-Attn142-K-143,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142K143MatMulK,MXU,1,Compute,27612,27612,7185,0,0,0,0,0,0,0,0,27612,1714,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn142K143MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,27612,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.3296827466319,156.11357055469543,0.6427603635978512,0.2601892842578257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+143,GatedSelfAttention-Attn142-V-143,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142V143MatMulV,MXU,1,Compute,27612,27612,7185,0,0,0,0,0,0,0,0,27612,1714,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn142V143MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,27612,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.3296827466319,156.11357055469543,0.6427603635978512,0.2601892842578257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +144,GatedSelfAttention-Attn142-FlashAttention-144,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn142FlashAttention144FlashAttention,MXU,1,Compute,26698,26698,4197,0,0,0,0,0,0,0,0,26698,2422,0,0,2703360,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn142FlashAttention144FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 128]",,345088,288,8,264,264,160,778,26698,2703360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3776,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7518333957599821,94.302951356375,0.015424628162749109,0.15717158559395833,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+145,GatedSelfAttention-Attn142-Attention_output-145,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142Attentionoutput145MatMulattnOutputattnAvgWo,MXU,1,Compute,27612,27612,7185,0,0,0,0,0,0,0,0,27612,1714,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn142Attentionoutput145MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,3837952,300,1,264,1280,1280,0,27612,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.3296827466319,156.11357055469543,0.6427603635978512,0.2601892842578257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +146,GatedSelfAttention-Attn142-Attention_layernorm-146,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142Attentionlayernorm146YnormLayerNormy,VPU,1,Memory,2099,943,2099,0,0,0,0,0,0,0,0,0,943,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn142Attentionlayernorm146YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,943,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,455,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.287927584564078,599.7380169872558,0.44919349350030624,0.9995633616454264,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+147,GatedSelfAttention-FFN142Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN142FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,73326,73326,25432,0,0,0,0,0,0,0,0,73326,4571,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN142FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,73326,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,208.09520582740092,0.9388257389528106,0.3468253430456682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +148,GatedSelfAttention-FFN142Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN142FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,73326,73326,25432,0,0,0,0,0,0,0,0,73326,4571,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN142FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,73326,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,208.09520582740092,0.9388257389528106,0.3468253430456682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+149,GatedSelfAttention-FFN142Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN142FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2035,115,2035,0,0,0,0,0,0,0,0,0,115,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN142FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,115,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,241,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16102211302211303,599.8541154791155,0.05616005616005616,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +150,BasicTransformerBlock-Fuser_output_layernorm150,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm150XnormLayerNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm150XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +151,CrossAttention151-Q-151,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention151Q151MatMulQ,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention151Q151MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +151,CrossAttention151-K-151,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention151K151MatMulK,MXU,1,Compute,22126,22126,6307,0,0,0,0,0,0,0,0,22126,1371,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention151K151MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,22126,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3425,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.495478622435144,171.0286399484769,0.9333860996265089,0.28504773324746147,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +151,CrossAttention151-V-151,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention151V151MatMulV,MXU,1,Compute,22126,22126,6307,0,0,0,0,0,0,0,0,22126,1371,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention151V151MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,22126,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3425,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.495478622435144,171.0286399484769,0.9333860996265089,0.28504773324746147,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +152,CrossAttention151-FlashAttention-152,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention151FlashAttention152FlashAttention,MXU,1,Compute,23772,23772,6104,0,0,0,0,0,0,0,0,23772,2924,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention151FlashAttention152FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,256,8,512,256,160,1462,23772,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3610,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.58794952044422,154.05137872286724,0.03257840238568926,0.2567522978714454,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +153,CrossAttention151-Attention_output-153,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention151Attentionoutput153MatMulattnOutputattnAvgWo,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention151Attentionoutput153MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+154,CrossAttention151-Attention_layernorm-154,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention151Attentionlayernorm154YnormLayerNormy,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention151Attentionlayernorm154YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +155,BasicTransformerBlock-Attn_output_layernorm155,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm155XnormLayerNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm155XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +156,BasicTransformerBlock-FFN156Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN156FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,73326,73326,25432,0,0,0,0,0,0,0,0,73326,4571,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN156FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 
5120]]",1,13631488,800,1,256,5120,1280,0,73326,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,208.09520582740092,0.9388257389528106,0.3468253430456682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +157,BasicTransformerBlock-FFN156Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN156FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,73326,73326,25432,0,0,0,0,0,0,0,0,73326,4571,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN156FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,73326,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,208.09520582740092,0.9388257389528106,0.3468253430456682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +158,BasicTransformerBlock-FFN156Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN156FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2035,115,2035,0,0,0,0,0,0,0,0,0,115,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN156FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,115,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,241,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16102211302211303,599.8541154791155,0.05616005616005616,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+159,SpatialTransformer-Proj_out159,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout159einsum,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout159einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +160,Time-Embed-MLP-Einsum160,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum160einsum,MXU,1,Memory,5095,1142,5095,0,0,0,0,0,0,0,0,0,1142,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum160einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,1142,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,533,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6431403336604514,599.9070037452466,0.013194679245594216,0.9998450062420776,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+161,Conv2d-GroupNorm161,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm161XnormGroupNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm161XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +162,Conv2d161Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d161Conv2dconv2d,MXU,1,Compute,164755,164755,47811,0,0,0,0,0,0,0,0,164755,10285,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d161Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,164755,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,25596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.82408546022883,174.11625405905738,0.9401278037238386,0.2901937567650956,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+163,Conv2d-GroupNorm163,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm163XnormGroupNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm163XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +164,Conv2d163Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d163Conv2dconv2d,MXU,1,Compute,164755,164755,47811,0,0,0,0,0,0,0,0,164755,10285,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d163Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,164755,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,25596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.82408546022883,174.11625405905738,0.9401278037238386,0.2901937567650956,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+165,SpatialTransformer-Input_GroupNorm165,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm165XnormGroupNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm165XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +166,SpatialTransformer-Proj_in166,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin166einsum,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin166einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+167,BasicTransformerBlock-Input_layernorm167,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm167XnormLayerNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm167XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +168,SelfAttention168-Q-168,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention168Q168MatMulQ,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention168Q168MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +168,SelfAttention168-K-168,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention168K168MatMulK,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention168K168MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +168,SelfAttention168-V-168,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention168V168MatMulV,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention168V168MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +169,SelfAttention168-FlashAttention-169,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention168FlashAttention169FlashAttention,MXU,1,Compute,12070,12070,4070,0,0,0,0,0,0,0,0,12070,1461,0,0,2621440,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention168FlashAttention169FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 128]",,335872,128,8,256,256,160,731,12070,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1934,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.5637421706710855,202.27060894780448,0.03208176394004164,0.33711768157967414,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+170,SelfAttention168-Attention_output-170,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention168Attentionoutput170MatMulattnOutputattnAvgWo,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention168Attentionoutput170MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +171,SelfAttention168-Attention_layernorm-171,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention168Attentionlayernorm171YnormLayerNormy,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention168Attentionlayernorm171YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +172,GatedSelfAttention-Linear172,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear172XLinearcontext,MXU,1,Memory,3103,685,3103,0,0,0,0,0,0,0,0,0,685,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear172XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 
1280]]",1,1998848,60,1,8,1280,768,0,685,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,324,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.068849500483403,599.926608322591,0.1039926121915089,0.9998776805376518,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +173,GatedSelfAttention-Attn172-Q-173,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172Q173MatMulQ,MXU,1,Compute,27612,27612,7185,0,0,0,0,0,0,0,0,27612,1714,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn172Q173MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,27612,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.3296827466319,156.11357055469543,0.6427603635978512,0.2601892842578257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +173,GatedSelfAttention-Attn172-K-173,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172K173MatMulK,MXU,1,Compute,27612,27612,7185,0,0,0,0,0,0,0,0,27612,1714,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn172K173MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,27612,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.3296827466319,156.11357055469543,0.6427603635978512,0.2601892842578257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+173,GatedSelfAttention-Attn172-V-173,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172V173MatMulV,MXU,1,Compute,27612,27612,7185,0,0,0,0,0,0,0,0,27612,1714,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn172V173MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,27612,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.3296827466319,156.11357055469543,0.6427603635978512,0.2601892842578257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +174,GatedSelfAttention-Attn172-FlashAttention-174,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn172FlashAttention174FlashAttention,MXU,1,Compute,26698,26698,4197,0,0,0,0,0,0,0,0,26698,2422,0,0,2703360,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn172FlashAttention174FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 128]",,345088,288,8,264,264,160,778,26698,2703360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3776,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7518333957599821,94.302951356375,0.015424628162749109,0.15717158559395833,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+175,GatedSelfAttention-Attn172-Attention_output-175,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172Attentionoutput175MatMulattnOutputattnAvgWo,MXU,1,Compute,27612,27612,7185,0,0,0,0,0,0,0,0,27612,1714,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn172Attentionoutput175MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,3837952,300,1,264,1280,1280,0,27612,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.3296827466319,156.11357055469543,0.6427603635978512,0.2601892842578257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +176,GatedSelfAttention-Attn172-Attention_layernorm-176,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172Attentionlayernorm176YnormLayerNormy,VPU,1,Memory,2099,943,2099,0,0,0,0,0,0,0,0,0,943,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn172Attentionlayernorm176YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,943,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,455,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.287927584564078,599.7380169872558,0.44919349350030624,0.9995633616454264,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+177,GatedSelfAttention-FFN172Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN172FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,73326,73326,25432,0,0,0,0,0,0,0,0,73326,4571,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN172FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,73326,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,208.09520582740092,0.9388257389528106,0.3468253430456682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +178,GatedSelfAttention-FFN172Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN172FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,73326,73326,25432,0,0,0,0,0,0,0,0,73326,4571,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN172FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,73326,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,208.09520582740092,0.9388257389528106,0.3468253430456682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+179,GatedSelfAttention-FFN172Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN172FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2035,115,2035,0,0,0,0,0,0,0,0,0,115,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN172FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,115,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,241,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16102211302211303,599.8541154791155,0.05616005616005616,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +180,BasicTransformerBlock-Fuser_output_layernorm180,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm180XnormLayerNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm180XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +181,CrossAttention181-Q-181,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention181Q181MatMulQ,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention181Q181MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +181,CrossAttention181-K-181,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention181K181MatMulK,MXU,1,Compute,22126,22126,6307,0,0,0,0,0,0,0,0,22126,1371,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention181K181MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,22126,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3425,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.495478622435144,171.0286399484769,0.9333860996265089,0.28504773324746147,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +181,CrossAttention181-V-181,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention181V181MatMulV,MXU,1,Compute,22126,22126,6307,0,0,0,0,0,0,0,0,22126,1371,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention181V181MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,22126,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3425,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.495478622435144,171.0286399484769,0.9333860996265089,0.28504773324746147,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +182,CrossAttention181-FlashAttention-182,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention181FlashAttention182FlashAttention,MXU,1,Compute,23772,23772,6104,0,0,0,0,0,0,0,0,23772,2924,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention181FlashAttention182FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,256,8,512,256,160,1462,23772,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3610,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.58794952044422,154.05137872286724,0.03257840238568926,0.2567522978714454,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +183,CrossAttention181-Attention_output-183,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention181Attentionoutput183MatMulattnOutputattnAvgWo,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention181Attentionoutput183MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+184,CrossAttention181-Attention_layernorm-184,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention181Attentionlayernorm184YnormLayerNormy,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention181Attentionlayernorm184YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +185,BasicTransformerBlock-Attn_output_layernorm185,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm185XnormLayerNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm185XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +186,BasicTransformerBlock-FFN186Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN186FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,73326,73326,25432,0,0,0,0,0,0,0,0,73326,4571,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN186FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 
5120]]",1,13631488,800,1,256,5120,1280,0,73326,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,208.09520582740092,0.9388257389528106,0.3468253430456682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +187,BasicTransformerBlock-FFN186Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN186FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,73326,73326,25432,0,0,0,0,0,0,0,0,73326,4571,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN186FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,73326,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,208.09520582740092,0.9388257389528106,0.3468253430456682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +188,BasicTransformerBlock-FFN186Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN186FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2035,115,2035,0,0,0,0,0,0,0,0,0,115,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN186FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,115,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,241,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16102211302211303,599.8541154791155,0.05616005616005616,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+189,SpatialTransformer-Proj_out189,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout189einsum,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout189einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +190,Downsample-Conv2d190Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=2x2 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",DownsampleConv2d190Conv2dconv2d,MXU,1,Compute,82469,82469,47048,0,0,0,0,0,0,0,0,82469,5142,0,0,30310400,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,DownsampleConv2d190Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,24420352,900,1,1280,64,11520,0,82469,30310400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,15231,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.886621639646414,342.2954051295032,0.46954236228922697,0.5704923418825053,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +191,Time-Embed-MLP-Einsum191,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum191einsum,MXU,1,Memory,5095,1142,5095,0,0,0,0,0,0,0,0,0,1142,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum191einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 
1280]]",1,2626048,100,1,1,1280,1280,0,1142,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,533,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6431403336604514,599.9070037452466,0.013194679245594216,0.9998450062420776,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +192,Conv2d-GroupNorm192,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm192XnormGroupNormX,VPU,1,Memory,509,229,509,0,0,0,0,0,0,0,0,0,229,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm192XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,229,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2875442043222005,599.5594916502947,0.44905978108335676,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +193,Conv2d192Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d192Conv2dconv2d,MXU,1,Compute,82469,82469,46285,0,0,0,0,0,0,0,0,82469,5142,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d192Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,82469,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,15151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.886621639646414,336.74466883010587,0.46954236228922697,0.5612411147168431,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+194,Conv2d-GroupNorm194,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm194XnormGroupNormX,VPU,1,Memory,509,229,509,0,0,0,0,0,0,0,0,0,229,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm194XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,229,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2875442043222005,599.5594916502947,0.44905978108335676,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +195,Conv2d194Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d194Conv2dconv2d,MXU,1,Compute,82469,82469,46285,0,0,0,0,0,0,0,0,82469,5142,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d194Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,82469,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,15151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.886621639646414,336.74466883010587,0.46954236228922697,0.5612411147168431,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +196,Time-Embed-MLP-Einsum196,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum196einsum,MXU,1,Memory,5095,1142,5095,0,0,0,0,0,0,0,0,0,1142,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum196einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 
1280]]",1,2626048,100,1,1,1280,1280,0,1142,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,533,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6431403336604514,599.9070037452466,0.013194679245594216,0.9998450062420776,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +197,Conv2d-GroupNorm197,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm197XnormGroupNormX,VPU,1,Memory,509,229,509,0,0,0,0,0,0,0,0,0,229,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm197XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,229,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2875442043222005,599.5594916502947,0.44905978108335676,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +198,Conv2d197Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d197Conv2dconv2d,MXU,1,Compute,82469,82469,46285,0,0,0,0,0,0,0,0,82469,5142,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d197Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,82469,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,15151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.886621639646414,336.74466883010587,0.46954236228922697,0.5612411147168431,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+199,Conv2d-GroupNorm199,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm199XnormGroupNormX,VPU,1,Memory,509,229,509,0,0,0,0,0,0,0,0,0,229,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm199XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,229,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2875442043222005,599.5594916502947,0.44905978108335676,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +200,Conv2d199Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d199Conv2dconv2d,MXU,1,Compute,82469,82469,46285,0,0,0,0,0,0,0,0,82469,5142,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d199Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,82469,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,15151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.886621639646414,336.74466883010587,0.46954236228922697,0.5612411147168431,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +201,Time-Embed-MLP-Einsum201,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum201einsum,MXU,1,Memory,5095,1142,5095,0,0,0,0,0,0,0,0,0,1142,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum201einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 
1280]]",1,2626048,100,1,1,1280,1280,0,1142,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,533,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6431403336604514,599.9070037452466,0.013194679245594216,0.9998450062420776,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +202,Conv2d-GroupNorm202,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm202XnormGroupNormX,VPU,1,Memory,509,229,509,0,0,0,0,0,0,0,0,0,229,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm202XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,229,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2875442043222005,599.5594916502947,0.44905978108335676,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +203,Conv2d202Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d202Conv2dconv2d,MXU,1,Compute,82469,82469,46285,0,0,0,0,0,0,0,0,82469,5142,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d202Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,82469,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,15151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.886621639646414,336.74466883010587,0.46954236228922697,0.5612411147168431,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+204,Conv2d-GroupNorm204,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm204XnormGroupNormX,VPU,1,Memory,509,229,509,0,0,0,0,0,0,0,0,0,229,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm204XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,229,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2875442043222005,599.5594916502947,0.44905978108335676,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +205,Conv2d204Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d204Conv2dconv2d,MXU,1,Compute,82469,82469,46285,0,0,0,0,0,0,0,0,82469,5142,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d204Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,82469,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,15151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.886621639646414,336.74466883010587,0.46954236228922697,0.5612411147168431,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+206,SpatialTransformer-Input_GroupNorm206,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm206XnormGroupNormX,VPU,1,Memory,509,229,509,0,0,0,0,0,0,0,0,0,229,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,SpatialTransformerInputGroupNorm206XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,229,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2875442043222005,599.5594916502947,0.44905978108335676,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +207,SpatialTransformer-Proj_in207,"XlaEinsum(a=1x64x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin207einsum,MXU,1,Compute,9326,9326,5595,0,0,0,0,0,0,0,0,9326,571,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,64,1280)]",209715200,SpatialTransformerProjin207einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 64, 1280], [1280, 1280], [1, 64, 1280]]",1,2916352,100,1,64,1280,1280,0,9326,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1751,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.487154192579883,359.95427769140036,0.4613468805922541,0.5999237961523339,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+208,BasicTransformerBlock-Input_layernorm208,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm208XnormLayerNormX,VPU,1,Memory,509,229,509,0,0,0,0,0,0,0,0,0,229,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,BasicTransformerBlockInputlayernorm208XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,229,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2875442043222005,599.5594916502947,0.44905978108335676,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +209,SelfAttention209-Q-209,"XlaEinsum(a=1x64x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention209Q209MatMulQ,MXU,1,Compute,9326,9326,5595,0,0,0,0,0,0,0,0,9326,571,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,64,8,160)]",209715200,SelfAttention209Q209MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 64, 1280], [1280, 8, 160], [1, 64, 8, 160]]",1,2916352,100,1,64,1280,1280,0,9326,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1751,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.487154192579883,359.95427769140036,0.4613468805922541,0.5999237961523339,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +209,SelfAttention209-K-209,"XlaEinsum(a=1x64x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention209K209MatMulK,MXU,1,Compute,9326,9326,5595,0,0,0,0,0,0,0,0,9326,571,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,64,8,160)]",209715200,SelfAttention209K209MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 64, 1280], [1280, 8, 160], [1, 64, 8, 
160]]",1,2916352,100,1,64,1280,1280,0,9326,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1751,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.487154192579883,359.95427769140036,0.4613468805922541,0.5999237961523339,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +209,SelfAttention209-V-209,"XlaEinsum(a=1x64x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention209V209MatMulV,MXU,1,Compute,9326,9326,5595,0,0,0,0,0,0,0,0,9326,571,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,64,8,160)]",209715200,SelfAttention209V209MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 64, 1280], [1280, 8, 160], [1, 64, 8, 160]]",1,2916352,100,1,64,1280,1280,0,9326,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1751,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.487154192579883,359.95427769140036,0.4613468805922541,0.5999237961523339,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +210,SelfAttention209-FlashAttention-210,"FlashAttention(q=1x64x8x160,k=1x64x8x160,v=1x64x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention209FlashAttention210FlashAttention,MXU,1,Memory,17294,3292,17294,0,0,0,0,0,0,0,0,3292,227,0,0,11141120,"DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[1,64,8,160]","[DT_BFLOAT16:(1,64,8,8)]",1179648,SelfAttention209FlashAttention210FlashAttention,FlashAttention,0,[],FlashAttention,,"[64, 64]",,11141120,32,8,64,64,160,45,3292,11141120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2221,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.06821140279865849,599.9755153521453,0.001399426429528675,0.9999591922535754,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+211,SelfAttention209-Attention_output-211,"XlaEinsum(a=1x64x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention209Attentionoutput211MatMulattnOutputattnAvgWo,MXU,1,Compute,9326,9326,5595,0,0,0,0,0,0,0,0,9326,571,0,0,3604480,"DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,64,1280)]",209715200,SelfAttention209Attentionoutput211MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 64, 8, 160], [8, 160, 1280], [1, 64, 1280]]",1,2916352,100,1,64,1280,1280,0,9326,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1751,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.487154192579883,359.95427769140036,0.4613468805922541,0.5999237961523339,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +212,SelfAttention209-Attention_layernorm-212,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention209Attentionlayernorm212YnormLayerNormy,VPU,1,Memory,509,229,509,0,0,0,0,0,0,0,0,0,229,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,SelfAttention209Attentionlayernorm212YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,229,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2875442043222005,599.5594916502947,0.44905978108335676,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +213,GatedSelfAttention-Linear213,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear213XLinearcontext,MXU,1,Memory,3103,685,3103,0,0,0,0,0,0,0,0,0,685,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear213XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 
1280]]",1,1998848,60,1,8,1280,768,0,685,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,324,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.068849500483403,599.926608322591,0.1039926121915089,0.9998776805376518,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +214,GatedSelfAttention-Attn213-Q-214,"XlaEinsum(a=1x72x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213Q214MatMulQ,MXU,1,Compute,9326,9326,5659,0,0,0,0,0,0,0,0,9326,571,0,0,3645440,"DT_BFLOAT16:[1,72,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,72,8,160)]",235929600,GatedSelfAttentionAttn213Q214MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 72, 1280], [1280, 8, 160], [1, 72, 8, 160]]",1,2953216,100,1,72,1280,1280,0,9326,3645440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1757,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,25.29804846665237,364.04466721062084,0.5190152406662859,0.6067411120177014,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +214,GatedSelfAttention-Attn213-K-214,"XlaEinsum(a=1x72x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213K214MatMulK,MXU,1,Compute,9326,9326,5659,0,0,0,0,0,0,0,0,9326,571,0,0,3645440,"DT_BFLOAT16:[1,72,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,72,8,160)]",235929600,GatedSelfAttentionAttn213K214MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 72, 1280], [1280, 8, 160], [1, 72, 8, 160]]",1,2953216,100,1,72,1280,1280,0,9326,3645440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1757,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,25.29804846665237,364.04466721062084,0.5190152406662859,0.6067411120177014,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+214,GatedSelfAttention-Attn213-V-214,"XlaEinsum(a=1x72x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213V214MatMulV,MXU,1,Compute,9326,9326,5659,0,0,0,0,0,0,0,0,9326,571,0,0,3645440,"DT_BFLOAT16:[1,72,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,72,8,160)]",235929600,GatedSelfAttentionAttn213V214MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 72, 1280], [1280, 8, 160], [1, 72, 8, 160]]",1,2953216,100,1,72,1280,1280,0,9326,3645440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1757,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,25.29804846665237,364.04466721062084,0.5190152406662859,0.6067411120177014,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +215,GatedSelfAttention-Attn213-FlashAttention-215,"FlashAttention(q=1x72x8x160,k=1x72x8x160,v=1x72x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn213FlashAttention215FlashAttention,MXU,1,Memory,21744,3292,21744,0,0,0,0,0,0,0,0,3292,240,0,0,14008320,"DT_BFLOAT16:[1,72,8,160],DT_BFLOAT16:[1,72,8,160],DT_BFLOAT16:[1,72,8,160]","[DT_BFLOAT16:(1,72,8,8)]",1492992,GatedSelfAttentionAttn213FlashAttention215FlashAttention,FlashAttention,0,[],FlashAttention,,"[72, 72]",,14008320,32,8,72,72,160,58,3292,14008320,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2686,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.06866225165562914,599.9937752224752,0.0014086760532027382,0.999989625370792,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+216,GatedSelfAttention-Attn213-Attention_output-216,"XlaEinsum(a=1x72x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213Attentionoutput216MatMulattnOutputattnAvgWo,MXU,1,Compute,9326,9326,5659,0,0,0,0,0,0,0,0,9326,571,0,0,3645440,"DT_BFLOAT16:[1,72,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,72,1280)]",235929600,GatedSelfAttentionAttn213Attentionoutput216MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 72, 8, 160], [8, 160, 1280], [1, 72, 1280]]",1,2953216,100,1,72,1280,1280,0,9326,3645440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1757,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,25.29804846665237,364.04466721062084,0.5190152406662859,0.6067411120177014,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +217,GatedSelfAttention-Attn213-Attention_layernorm-217,"LayerNorm(x=1x72x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213Attentionlayernorm217YnormLayerNormy,VPU,1,Memory,573,258,573,0,0,0,0,0,0,0,0,0,258,0,0,368640,"DT_BFLOAT16:[1,72,1280]","[DT_BFLOAT16:(1,72,1280)]",737280,GatedSelfAttentionAttn213Attentionlayernorm217YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,258,368640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,124,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2867015706806282,599.1671097840314,0.44876589379207177,0.9986118496400522,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +218,GatedSelfAttention-FFN213Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x64x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN213FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,36755,36755,21617,0,0,0,0,0,0,0,0,36755,2285,0,0,13926400,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,64,5120)]",838860800,GatedSelfAttentionFFN213FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 64, 
1280], [1280, 5120], [1, 64, 5120]]",1,11272192,400,1,64,5120,1280,0,36755,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6856,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.82303904230717,352.87636248469596,0.468237900520023,0.5881272708078266,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +219,GatedSelfAttention-FFN213Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x64x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN213FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,36755,36755,21617,0,0,0,0,0,0,0,0,36755,2285,0,0,13926400,"DT_BFLOAT16:[1,64,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,64,1280)]",838860800,GatedSelfAttentionFFN213FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 64, 5120], [5120, 1280], [1, 64, 1280]]",1,2916352,400,1,64,1280,5120,0,36755,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6856,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.82303904230717,352.87636248469596,0.468237900520023,0.5881272708078266,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +220,GatedSelfAttention-FFN213Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x64x1280,b=1x64x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN213FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,509,29,509,0,0,0,0,0,0,0,0,0,29,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",81920,GatedSelfAttentionFFN213FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,29,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16094302554027506,599.5594916502947,0.056132472635419595,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+221,BasicTransformerBlock-Fuser_output_layernorm221,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm221XnormLayerNormX,VPU,1,Memory,509,229,509,0,0,0,0,0,0,0,0,0,229,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,BasicTransformerBlockFuseroutputlayernorm221XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,229,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2875442043222005,599.5594916502947,0.44905978108335676,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +222,CrossAttention222-Q-222,"XlaEinsum(a=1x64x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention222Q222MatMulQ,MXU,1,Compute,9326,9326,5595,0,0,0,0,0,0,0,0,9326,571,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,64,8,160)]",209715200,CrossAttention222Q222MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 64, 1280], [1280, 8, 160], [1, 64, 8, 160]]",1,2916352,100,1,64,1280,1280,0,9326,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1751,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.487154192579883,359.95427769140036,0.4613468805922541,0.5999237961523339,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +222,CrossAttention222-K-222,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention222K222MatMulK,MXU,1,Compute,22126,22126,6307,0,0,0,0,0,0,0,0,22126,1371,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention222K222MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 
160]]",1,4063232,240,1,512,1280,768,0,22126,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3425,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.495478622435144,171.0286399484769,0.9333860996265089,0.28504773324746147,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +222,CrossAttention222-V-222,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention222V222MatMulV,MXU,1,Compute,22126,22126,6307,0,0,0,0,0,0,0,0,22126,1371,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention222V222MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,22126,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3425,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.495478622435144,171.0286399484769,0.9333860996265089,0.28504773324746147,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +223,CrossAttention222-FlashAttention-223,"FlashAttention(q=1x64x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention222FlashAttention223FlashAttention,MXU,1,Compute,12070,12070,4578,0,0,0,0,0,0,0,0,12070,1095,0,0,2949120,"DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,64,8,8)]",9437184,CrossAttention222FlashAttention223FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,128,8,512,64,160,365,12070,2949120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1987,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7818710853355427,227.55443506628004,0.01604088197002082,0.3792573917771334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+224,CrossAttention222-Attention_output-224,"XlaEinsum(a=1x64x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention222Attentionoutput224MatMulattnOutputattnAvgWo,MXU,1,Compute,9326,9326,5595,0,0,0,0,0,0,0,0,9326,571,0,0,3604480,"DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,64,1280)]",209715200,CrossAttention222Attentionoutput224MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 64, 8, 160], [8, 160, 1280], [1, 64, 1280]]",1,2916352,100,1,64,1280,1280,0,9326,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1751,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.487154192579883,359.95427769140036,0.4613468805922541,0.5999237961523339,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +225,CrossAttention222-Attention_layernorm-225,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention222Attentionlayernorm225YnormLayerNormy,VPU,1,Memory,509,229,509,0,0,0,0,0,0,0,0,0,229,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,CrossAttention222Attentionlayernorm225YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,229,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2875442043222005,599.5594916502947,0.44905978108335676,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+226,BasicTransformerBlock-Attn_output_layernorm226,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm226XnormLayerNormX,VPU,1,Memory,509,229,509,0,0,0,0,0,0,0,0,0,229,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,BasicTransformerBlockAttnoutputlayernorm226XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,229,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2875442043222005,599.5594916502947,0.44905978108335676,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +227,BasicTransformerBlock-FFN227Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x64x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN227FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,36755,36755,21617,0,0,0,0,0,0,0,0,36755,2285,0,0,13926400,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,64,5120)]",838860800,BasicTransformerBlockFFN227FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 64, 1280], [1280, 5120], [1, 64, 5120]]",1,11272192,400,1,64,5120,1280,0,36755,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6856,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.82303904230717,352.87636248469596,0.468237900520023,0.5881272708078266,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +228,BasicTransformerBlock-FFN227Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x64x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN227FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,36755,36755,21617,0,0,0,0,0,0,0,0,36755,2285,0,0,13926400,"DT_BFLOAT16:[1,64,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,64,1280)]",838860800,BasicTransformerBlockFFN227FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 64, 5120], 
[5120, 1280], [1, 64, 1280]]",1,2916352,400,1,64,1280,5120,0,36755,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6856,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.82303904230717,352.87636248469596,0.468237900520023,0.5881272708078266,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +229,BasicTransformerBlock-FFN227Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x64x1280,b=1x64x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN227FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,509,29,509,0,0,0,0,0,0,0,0,0,29,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",81920,BasicTransformerBlockFFN227FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,29,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16094302554027506,599.5594916502947,0.056132472635419595,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +230,SpatialTransformer-Proj_out230,"XlaEinsum(a=1x64x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout230einsum,MXU,1,Compute,9326,9326,5595,0,0,0,0,0,0,0,0,9326,571,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,64,1280)]",209715200,SpatialTransformerProjout230einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 64, 1280], [1280, 1280], [1, 64, 1280]]",1,2916352,100,1,64,1280,1280,0,9326,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1751,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.487154192579883,359.95427769140036,0.4613468805922541,0.5999237961523339,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+231,Time-Embed-MLP-Einsum231,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum231einsum,MXU,1,Memory,5095,1142,5095,0,0,0,0,0,0,0,0,0,1142,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum231einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,1142,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,533,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6431403336604514,599.9070037452466,0.013194679245594216,0.9998450062420776,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +232,Conv2d-GroupNorm232,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm232XnormGroupNormX,VPU,1,Memory,509,229,509,0,0,0,0,0,0,0,0,0,229,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm232XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,229,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2875442043222005,599.5594916502947,0.44905978108335676,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +233,Conv2d232Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d232Conv2dconv2d,MXU,1,Compute,82469,82469,46285,0,0,0,0,0,0,0,0,82469,5142,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d232Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 
8]]",1,23961600,900,1,1280,64,11520,0,82469,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,15151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.886621639646414,336.74466883010587,0.46954236228922697,0.5612411147168431,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +234,Conv2d-GroupNorm234,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm234XnormGroupNormX,VPU,1,Memory,509,229,509,0,0,0,0,0,0,0,0,0,229,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm234XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,229,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2875442043222005,599.5594916502947,0.44905978108335676,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +235,Conv2d234Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d234Conv2dconv2d,MXU,1,Compute,82469,82469,46285,0,0,0,0,0,0,0,0,82469,5142,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d234Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,82469,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,15151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.886621639646414,336.74466883010587,0.46954236228922697,0.5612411147168431,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+236,Time-Embed-MLP-Einsum236,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum236einsum,MXU,1,Memory,5095,1142,5095,0,0,0,0,0,0,0,0,0,1142,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum236einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,1142,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,533,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6431403336604514,599.9070037452466,0.013194679245594216,0.9998450062420776,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +237,Conv2d-GroupNorm237,"GroupNorm(x=1x2560x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm237XnormGroupNormX,VPU,1,Memory,1018,458,1018,0,0,0,0,0,0,0,0,0,458,0,0,655360,"DT_BFLOAT16:[1,2560,8,8]","[DT_BFLOAT16:(1,2560,8,8)]",1310720,Conv2dGroupNorm237XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,458,655360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,221,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2875442043222005,599.5594916502947,0.44905978108335676,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +238,Conv2d237Conv2d,"Conv2D(a=1x2560x8x8,b=2560x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d237Conv2dconv2d,MXU,1,Compute,164755,164755,92316,0,0,0,0,0,0,0,0,164755,10285,0,0,59473920,"DT_BFLOAT16:[1,2560,8,8],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",3774873600,Conv2d237Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 16, 10, 10], [16, 1280, 3, 3], [1, 1280, 8, 
8]]",1,23961600,1800,1,1280,64,23040,0,164755,59473920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,30253,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.912042730114415,336.19255437998845,0.4700639018619193,0.5603209239666475,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +239,Conv2d-GroupNorm239,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm239XnormGroupNormX,VPU,1,Memory,509,229,509,0,0,0,0,0,0,0,0,0,229,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm239XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,229,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2875442043222005,599.5594916502947,0.44905978108335676,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +240,Conv2d239Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d239Conv2dconv2d,MXU,1,Compute,82469,82469,46285,0,0,0,0,0,0,0,0,82469,5142,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d239Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,82469,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,15151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.886621639646414,336.74466883010587,0.46954236228922697,0.5612411147168431,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+241,SkipConnection-Einsum236,"XlaEinsum(a=1x8x8x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum236einsum,MXU,1,Compute,18469,18469,10936,0,0,0,0,0,0,0,0,18469,1142,0,0,7045120,"DT_BFLOAT16:[1,8,8,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,8,8,1280)]",419430400,SkipConnectionEinsum236einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 8, 8, 2560], [2560, 1280], [1, 8, 8, 1280]]",1,2916352,200,1,64,1280,2560,0,18469,7045120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3452,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.709968054577942,355.25904471682276,0.4659181339978734,0.5920984078613712,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +242,Time-Embed-MLP-Einsum242,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum242einsum,MXU,1,Memory,5095,1142,5095,0,0,0,0,0,0,0,0,0,1142,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum242einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,1142,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,533,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6431403336604514,599.9070037452466,0.013194679245594216,0.9998450062420776,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+243,Conv2d-GroupNorm243,"GroupNorm(x=1x2560x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm243XnormGroupNormX,VPU,1,Memory,1018,458,1018,0,0,0,0,0,0,0,0,0,458,0,0,655360,"DT_BFLOAT16:[1,2560,8,8]","[DT_BFLOAT16:(1,2560,8,8)]",1310720,Conv2dGroupNorm243XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,458,655360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,221,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2875442043222005,599.5594916502947,0.44905978108335676,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +244,Conv2d243Conv2d,"Conv2D(a=1x2560x8x8,b=2560x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d243Conv2dconv2d,MXU,1,Compute,164755,164755,92316,0,0,0,0,0,0,0,0,164755,10285,0,0,59473920,"DT_BFLOAT16:[1,2560,8,8],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",3774873600,Conv2d243Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 16, 10, 10], [16, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,1800,1,1280,64,23040,0,164755,59473920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,30253,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.912042730114415,336.19255437998845,0.4700639018619193,0.5603209239666475,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +245,Conv2d-GroupNorm245,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm245XnormGroupNormX,VPU,1,Memory,509,229,509,0,0,0,0,0,0,0,0,0,229,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm245XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,229,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2875442043222005,599.5594916502947,0.44905978108335676,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+246,Conv2d245Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d245Conv2dconv2d,MXU,1,Compute,82469,82469,46285,0,0,0,0,0,0,0,0,82469,5142,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d245Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,82469,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,15151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.886621639646414,336.74466883010587,0.46954236228922697,0.5612411147168431,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +247,SkipConnection-Einsum242,"XlaEinsum(a=1x8x8x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum242einsum,MXU,1,Compute,18469,18469,10936,0,0,0,0,0,0,0,0,18469,1142,0,0,7045120,"DT_BFLOAT16:[1,8,8,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,8,8,1280)]",419430400,SkipConnectionEinsum242einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 8, 8, 2560], [2560, 1280], [1, 8, 8, 1280]]",1,2916352,200,1,64,1280,2560,0,18469,7045120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3452,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.709968054577942,355.25904471682276,0.4659181339978734,0.5920984078613712,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +248,Time-Embed-MLP-Einsum248,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum248einsum,MXU,1,Memory,5095,1142,5095,0,0,0,0,0,0,0,0,0,1142,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum248einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 
1280]]",1,2626048,100,1,1,1280,1280,0,1142,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,533,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6431403336604514,599.9070037452466,0.013194679245594216,0.9998450062420776,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +249,Conv2d-GroupNorm249,"GroupNorm(x=1x2560x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm249XnormGroupNormX,VPU,1,Memory,1018,458,1018,0,0,0,0,0,0,0,0,0,458,0,0,655360,"DT_BFLOAT16:[1,2560,8,8]","[DT_BFLOAT16:(1,2560,8,8)]",1310720,Conv2dGroupNorm249XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,458,655360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,221,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2875442043222005,599.5594916502947,0.44905978108335676,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +250,Conv2d249Conv2d,"Conv2D(a=1x2560x8x8,b=2560x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d249Conv2dconv2d,MXU,1,Compute,164755,164755,92316,0,0,0,0,0,0,0,0,164755,10285,0,0,59473920,"DT_BFLOAT16:[1,2560,8,8],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",3774873600,Conv2d249Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 16, 10, 10], [16, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,1800,1,1280,64,23040,0,164755,59473920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,30253,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.912042730114415,336.19255437998845,0.4700639018619193,0.5603209239666475,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+251,Conv2d-GroupNorm251,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm251XnormGroupNormX,VPU,1,Memory,509,229,509,0,0,0,0,0,0,0,0,0,229,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm251XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,229,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2875442043222005,599.5594916502947,0.44905978108335676,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +252,Conv2d251Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d251Conv2dconv2d,MXU,1,Compute,82469,82469,46285,0,0,0,0,0,0,0,0,82469,5142,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d251Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,82469,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,15151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.886621639646414,336.74466883010587,0.46954236228922697,0.5612411147168431,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +253,SkipConnection-Einsum248,"XlaEinsum(a=1x8x8x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum248einsum,MXU,1,Compute,18469,18469,10936,0,0,0,0,0,0,0,0,18469,1142,0,0,7045120,"DT_BFLOAT16:[1,8,8,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,8,8,1280)]",419430400,SkipConnectionEinsum248einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 8, 8, 2560], [2560, 1280], [1, 8, 8, 
1280]]",1,2916352,200,1,64,1280,2560,0,18469,7045120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3452,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.709968054577942,355.25904471682276,0.4659181339978734,0.5920984078613712,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +254,Upsample254,"Upsample(a=1x1280x8x8,scale_factor=2,memory_placements=0_0_0,type=DT_BFLOAT16)",Upsample254Upsample,VPU,1,Memory,1272,0,1272,0,0,0,0,0,0,0,0,0,0,0,0,819200,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,16,16)]",0,Upsample254Upsample,Upsample,0,[],Upsample,,,,,0,,,,,0,0,819200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,133,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,599.7951675511006,0.0,0.9996586125851676,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +254,Upsample-Conv2d254Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",UpsampleConv2d254Conv2dconv2d,MXU,1,Compute,82469,82469,46285,0,0,0,0,0,0,0,0,82469,5142,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,UpsampleConv2d254Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,82469,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,15151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.886621639646414,336.74466883010587,0.46954236228922697,0.5612411147168431,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+255,Time-Embed-MLP-Einsum255,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum255einsum,MXU,1,Memory,5095,1142,5095,0,0,0,0,0,0,0,0,0,1142,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum255einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,1142,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,533,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6431403336604514,599.9070037452466,0.013194679245594216,0.9998450062420776,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +256,Conv2d-GroupNorm256,"GroupNorm(x=1x2560x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm256XnormGroupNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,2560,16,16]","[DT_BFLOAT16:(1,2560,16,16)]",5242880,Conv2dGroupNorm256XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +257,Conv2d256Conv2d,"Conv2D(a=1x2560x16x16,b=2560x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d256Conv2dconv2d,MXU,1,Compute,329326,329326,94605,0,0,0,0,0,0,0,0,329326,20571,0,0,60948480,"DT_BFLOAT16:[1,2560,16,16],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",15099494400,Conv2d256Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 1, 18, 18], [1, 1280, 3, 3], [1, 1280, 16, 
16]]",1,24911872,3600,1,1280,256,23040,0,329326,60948480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,51064,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.849688150950726,172.3602002650869,0.9406530690107736,0.28726700044181147,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +258,Conv2d-GroupNorm258,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm258XnormGroupNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm258XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +259,Conv2d258Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d258Conv2dconv2d,MXU,1,Compute,164755,164755,47811,0,0,0,0,0,0,0,0,164755,10285,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d258Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,164755,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,25596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.82408546022883,174.11625405905738,0.9401278037238386,0.2901937567650956,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+260,SkipConnection-Einsum255,"XlaEinsum(a=1x16x16x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum255einsum,MXU,1,Compute,36755,36755,13225,0,0,0,0,0,0,0,0,36755,2285,0,0,8519680,"DT_BFLOAT16:[1,16,16,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,16,16,1280)]",1677721600,SkipConnectionEinsum255einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 16, 16, 2560], [2560, 1280], [1, 16, 16, 1280]]",1,3801088,400,1,256,1280,2560,0,36755,8519680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5978,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.64607808461434,215.8773041082846,0.936475801040046,0.359795506847141,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +261,SpatialTransformer-Input_GroupNorm261,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm261XnormGroupNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm261XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +262,SpatialTransformer-Proj_in262,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin262einsum,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin262einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 
1280]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +263,BasicTransformerBlock-Input_layernorm263,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm263XnormLayerNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm263XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +264,SelfAttention264-Q-264,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention264Q264MatMulQ,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention264Q264MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+264,SelfAttention264-K-264,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention264K264MatMulK,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention264K264MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +264,SelfAttention264-V-264,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention264V264MatMulV,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention264V264MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +265,SelfAttention264-FlashAttention-265,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention264FlashAttention265FlashAttention,MXU,1,Compute,12070,12070,4070,0,0,0,0,0,0,0,0,12070,1461,0,0,2621440,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention264FlashAttention265FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 
128]",,335872,128,8,256,256,160,731,12070,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1934,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.5637421706710855,202.27060894780448,0.03208176394004164,0.33711768157967414,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +266,SelfAttention264-Attention_output-266,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention264Attentionoutput266MatMulattnOutputattnAvgWo,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention264Attentionoutput266MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +267,SelfAttention264-Attention_layernorm-267,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention264Attentionlayernorm267YnormLayerNormy,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention264Attentionlayernorm267YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+268,GatedSelfAttention-Linear268,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear268XLinearcontext,MXU,1,Memory,3103,685,3103,0,0,0,0,0,0,0,0,0,685,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear268XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 1280]]",1,1998848,60,1,8,1280,768,0,685,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,324,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.068849500483403,599.926608322591,0.1039926121915089,0.9998776805376518,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +269,GatedSelfAttention-Attn268-Q-269,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268Q269MatMulQ,MXU,1,Compute,27612,27612,7185,0,0,0,0,0,0,0,0,27612,1714,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn268Q269MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,27612,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.3296827466319,156.11357055469543,0.6427603635978512,0.2601892842578257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +269,GatedSelfAttention-Attn268-K-269,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268K269MatMulK,MXU,1,Compute,27612,27612,7185,0,0,0,0,0,0,0,0,27612,1714,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn268K269MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 
160]]",1,3837952,300,1,264,1280,1280,0,27612,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.3296827466319,156.11357055469543,0.6427603635978512,0.2601892842578257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +269,GatedSelfAttention-Attn268-V-269,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268V269MatMulV,MXU,1,Compute,27612,27612,7185,0,0,0,0,0,0,0,0,27612,1714,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn268V269MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,27612,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.3296827466319,156.11357055469543,0.6427603635978512,0.2601892842578257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +270,GatedSelfAttention-Attn268-FlashAttention-270,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn268FlashAttention270FlashAttention,MXU,1,Compute,26698,26698,4197,0,0,0,0,0,0,0,0,26698,2422,0,0,2703360,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn268FlashAttention270FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 128]",,345088,288,8,264,264,160,778,26698,2703360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3776,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7518333957599821,94.302951356375,0.015424628162749109,0.15717158559395833,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+271,GatedSelfAttention-Attn268-Attention_output-271,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268Attentionoutput271MatMulattnOutputattnAvgWo,MXU,1,Compute,27612,27612,7185,0,0,0,0,0,0,0,0,27612,1714,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn268Attentionoutput271MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,3837952,300,1,264,1280,1280,0,27612,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.3296827466319,156.11357055469543,0.6427603635978512,0.2601892842578257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +272,GatedSelfAttention-Attn268-Attention_layernorm-272,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268Attentionlayernorm272YnormLayerNormy,VPU,1,Memory,2099,943,2099,0,0,0,0,0,0,0,0,0,943,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn268Attentionlayernorm272YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,943,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,455,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.287927584564078,599.7380169872558,0.44919349350030624,0.9995633616454264,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+273,GatedSelfAttention-FFN268Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN268FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,73326,73326,25432,0,0,0,0,0,0,0,0,73326,4571,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN268FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,73326,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,208.09520582740092,0.9388257389528106,0.3468253430456682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +274,GatedSelfAttention-FFN268Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN268FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,73326,73326,25432,0,0,0,0,0,0,0,0,73326,4571,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN268FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,73326,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,208.09520582740092,0.9388257389528106,0.3468253430456682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+275,GatedSelfAttention-FFN268Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN268FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2035,115,2035,0,0,0,0,0,0,0,0,0,115,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN268FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,115,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,241,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16102211302211303,599.8541154791155,0.05616005616005616,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +276,BasicTransformerBlock-Fuser_output_layernorm276,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm276XnormLayerNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm276XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +277,CrossAttention277-Q-277,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention277Q277MatMulQ,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention277Q277MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +277,CrossAttention277-K-277,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention277K277MatMulK,MXU,1,Compute,22126,22126,6307,0,0,0,0,0,0,0,0,22126,1371,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention277K277MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,22126,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3425,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.495478622435144,171.0286399484769,0.9333860996265089,0.28504773324746147,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +277,CrossAttention277-V-277,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention277V277MatMulV,MXU,1,Compute,22126,22126,6307,0,0,0,0,0,0,0,0,22126,1371,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention277V277MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,22126,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3425,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.495478622435144,171.0286399484769,0.9333860996265089,0.28504773324746147,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +278,CrossAttention277-FlashAttention-278,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention277FlashAttention278FlashAttention,MXU,1,Compute,23772,23772,6104,0,0,0,0,0,0,0,0,23772,2924,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention277FlashAttention278FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,256,8,512,256,160,1462,23772,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3610,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.58794952044422,154.05137872286724,0.03257840238568926,0.2567522978714454,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +279,CrossAttention277-Attention_output-279,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention277Attentionoutput279MatMulattnOutputattnAvgWo,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention277Attentionoutput279MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+280,CrossAttention277-Attention_layernorm-280,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention277Attentionlayernorm280YnormLayerNormy,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention277Attentionlayernorm280YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +281,BasicTransformerBlock-Attn_output_layernorm281,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm281XnormLayerNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm281XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +282,BasicTransformerBlock-FFN282Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN282FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,73326,73326,25432,0,0,0,0,0,0,0,0,73326,4571,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN282FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 
5120]]",1,13631488,800,1,256,5120,1280,0,73326,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,208.09520582740092,0.9388257389528106,0.3468253430456682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +283,BasicTransformerBlock-FFN282Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN282FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,73326,73326,25432,0,0,0,0,0,0,0,0,73326,4571,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN282FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,73326,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,208.09520582740092,0.9388257389528106,0.3468253430456682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +284,BasicTransformerBlock-FFN282Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN282FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2035,115,2035,0,0,0,0,0,0,0,0,0,115,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN282FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,115,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,241,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16102211302211303,599.8541154791155,0.05616005616005616,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+285,SpatialTransformer-Proj_out285,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout285einsum,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout285einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +286,Time-Embed-MLP-Einsum286,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum286einsum,MXU,1,Memory,5095,1142,5095,0,0,0,0,0,0,0,0,0,1142,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum286einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,1142,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,533,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6431403336604514,599.9070037452466,0.013194679245594216,0.9998450062420776,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+287,Conv2d-GroupNorm287,"GroupNorm(x=1x2560x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm287XnormGroupNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,2560,16,16]","[DT_BFLOAT16:(1,2560,16,16)]",5242880,Conv2dGroupNorm287XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +288,Conv2d287Conv2d,"Conv2D(a=1x2560x16x16,b=2560x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d287Conv2dconv2d,MXU,1,Compute,329326,329326,94605,0,0,0,0,0,0,0,0,329326,20571,0,0,60948480,"DT_BFLOAT16:[1,2560,16,16],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",15099494400,Conv2d287Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 1, 18, 18], [1, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,3600,1,1280,256,23040,0,329326,60948480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,51064,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.849688150950726,172.3602002650869,0.9406530690107736,0.28726700044181147,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+289,Conv2d-GroupNorm289,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm289XnormGroupNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm289XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +290,Conv2d289Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d289Conv2dconv2d,MXU,1,Compute,164755,164755,47811,0,0,0,0,0,0,0,0,164755,10285,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d289Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,164755,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,25596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.82408546022883,174.11625405905738,0.9401278037238386,0.2901937567650956,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +291,SkipConnection-Einsum286,"XlaEinsum(a=1x16x16x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum286einsum,MXU,1,Compute,36755,36755,13225,0,0,0,0,0,0,0,0,36755,2285,0,0,8519680,"DT_BFLOAT16:[1,16,16,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,16,16,1280)]",1677721600,SkipConnectionEinsum286einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 16, 16, 2560], [2560, 1280], [1, 16, 16, 
1280]]",1,3801088,400,1,256,1280,2560,0,36755,8519680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5978,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.64607808461434,215.8773041082846,0.936475801040046,0.359795506847141,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +292,SpatialTransformer-Input_GroupNorm292,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm292XnormGroupNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm292XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +293,SpatialTransformer-Proj_in293,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin293einsum,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin293einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+294,BasicTransformerBlock-Input_layernorm294,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm294XnormLayerNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm294XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +295,SelfAttention295-Q-295,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention295Q295MatMulQ,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention295Q295MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +295,SelfAttention295-K-295,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention295K295MatMulK,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention295K295MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +295,SelfAttention295-V-295,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention295V295MatMulV,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention295V295MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +296,SelfAttention295-FlashAttention-296,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention295FlashAttention296FlashAttention,MXU,1,Compute,12070,12070,4070,0,0,0,0,0,0,0,0,12070,1461,0,0,2621440,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention295FlashAttention296FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 128]",,335872,128,8,256,256,160,731,12070,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1934,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.5637421706710855,202.27060894780448,0.03208176394004164,0.33711768157967414,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+297,SelfAttention295-Attention_output-297,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention295Attentionoutput297MatMulattnOutputattnAvgWo,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention295Attentionoutput297MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +298,SelfAttention295-Attention_layernorm-298,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention295Attentionlayernorm298YnormLayerNormy,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention295Attentionlayernorm298YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +299,GatedSelfAttention-Linear299,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear299XLinearcontext,MXU,1,Memory,3103,685,3103,0,0,0,0,0,0,0,0,0,685,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear299XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 
1280]]",1,1998848,60,1,8,1280,768,0,685,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,324,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.068849500483403,599.926608322591,0.1039926121915089,0.9998776805376518,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +300,GatedSelfAttention-Attn299-Q-300,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299Q300MatMulQ,MXU,1,Compute,27612,27612,7185,0,0,0,0,0,0,0,0,27612,1714,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn299Q300MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,27612,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.3296827466319,156.11357055469543,0.6427603635978512,0.2601892842578257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +300,GatedSelfAttention-Attn299-K-300,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299K300MatMulK,MXU,1,Compute,27612,27612,7185,0,0,0,0,0,0,0,0,27612,1714,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn299K300MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,27612,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.3296827466319,156.11357055469543,0.6427603635978512,0.2601892842578257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+300,GatedSelfAttention-Attn299-V-300,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299V300MatMulV,MXU,1,Compute,27612,27612,7185,0,0,0,0,0,0,0,0,27612,1714,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn299V300MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,27612,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.3296827466319,156.11357055469543,0.6427603635978512,0.2601892842578257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +301,GatedSelfAttention-Attn299-FlashAttention-301,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn299FlashAttention301FlashAttention,MXU,1,Compute,26698,26698,4197,0,0,0,0,0,0,0,0,26698,2422,0,0,2703360,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn299FlashAttention301FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 128]",,345088,288,8,264,264,160,778,26698,2703360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3776,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7518333957599821,94.302951356375,0.015424628162749109,0.15717158559395833,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+302,GatedSelfAttention-Attn299-Attention_output-302,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299Attentionoutput302MatMulattnOutputattnAvgWo,MXU,1,Compute,27612,27612,7185,0,0,0,0,0,0,0,0,27612,1714,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn299Attentionoutput302MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,3837952,300,1,264,1280,1280,0,27612,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.3296827466319,156.11357055469543,0.6427603635978512,0.2601892842578257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +303,GatedSelfAttention-Attn299-Attention_layernorm-303,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299Attentionlayernorm303YnormLayerNormy,VPU,1,Memory,2099,943,2099,0,0,0,0,0,0,0,0,0,943,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn299Attentionlayernorm303YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,943,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,455,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.287927584564078,599.7380169872558,0.44919349350030624,0.9995633616454264,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+304,GatedSelfAttention-FFN299Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN299FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,73326,73326,25432,0,0,0,0,0,0,0,0,73326,4571,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN299FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,73326,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,208.09520582740092,0.9388257389528106,0.3468253430456682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +305,GatedSelfAttention-FFN299Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN299FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,73326,73326,25432,0,0,0,0,0,0,0,0,73326,4571,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN299FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,73326,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,208.09520582740092,0.9388257389528106,0.3468253430456682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+306,GatedSelfAttention-FFN299Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN299FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2035,115,2035,0,0,0,0,0,0,0,0,0,115,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN299FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,115,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,241,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16102211302211303,599.8541154791155,0.05616005616005616,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +307,BasicTransformerBlock-Fuser_output_layernorm307,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm307XnormLayerNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm307XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +308,CrossAttention308-Q-308,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention308Q308MatMulQ,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention308Q308MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +308,CrossAttention308-K-308,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention308K308MatMulK,MXU,1,Compute,22126,22126,6307,0,0,0,0,0,0,0,0,22126,1371,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention308K308MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,22126,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3425,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.495478622435144,171.0286399484769,0.9333860996265089,0.28504773324746147,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +308,CrossAttention308-V-308,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention308V308MatMulV,MXU,1,Compute,22126,22126,6307,0,0,0,0,0,0,0,0,22126,1371,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention308V308MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,22126,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3425,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.495478622435144,171.0286399484769,0.9333860996265089,0.28504773324746147,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +309,CrossAttention308-FlashAttention-309,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention308FlashAttention309FlashAttention,MXU,1,Compute,23772,23772,6104,0,0,0,0,0,0,0,0,23772,2924,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention308FlashAttention309FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,256,8,512,256,160,1462,23772,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3610,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.58794952044422,154.05137872286724,0.03257840238568926,0.2567522978714454,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +310,CrossAttention308-Attention_output-310,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention308Attentionoutput310MatMulattnOutputattnAvgWo,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention308Attentionoutput310MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+311,CrossAttention308-Attention_layernorm-311,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention308Attentionlayernorm311YnormLayerNormy,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention308Attentionlayernorm311YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +312,BasicTransformerBlock-Attn_output_layernorm312,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm312XnormLayerNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm312XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +313,BasicTransformerBlock-FFN313Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN313FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,73326,73326,25432,0,0,0,0,0,0,0,0,73326,4571,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN313FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 
5120]]",1,13631488,800,1,256,5120,1280,0,73326,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,208.09520582740092,0.9388257389528106,0.3468253430456682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +314,BasicTransformerBlock-FFN313Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN313FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,73326,73326,25432,0,0,0,0,0,0,0,0,73326,4571,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN313FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,73326,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,208.09520582740092,0.9388257389528106,0.3468253430456682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +315,BasicTransformerBlock-FFN313Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN313FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2035,115,2035,0,0,0,0,0,0,0,0,0,115,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN313FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,115,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,241,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16102211302211303,599.8541154791155,0.05616005616005616,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+316,SpatialTransformer-Proj_out316,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout316einsum,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout316einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +317,Time-Embed-MLP-Einsum317,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum317einsum,MXU,1,Memory,5095,1142,5095,0,0,0,0,0,0,0,0,0,1142,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum317einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,1142,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,533,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6431403336604514,599.9070037452466,0.013194679245594216,0.9998450062420776,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+318,Conv2d-GroupNorm318,"GroupNorm(x=1x1920x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm318XnormGroupNormX,VPU,1,Memory,3052,1372,3052,0,0,0,0,0,0,0,0,0,1372,0,0,1966080,"DT_BFLOAT16:[1,1920,16,16]","[DT_BFLOAT16:(1,1920,16,16)]",3932160,Conv2dGroupNorm318XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1372,1966080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,662,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883879423328966,599.9523877785059,0.44935405354802477,0.9999206462975099,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +319,Conv2d318Conv2d,"Conv2D(a=1x1920x16x16,b=1920x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d318Conv2dconv2d,MXU,1,Compute,247041,247041,71208,0,0,0,0,0,0,0,0,247041,15428,0,0,45875200,"DT_BFLOAT16:[1,1920,16,16],DT_BFLOAT16:[1920,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",11324620800,Conv2d318Conv2dconv2d,Conv2D,44236800,[],Conv2D,bf01;io01->bf01,"[[1, 1, 18, 18], [1, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,2700,1,1280,256,17280,0,247041,45875200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,38330,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.84105796203869,172.94541948502476,0.9404760118918783,0.2882423658083746,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+320,Conv2d-GroupNorm320,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm320XnormGroupNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm320XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +321,Conv2d320Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d320Conv2dconv2d,MXU,1,Compute,164755,164755,47811,0,0,0,0,0,0,0,0,164755,10285,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d320Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,164755,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,25596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.82408546022883,174.11625405905738,0.9401278037238386,0.2901937567650956,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +322,SkipConnection-Einsum317,"XlaEinsum(a=1x16x16x1920,b=1920x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum317einsum,MXU,1,Compute,27612,27612,10173,0,0,0,0,0,0,0,0,27612,1714,0,0,6553600,"DT_BFLOAT16:[1,16,16,1920],DT_BFLOAT16:[1920,1280]","[DT_BFLOAT16:(1,16,16,1280)]",1258291200,SkipConnectionEinsum317einsum,Einsum,4915200,[],Einsum,"BHWC,CO->BHWO","[[1, 16, 16, 1920], [1920, 1280], [1, 16, 16, 
1280]]",1,3801088,300,1,256,1280,1920,0,27612,6553600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4515,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.57044763146458,221.04576361726785,0.9349241652332381,0.36840960602877976,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +323,SpatialTransformer-Input_GroupNorm323,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm323XnormGroupNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm323XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +324,SpatialTransformer-Proj_in324,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin324einsum,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin324einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+325,BasicTransformerBlock-Input_layernorm325,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm325XnormLayerNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm325XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +326,SelfAttention326-Q-326,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention326Q326MatMulQ,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention326Q326MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +326,SelfAttention326-K-326,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention326K326MatMulK,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention326K326MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +326,SelfAttention326-V-326,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention326V326MatMulV,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention326V326MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +327,SelfAttention326-FlashAttention-327,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention326FlashAttention327FlashAttention,MXU,1,Compute,12070,12070,4070,0,0,0,0,0,0,0,0,12070,1461,0,0,2621440,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention326FlashAttention327FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 128]",,335872,128,8,256,256,160,731,12070,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1934,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.5637421706710855,202.27060894780448,0.03208176394004164,0.33711768157967414,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+328,SelfAttention326-Attention_output-328,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention326Attentionoutput328MatMulattnOutputattnAvgWo,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention326Attentionoutput328MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +329,SelfAttention326-Attention_layernorm-329,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention326Attentionlayernorm329YnormLayerNormy,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention326Attentionlayernorm329YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +330,GatedSelfAttention-Linear330,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear330XLinearcontext,MXU,1,Memory,3103,685,3103,0,0,0,0,0,0,0,0,0,685,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear330XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 
1280]]",1,1998848,60,1,8,1280,768,0,685,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,324,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.068849500483403,599.926608322591,0.1039926121915089,0.9998776805376518,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +331,GatedSelfAttention-Attn330-Q-331,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330Q331MatMulQ,MXU,1,Compute,27612,27612,7185,0,0,0,0,0,0,0,0,27612,1714,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn330Q331MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,27612,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.3296827466319,156.11357055469543,0.6427603635978512,0.2601892842578257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +331,GatedSelfAttention-Attn330-K-331,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330K331MatMulK,MXU,1,Compute,27612,27612,7185,0,0,0,0,0,0,0,0,27612,1714,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn330K331MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,27612,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.3296827466319,156.11357055469543,0.6427603635978512,0.2601892842578257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+331,GatedSelfAttention-Attn330-V-331,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330V331MatMulV,MXU,1,Compute,27612,27612,7185,0,0,0,0,0,0,0,0,27612,1714,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn330V331MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,27612,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.3296827466319,156.11357055469543,0.6427603635978512,0.2601892842578257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +332,GatedSelfAttention-Attn330-FlashAttention-332,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn330FlashAttention332FlashAttention,MXU,1,Compute,26698,26698,4197,0,0,0,0,0,0,0,0,26698,2422,0,0,2703360,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn330FlashAttention332FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 128]",,345088,288,8,264,264,160,778,26698,2703360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3776,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7518333957599821,94.302951356375,0.015424628162749109,0.15717158559395833,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+333,GatedSelfAttention-Attn330-Attention_output-333,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330Attentionoutput333MatMulattnOutputattnAvgWo,MXU,1,Compute,27612,27612,7185,0,0,0,0,0,0,0,0,27612,1714,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn330Attentionoutput333MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,3837952,300,1,264,1280,1280,0,27612,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.3296827466319,156.11357055469543,0.6427603635978512,0.2601892842578257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +334,GatedSelfAttention-Attn330-Attention_layernorm-334,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330Attentionlayernorm334YnormLayerNormy,VPU,1,Memory,2099,943,2099,0,0,0,0,0,0,0,0,0,943,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn330Attentionlayernorm334YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,943,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,455,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.287927584564078,599.7380169872558,0.44919349350030624,0.9995633616454264,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+335,GatedSelfAttention-FFN330Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN330FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,73326,73326,25432,0,0,0,0,0,0,0,0,73326,4571,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN330FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,73326,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,208.09520582740092,0.9388257389528106,0.3468253430456682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +336,GatedSelfAttention-FFN330Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN330FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,73326,73326,25432,0,0,0,0,0,0,0,0,73326,4571,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN330FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,73326,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,208.09520582740092,0.9388257389528106,0.3468253430456682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+337,GatedSelfAttention-FFN330Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN330FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2035,115,2035,0,0,0,0,0,0,0,0,0,115,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN330FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,115,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,241,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16102211302211303,599.8541154791155,0.05616005616005616,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +338,BasicTransformerBlock-Fuser_output_layernorm338,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm338XnormLayerNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm338XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +339,CrossAttention339-Q-339,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention339Q339MatMulQ,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention339Q339MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +339,CrossAttention339-K-339,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention339K339MatMulK,MXU,1,Compute,22126,22126,6307,0,0,0,0,0,0,0,0,22126,1371,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention339K339MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,22126,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3425,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.495478622435144,171.0286399484769,0.9333860996265089,0.28504773324746147,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +339,CrossAttention339-V-339,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention339V339MatMulV,MXU,1,Compute,22126,22126,6307,0,0,0,0,0,0,0,0,22126,1371,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention339V339MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,22126,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3425,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.495478622435144,171.0286399484769,0.9333860996265089,0.28504773324746147,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +340,CrossAttention339-FlashAttention-340,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention339FlashAttention340FlashAttention,MXU,1,Compute,23772,23772,6104,0,0,0,0,0,0,0,0,23772,2924,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention339FlashAttention340FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,256,8,512,256,160,1462,23772,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3610,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.58794952044422,154.05137872286724,0.03257840238568926,0.2567522978714454,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +341,CrossAttention339-Attention_output-341,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention339Attentionoutput341MatMulattnOutputattnAvgWo,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention339Attentionoutput341MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+342,CrossAttention339-Attention_layernorm-342,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention339Attentionlayernorm342YnormLayerNormy,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention339Attentionlayernorm342YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +343,BasicTransformerBlock-Attn_output_layernorm343,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm343XnormLayerNormX,VPU,1,Memory,2035,915,2035,0,0,0,0,0,0,0,0,0,915,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm343XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,915,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +344,BasicTransformerBlock-FFN344Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN344FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,73326,73326,25432,0,0,0,0,0,0,0,0,73326,4571,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN344FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 
5120]]",1,13631488,800,1,256,5120,1280,0,73326,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,208.09520582740092,0.9388257389528106,0.3468253430456682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +345,BasicTransformerBlock-FFN344Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN344FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,73326,73326,25432,0,0,0,0,0,0,0,0,73326,4571,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN344FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,73326,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,208.09520582740092,0.9388257389528106,0.3468253430456682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +346,BasicTransformerBlock-FFN344Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN344FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2035,115,2035,0,0,0,0,0,0,0,0,0,115,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN344FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,115,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,241,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16102211302211303,599.8541154791155,0.05616005616005616,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+347,SpatialTransformer-Proj_out347,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout347einsum,MXU,1,Compute,18469,18469,7121,0,0,0,0,0,0,0,0,18469,1142,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout347einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,18469,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3053,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,231.33147097839623,0.9318362679957468,0.3855524516306604,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +348,Upsample348,"Upsample(a=1x1280x16x16,scale_factor=2,memory_placements=0_0_0,type=DT_BFLOAT16)",Upsample348Upsample,VPU,1,Memory,5087,0,5087,0,0,0,0,0,0,0,0,0,0,0,0,3276800,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,32,32)]",0,Upsample348Upsample,Upsample,0,[],Upsample,,,,,0,,,,,0,0,3276800,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,532,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,599.9130749950855,0.0,0.9998551249918092,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +348,Upsample-Conv2d348Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",UpsampleConv2d348Conv2dconv2d,MXU,1,Compute,164755,164755,47811,0,0,0,0,0,0,0,0,164755,10285,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,UpsampleConv2d348Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 
16]]",1,24911872,1800,1,1280,256,11520,0,164755,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,25596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.82408546022883,174.11625405905738,0.9401278037238386,0.2901937567650956,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +349,Time-Embed-MLP-Einsum349,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum349einsum,MXU,1,Memory,2550,571,2550,0,0,0,0,0,0,0,0,0,571,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum349einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1314048,50,1,1,640,1280,0,571,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,266,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6425098039215686,599.7863470339307,0.013181743285549516,0.9996439117232179,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +350,Conv2d-GroupNorm350,"GroupNorm(x=1x1920x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm350XnormGroupNormX,VPU,1,Memory,12208,5486,12208,0,0,0,0,0,0,0,0,0,5486,0,0,7864320,"DT_BFLOAT16:[1,1920,32,32]","[DT_BFLOAT16:(1,1920,32,32)]",15728640,Conv2dGroupNorm350XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,5486,7864320,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2648,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883879423328966,599.9523877785059,0.44935405354802477,0.9999206462975099,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +351,Conv2d350Conv2d,"Conv2D(a=1x1920x32x32,b=1920x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d350Conv2dconv2d,MXU,1,Compute,493898,493898,42471,0,0,0,0,0,0,0,0,493898,30857,0,0,27361280,"DT_BFLOAT16:[1,1920,32,32],DT_BFLOAT16:[1920,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",22649241600,Conv2d350Conv2dconv2d,Conv2D,22118400,[],Conv2D,bf01;io01->bf01,"[[1, 1920, 32, 32], [1920, 640, 3, 3], [1, 640, 32, 32]]",1,15474688,5400,1,640,1024,17280,0,493898,27361280,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66181,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.85813589040652,51.594008751553964,0.9408263829931749,0.08599001458592327,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +352,Conv2d-GroupNorm352,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm352XnormGroupNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm352XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +353,Conv2d352Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d352Conv2dconv2d,MXU,1,Compute,164755,164755,15514,0,0,0,0,0,0,0,0,164755,10285,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d352Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 
32]]",1,10163200,1800,1,640,1024,5760,0,164755,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,22217,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.82408546022883,56.495167540438835,0.9401278037238386,0.09415861256739806,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +354,SkipConnection-Einsum349,"XlaEinsum(a=1x32x32x1920,b=1920x640,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum349einsum,MXU,1,Compute,55040,55040,11953,0,0,0,0,0,0,0,0,55040,3428,0,0,7700480,"DT_BFLOAT16:[1,32,32,1920],DT_BFLOAT16:[1920,640]","[DT_BFLOAT16:(1,32,32,640)]",2516582400,SkipConnectionEinsum349einsum,Einsum,2457600,[],Einsum,"BHWC,CO->BHWO","[[1, 32, 32, 1920], [1920, 640], [1, 32, 32, 640]]",1,4718592,600,1,1024,640,1920,0,55040,7700480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.72279069767442,130.29852578806322,0.9380496384600353,0.21716420964677202,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +355,SpatialTransformer-Input_GroupNorm355,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm355XnormGroupNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm355XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+356,SpatialTransformer-Proj_in356,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin356einsum,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin356einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +357,BasicTransformerBlock-Input_layernorm357,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm357XnormLayerNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm357XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +358,SelfAttention358-Q-358,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention358Q358MatMulQ,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention358Q358MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +358,SelfAttention358-K-358,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention358K358MatMulK,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention358K358MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +358,SelfAttention358-V-358,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention358V358MatMulV,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention358V358MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +359,SelfAttention358-FlashAttention-359,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",SelfAttention358FlashAttention359FlashAttention,MXU,1,Compute,93990,93990,8139,0,0,0,0,0,0,0,0,93990,17552,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention358FlashAttention359FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 128]",,872448,1024,8,1024,1024,80,11702,93990,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,12600,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.213000191509735,51.95034046175125,0.06591797267901735,0.08658390076958543,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +360,SelfAttention358-Attention_output-360,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention358Attentionoutput360MatMulattnOutputattnAvgWo,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention358Attentionoutput360MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+361,SelfAttention358-Attention_layernorm-361,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention358Attentionlayernorm361YnormLayerNormy,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention358Attentionlayernorm361YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +362,GatedSelfAttention-Linear362,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear362XLinearcontext,MXU,1,Memory,1561,342,1561,0,0,0,0,0,0,0,0,0,342,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear362XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 640]]",1,1005568,30,1,8,640,768,0,342,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,163,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.03800128122998,599.9411779057896,0.10335972954204102,0.999901963176316,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +363,GatedSelfAttention-Attn362-Q-363,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362Q363MatMulQ,MXU,1,Compute,20755,20755,5373,0,0,0,0,0,0,0,0,20755,1285,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn362Q363MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 
80]]",1,3461120,225,1,1032,640,640,0,20755,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,40.73304745844375,155.30807947256685,0.835679971820094,0.2588467991209447,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +363,GatedSelfAttention-Attn362-K-363,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362K363MatMulK,MXU,1,Compute,20755,20755,5373,0,0,0,0,0,0,0,0,20755,1285,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn362K363MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,20755,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,40.73304745844375,155.30807947256685,0.835679971820094,0.2588467991209447,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +363,GatedSelfAttention-Attn362-V-363,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362V363MatMulV,MXU,1,Compute,20755,20755,5373,0,0,0,0,0,0,0,0,20755,1285,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn362V363MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,20755,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,40.73304745844375,155.30807947256685,0.835679971820094,0.2588467991209447,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +364,GatedSelfAttention-Attn362-FlashAttention-364,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",GatedSelfAttentionAttn362FlashAttention364FlashAttention,MXU,1,Compute,118858,118858,8202,0,0,0,0,0,0,0,0,118858,19291,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn362FlashAttention364FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 128]",,879104,1296,8,1032,1032,80,11887,118858,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,15715,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.580616466708173,41.402004683372176,0.05294397622415337,0.06900334113895362,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +365,GatedSelfAttention-Attn362-Attention_output-365,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362Attentionoutput365MatMulattnOutputattnAvgWo,MXU,1,Compute,20755,20755,5373,0,0,0,0,0,0,0,0,20755,1285,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn362Attentionoutput365MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,225,1,1032,640,640,0,20755,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,40.73304745844375,155.30807947256685,0.835679971820094,0.2588467991209447,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+366,GatedSelfAttention-Attn362-Attention_layernorm-366,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362Attentionlayernorm366YnormLayerNormy,VPU,1,Memory,4101,1843,4101,0,0,0,0,0,0,0,0,0,1843,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn362Attentionlayernorm366YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1843,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2884272128749084,599.9706745496525,0.44936775002612594,0.9999511242494208,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +367,GatedSelfAttention-FFN362Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN362FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,73326,73326,15259,0,0,0,0,0,0,0,0,73326,4571,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN362FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,73326,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,124.85712349644055,0.9388257389528106,0.20809520582740093,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +368,GatedSelfAttention-FFN362Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN362FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,73326,73326,15259,0,0,0,0,0,0,0,0,73326,4571,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN362FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 
1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,73326,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,124.85712349644055,0.9388257389528106,0.20809520582740093,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +369,GatedSelfAttention-FFN362Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN362FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,4070,229,4070,0,0,0,0,0,0,0,0,0,229,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN362FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,229,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,483,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16102211302211303,599.8541154791155,0.05616005616005616,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +370,BasicTransformerBlock-Fuser_output_layernorm370,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm370XnormLayerNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm370XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+371,CrossAttention371-Q-371,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention371Q371MatMulQ,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention371Q371MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +371,CrossAttention371-K-371,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention371K371MatMulK,MXU,1,Compute,11155,11155,3764,0,0,0,0,0,0,0,0,11155,685,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention371K371MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,11155,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1788,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.12025818018826,202.44740307037202,0.9256880699388677,0.33741233845062,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +371,CrossAttention371-V-371,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention371V371MatMulV,MXU,1,Compute,11155,11155,3764,0,0,0,0,0,0,0,0,11155,685,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention371V371MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 
80]]",1,2424832,120,1,512,640,768,0,11155,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1788,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.12025818018826,202.44740307037202,0.9256880699388677,0.33741233845062,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +372,CrossAttention371-FlashAttention-372,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention371FlashAttention372FlashAttention,MXU,1,Compute,47178,47178,6104,0,0,0,0,0,0,0,0,47178,8775,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention371FlashAttention372FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,446464,512,8,512,1024,80,5851,47178,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6535,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.20053719954216,77.62324335495359,0.06566228170016578,0.12937207225825598,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +373,CrossAttention371-Attention_output-373,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention371Attentionoutput373MatMulattnOutputattnAvgWo,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention371Attentionoutput373MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+374,CrossAttention371-Attention_layernorm-374,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention371Attentionlayernorm374YnormLayerNormy,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention371Attentionlayernorm374YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +375,BasicTransformerBlock-Attn_output_layernorm375,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm375XnormLayerNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm375XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +376,BasicTransformerBlock-FFN376Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN376FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,73326,73326,15259,0,0,0,0,0,0,0,0,73326,4571,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN376FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 
2560]]",1,9830400,800,1,1024,2560,640,0,73326,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,124.85712349644055,0.9388257389528106,0.20809520582740093,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +377,BasicTransformerBlock-FFN376Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN376FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,73326,73326,15259,0,0,0,0,0,0,0,0,73326,4571,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN376FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,73326,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,124.85712349644055,0.9388257389528106,0.20809520582740093,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +378,BasicTransformerBlock-FFN376Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN376FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,4070,229,4070,0,0,0,0,0,0,0,0,0,229,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN376FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,229,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,483,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16102211302211303,599.8541154791155,0.05616005616005616,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+379,SpatialTransformer-Proj_out379,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout379einsum,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout379einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +380,Time-Embed-MLP-Einsum380,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum380einsum,MXU,1,Memory,2550,571,2550,0,0,0,0,0,0,0,0,0,571,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum380einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1314048,50,1,1,640,1280,0,571,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,266,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6425098039215686,599.7863470339307,0.013181743285549516,0.9996439117232179,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+381,Conv2d-GroupNorm381,"GroupNorm(x=1x1280x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm381XnormGroupNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,1280,32,32]","[DT_BFLOAT16:(1,1280,32,32)]",10485760,Conv2dGroupNorm381XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +382,Conv2d381Conv2d,"Conv2D(a=1x1280x32x32,b=1280x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d381Conv2dconv2d,MXU,1,Compute,329326,329326,28992,0,0,0,0,0,0,0,0,329326,20571,0,0,18677760,"DT_BFLOAT16:[1,1280,32,32],DT_BFLOAT16:[1280,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",15099494400,Conv2d381Conv2dconv2d,Conv2D,14745600,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 32, 32], [1280, 640, 3, 3], [1, 640, 32, 32]]",1,15474688,3600,1,640,1024,11520,0,329326,18677760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,44199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.849688150950726,52.82006137155888,0.9406530690107736,0.0880334356192648,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+383,Conv2d-GroupNorm383,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm383XnormGroupNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm383XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +384,Conv2d383Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d383Conv2dconv2d,MXU,1,Compute,164755,164755,15514,0,0,0,0,0,0,0,0,164755,10285,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d383Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,1800,1,640,1024,5760,0,164755,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,22217,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.82408546022883,56.495167540438835,0.9401278037238386,0.09415861256739806,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +385,SkipConnection-Einsum380,"XlaEinsum(a=1x32x32x1280,b=1280x640,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum380einsum,MXU,1,Compute,36755,36755,8647,0,0,0,0,0,0,0,0,36755,2285,0,0,5570560,"DT_BFLOAT16:[1,32,32,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,32,32,640)]",1677721600,SkipConnectionEinsum380einsum,Einsum,1638400,[],Einsum,"BHWC,CO->BHWO","[[1, 32, 32, 1280], [1280, 640], [1, 32, 32, 
640]]",1,4718592,400,1,1024,640,1280,0,36755,5570560,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5499,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.64607808461434,141.15054499387838,0.936475801040046,0.23525090832313064,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +386,SpatialTransformer-Input_GroupNorm386,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm386XnormGroupNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm386XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +387,SpatialTransformer-Proj_in387,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin387einsum,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin387einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+388,BasicTransformerBlock-Input_layernorm388,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm388XnormLayerNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm388XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +389,SelfAttention389-Q-389,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention389Q389MatMulQ,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention389Q389MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +389,SelfAttention389-K-389,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention389K389MatMulK,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention389K389MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +389,SelfAttention389-V-389,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention389V389MatMulV,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention389V389MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +390,SelfAttention389-FlashAttention-390,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention389FlashAttention390FlashAttention,MXU,1,Compute,93990,93990,8139,0,0,0,0,0,0,0,0,93990,17552,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention389FlashAttention390FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 128]",,872448,1024,8,1024,1024,80,11702,93990,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,12600,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.213000191509735,51.95034046175125,0.06591797267901735,0.08658390076958543,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+391,SelfAttention389-Attention_output-391,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention389Attentionoutput391MatMulattnOutputattnAvgWo,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention389Attentionoutput391MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +392,SelfAttention389-Attention_layernorm-392,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention389Attentionlayernorm392YnormLayerNormy,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention389Attentionlayernorm392YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +393,GatedSelfAttention-Linear393,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear393XLinearcontext,MXU,1,Memory,1561,342,1561,0,0,0,0,0,0,0,0,0,342,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear393XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 
640]]",1,1005568,30,1,8,640,768,0,342,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,163,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.03800128122998,599.9411779057896,0.10335972954204102,0.999901963176316,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +394,GatedSelfAttention-Attn393-Q-394,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393Q394MatMulQ,MXU,1,Compute,20755,20755,5373,0,0,0,0,0,0,0,0,20755,1285,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn393Q394MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,20755,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,40.73304745844375,155.30807947256685,0.835679971820094,0.2588467991209447,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +394,GatedSelfAttention-Attn393-K-394,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393K394MatMulK,MXU,1,Compute,20755,20755,5373,0,0,0,0,0,0,0,0,20755,1285,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn393K394MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,20755,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,40.73304745844375,155.30807947256685,0.835679971820094,0.2588467991209447,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+394,GatedSelfAttention-Attn393-V-394,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393V394MatMulV,MXU,1,Compute,20755,20755,5373,0,0,0,0,0,0,0,0,20755,1285,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn393V394MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,20755,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,40.73304745844375,155.30807947256685,0.835679971820094,0.2588467991209447,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +395,GatedSelfAttention-Attn393-FlashAttention-395,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn393FlashAttention395FlashAttention,MXU,1,Compute,118858,118858,8202,0,0,0,0,0,0,0,0,118858,19291,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn393FlashAttention395FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 128]",,879104,1296,8,1032,1032,80,11887,118858,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,15715,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.580616466708173,41.402004683372176,0.05294397622415337,0.06900334113895362,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+396,GatedSelfAttention-Attn393-Attention_output-396,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393Attentionoutput396MatMulattnOutputattnAvgWo,MXU,1,Compute,20755,20755,5373,0,0,0,0,0,0,0,0,20755,1285,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn393Attentionoutput396MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,225,1,1032,640,640,0,20755,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,40.73304745844375,155.30807947256685,0.835679971820094,0.2588467991209447,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +397,GatedSelfAttention-Attn393-Attention_layernorm-397,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393Attentionlayernorm397YnormLayerNormy,VPU,1,Memory,4101,1843,4101,0,0,0,0,0,0,0,0,0,1843,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn393Attentionlayernorm397YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1843,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2884272128749084,599.9706745496525,0.44936775002612594,0.9999511242494208,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+398,GatedSelfAttention-FFN393Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN393FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,73326,73326,15259,0,0,0,0,0,0,0,0,73326,4571,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN393FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,73326,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,124.85712349644055,0.9388257389528106,0.20809520582740093,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +399,GatedSelfAttention-FFN393Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN393FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,73326,73326,15259,0,0,0,0,0,0,0,0,73326,4571,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN393FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,73326,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,124.85712349644055,0.9388257389528106,0.20809520582740093,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+400,GatedSelfAttention-FFN393Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN393FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,4070,229,4070,0,0,0,0,0,0,0,0,0,229,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN393FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,229,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,483,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16102211302211303,599.8541154791155,0.05616005616005616,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +401,BasicTransformerBlock-Fuser_output_layernorm401,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm401XnormLayerNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm401XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +402,CrossAttention402-Q-402,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention402Q402MatMulQ,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention402Q402MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +402,CrossAttention402-K-402,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention402K402MatMulK,MXU,1,Compute,11155,11155,3764,0,0,0,0,0,0,0,0,11155,685,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention402K402MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,11155,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1788,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.12025818018826,202.44740307037202,0.9256880699388677,0.33741233845062,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +402,CrossAttention402-V-402,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention402V402MatMulV,MXU,1,Compute,11155,11155,3764,0,0,0,0,0,0,0,0,11155,685,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention402V402MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,11155,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1788,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.12025818018826,202.44740307037202,0.9256880699388677,0.33741233845062,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +403,CrossAttention402-FlashAttention-403,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention402FlashAttention403FlashAttention,MXU,1,Compute,47178,47178,6104,0,0,0,0,0,0,0,0,47178,8775,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention402FlashAttention403FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,446464,512,8,512,1024,80,5851,47178,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6535,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.20053719954216,77.62324335495359,0.06566228170016578,0.12937207225825598,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +404,CrossAttention402-Attention_output-404,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention402Attentionoutput404MatMulattnOutputattnAvgWo,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention402Attentionoutput404MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+405,CrossAttention402-Attention_layernorm-405,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention402Attentionlayernorm405YnormLayerNormy,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention402Attentionlayernorm405YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +406,BasicTransformerBlock-Attn_output_layernorm406,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm406XnormLayerNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm406XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +407,BasicTransformerBlock-FFN407Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN407FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,73326,73326,15259,0,0,0,0,0,0,0,0,73326,4571,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN407FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 
2560]]",1,9830400,800,1,1024,2560,640,0,73326,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,124.85712349644055,0.9388257389528106,0.20809520582740093,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +408,BasicTransformerBlock-FFN407Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN407FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,73326,73326,15259,0,0,0,0,0,0,0,0,73326,4571,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN407FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,73326,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,124.85712349644055,0.9388257389528106,0.20809520582740093,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +409,BasicTransformerBlock-FFN407Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN407FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,4070,229,4070,0,0,0,0,0,0,0,0,0,229,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN407FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,229,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,483,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16102211302211303,599.8541154791155,0.05616005616005616,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+410,SpatialTransformer-Proj_out410,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout410einsum,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout410einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +411,Time-Embed-MLP-Einsum411,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum411einsum,MXU,1,Memory,2550,571,2550,0,0,0,0,0,0,0,0,0,571,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum411einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1314048,50,1,1,640,1280,0,571,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,266,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6425098039215686,599.7863470339307,0.013181743285549516,0.9996439117232179,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+412,Conv2d-GroupNorm412,"GroupNorm(x=1x960x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm412XnormGroupNormX,VPU,1,Memory,6104,2743,6104,0,0,0,0,0,0,0,0,0,2743,0,0,3932160,"DT_BFLOAT16:[1,960,32,32]","[DT_BFLOAT16:(1,960,32,32)]",7864320,Conv2dGroupNorm412XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2743,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1324,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883879423328966,599.9523877785059,0.44935405354802477,0.9999206462975099,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +413,Conv2d412Conv2d,"Conv2D(a=1x960x32x32,b=960x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d412Conv2dconv2d,MXU,1,Compute,248869,248869,22253,0,0,0,0,0,0,0,0,248869,15542,0,0,14336000,"DT_BFLOAT16:[1,960,32,32],DT_BFLOAT16:[960,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",11324620800,Conv2d412Conv2dconv2d,Conv2D,11059200,[],Conv2D,bf01;io01->bf01,"[[1, 960, 32, 32], [960, 640, 3, 3], [1, 640, 32, 32]]",1,14589440,2720,1,640,1024,8640,0,248869,14336000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33436,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.50434485612912,53.64846738520065,0.9335679994446134,0.08941411230866775,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+414,Conv2d-GroupNorm414,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm414XnormGroupNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm414XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +415,Conv2d414Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d414Conv2dconv2d,MXU,1,Compute,164755,164755,15514,0,0,0,0,0,0,0,0,164755,10285,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d414Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,1800,1,640,1024,5760,0,164755,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,22217,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.82408546022883,56.495167540438835,0.9401278037238386,0.09415861256739806,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +416,SkipConnection-Einsum411,"XlaEinsum(a=1x32x32x960,b=960x640,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum411einsum,MXU,1,Compute,29441,29441,6994,0,0,0,0,0,0,0,0,29441,1828,0,0,4505600,"DT_BFLOAT16:[1,32,32,960],DT_BFLOAT16:[960,640]","[DT_BFLOAT16:(1,32,32,640)]",1258291200,SkipConnectionEinsum411einsum,Einsum,1228800,[],Einsum,"BHWC,CO->BHWO","[[1, 32, 32, 960], [960, 640], [1, 32, 32, 
640]]",1,4505600,320,1,1024,640,960,0,29441,4505600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4411,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,42.739417818688224,142.5280048975069,0.8768427040664437,0.23754667482917816,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +417,SpatialTransformer-Input_GroupNorm417,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm417XnormGroupNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm417XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +418,SpatialTransformer-Proj_in418,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin418einsum,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin418einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+419,BasicTransformerBlock-Input_layernorm419,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm419XnormLayerNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm419XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +420,SelfAttention420-Q-420,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention420Q420MatMulQ,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention420Q420MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +420,SelfAttention420-K-420,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention420K420MatMulK,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention420K420MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +420,SelfAttention420-V-420,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention420V420MatMulV,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention420V420MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +421,SelfAttention420-FlashAttention-421,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention420FlashAttention421FlashAttention,MXU,1,Compute,93990,93990,8139,0,0,0,0,0,0,0,0,93990,17552,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention420FlashAttention421FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 128]",,872448,1024,8,1024,1024,80,11702,93990,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,12600,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.213000191509735,51.95034046175125,0.06591797267901735,0.08658390076958543,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+422,SelfAttention420-Attention_output-422,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention420Attentionoutput422MatMulattnOutputattnAvgWo,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention420Attentionoutput422MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +423,SelfAttention420-Attention_layernorm-423,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention420Attentionlayernorm423YnormLayerNormy,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention420Attentionlayernorm423YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +424,GatedSelfAttention-Linear424,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear424XLinearcontext,MXU,1,Memory,1561,342,1561,0,0,0,0,0,0,0,0,0,342,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear424XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 
640]]",1,1005568,30,1,8,640,768,0,342,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,163,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.03800128122998,599.9411779057896,0.10335972954204102,0.999901963176316,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +425,GatedSelfAttention-Attn424-Q-425,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424Q425MatMulQ,MXU,1,Compute,20755,20755,5373,0,0,0,0,0,0,0,0,20755,1285,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn424Q425MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,20755,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,40.73304745844375,155.30807947256685,0.835679971820094,0.2588467991209447,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +425,GatedSelfAttention-Attn424-K-425,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424K425MatMulK,MXU,1,Compute,20755,20755,5373,0,0,0,0,0,0,0,0,20755,1285,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn424K425MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,20755,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,40.73304745844375,155.30807947256685,0.835679971820094,0.2588467991209447,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+425,GatedSelfAttention-Attn424-V-425,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424V425MatMulV,MXU,1,Compute,20755,20755,5373,0,0,0,0,0,0,0,0,20755,1285,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn424V425MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,20755,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,40.73304745844375,155.30807947256685,0.835679971820094,0.2588467991209447,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +426,GatedSelfAttention-Attn424-FlashAttention-426,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn424FlashAttention426FlashAttention,MXU,1,Compute,118858,118858,8202,0,0,0,0,0,0,0,0,118858,19291,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn424FlashAttention426FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 128]",,879104,1296,8,1032,1032,80,11887,118858,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,15715,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.580616466708173,41.402004683372176,0.05294397622415337,0.06900334113895362,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+427,GatedSelfAttention-Attn424-Attention_output-427,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424Attentionoutput427MatMulattnOutputattnAvgWo,MXU,1,Compute,20755,20755,5373,0,0,0,0,0,0,0,0,20755,1285,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn424Attentionoutput427MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,225,1,1032,640,640,0,20755,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,40.73304745844375,155.30807947256685,0.835679971820094,0.2588467991209447,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +428,GatedSelfAttention-Attn424-Attention_layernorm-428,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424Attentionlayernorm428YnormLayerNormy,VPU,1,Memory,4101,1843,4101,0,0,0,0,0,0,0,0,0,1843,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn424Attentionlayernorm428YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1843,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2884272128749084,599.9706745496525,0.44936775002612594,0.9999511242494208,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+429,GatedSelfAttention-FFN424Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN424FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,73326,73326,15259,0,0,0,0,0,0,0,0,73326,4571,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN424FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,73326,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,124.85712349644055,0.9388257389528106,0.20809520582740093,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +430,GatedSelfAttention-FFN424Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN424FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,73326,73326,15259,0,0,0,0,0,0,0,0,73326,4571,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN424FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,73326,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,124.85712349644055,0.9388257389528106,0.20809520582740093,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+431,GatedSelfAttention-FFN424Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN424FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,4070,229,4070,0,0,0,0,0,0,0,0,0,229,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN424FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,229,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,483,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16102211302211303,599.8541154791155,0.05616005616005616,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +432,BasicTransformerBlock-Fuser_output_layernorm432,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm432XnormLayerNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm432XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +433,CrossAttention433-Q-433,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention433Q433MatMulQ,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention433Q433MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +433,CrossAttention433-K-433,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention433K433MatMulK,MXU,1,Compute,11155,11155,3764,0,0,0,0,0,0,0,0,11155,685,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention433K433MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,11155,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1788,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.12025818018826,202.44740307037202,0.9256880699388677,0.33741233845062,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +433,CrossAttention433-V-433,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention433V433MatMulV,MXU,1,Compute,11155,11155,3764,0,0,0,0,0,0,0,0,11155,685,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention433V433MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,11155,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1788,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.12025818018826,202.44740307037202,0.9256880699388677,0.33741233845062,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +434,CrossAttention433-FlashAttention-434,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention433FlashAttention434FlashAttention,MXU,1,Compute,47178,47178,6104,0,0,0,0,0,0,0,0,47178,8775,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention433FlashAttention434FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,446464,512,8,512,1024,80,5851,47178,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6535,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.20053719954216,77.62324335495359,0.06566228170016578,0.12937207225825598,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +435,CrossAttention433-Attention_output-435,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention433Attentionoutput435MatMulattnOutputattnAvgWo,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention433Attentionoutput435MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+436,CrossAttention433-Attention_layernorm-436,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention433Attentionlayernorm436YnormLayerNormy,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention433Attentionlayernorm436YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +437,BasicTransformerBlock-Attn_output_layernorm437,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm437XnormLayerNormX,VPU,1,Memory,4070,1829,4070,0,0,0,0,0,0,0,0,0,1829,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm437XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1829,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2881769041769042,599.8541154791155,0.4492804492804493,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +438,BasicTransformerBlock-FFN438Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN438FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,73326,73326,15259,0,0,0,0,0,0,0,0,73326,4571,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN438FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 
2560]]",1,9830400,800,1,1024,2560,640,0,73326,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,124.85712349644055,0.9388257389528106,0.20809520582740093,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +439,BasicTransformerBlock-FFN438Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN438FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,73326,73326,15259,0,0,0,0,0,0,0,0,73326,4571,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN438FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,73326,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.76061969833347,124.85712349644055,0.9388257389528106,0.20809520582740093,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +440,BasicTransformerBlock-FFN438Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN438FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,4070,229,4070,0,0,0,0,0,0,0,0,0,229,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN438FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,229,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,483,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16102211302211303,599.8541154791155,0.05616005616005616,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+441,SpatialTransformer-Proj_out441,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout441einsum,MXU,1,Compute,18469,18469,5341,0,0,0,0,0,0,0,0,18469,1142,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout441einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,18469,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.419936109155884,173.49860323379718,0.9318362679957468,0.2891643387229953,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +442,Upsample442,"Upsample(a=1x640x32x32,scale_factor=2,memory_placements=0_0_0,type=DT_BFLOAT16)",Upsample442Upsample,VPU,1,Memory,10173,0,10173,0,0,0,0,0,0,0,0,0,0,0,0,6553600,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,64,64)]",0,Upsample442Upsample,Upsample,0,[],Upsample,,,,,0,,,,,0,0,6553600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1064,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,599.972046102428,0.0,0.9999534101707133,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +442,Upsample-Conv2d442Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",UpsampleConv2d442Conv2dconv2d,MXU,1,Compute,164755,164755,15514,0,0,0,0,0,0,0,0,164755,10285,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,UpsampleConv2d442Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 
32]]",1,10163200,1800,1,640,1024,5760,0,164755,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,22217,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.82408546022883,56.495167540438835,0.9401278037238386,0.09415861256739806,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +443,Time-Embed-MLP-Einsum443,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum443einsum,MXU,1,Memory,1277,285,1277,0,0,0,0,0,0,0,0,0,285,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum443einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 320]]",1,658048,30,1,1,320,1280,0,285,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,133,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6415035238841034,599.7804897132103,0.013161098425274574,0.9996341495220171,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +444,Conv2d-GroupNorm444,"GroupNorm(x=1x960x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm444XnormGroupNormX,VPU,1,Memory,24415,10972,24415,0,0,0,0,0,0,0,0,0,10972,0,0,15728640,"DT_BFLOAT16:[1,960,64,64]","[DT_BFLOAT16:(1,960,64,64)]",31457280,Conv2dGroupNorm444XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,10972,15728640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5297,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2884407126766333,599.9769608847021,0.4493724583833124,0.9999616014745034,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +445,Conv2d444Conv2d,"Conv2D(a=1x960x64x64,b=960x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d444Conv2dconv2d,MXU,1,Compute,597029,597029,24860,0,0,0,0,0,0,0,0,597029,37302,0,0,16015360,"DT_BFLOAT16:[1,960,64,64],DT_BFLOAT16:[960,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",22649241600,Conv2d444Conv2dconv2d,Conv2D,5529600,[],Conv2D,bf01;io01->bf01,"[[1, 960, 64, 64], [960, 320, 3, 3], [1, 320, 64, 64]]",1,16514560,6528,1,320,4096,8640,0,597029,16015360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,77229,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,37.93658532500096,24.98281709698147,0.7783077018160978,0.041638028494969115,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +446,Conv2d-GroupNorm446,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm446XnormGroupNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm446XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +447,Conv2d446Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d446Conv2dconv2d,MXU,1,Compute,202058,202058,11000,0,0,0,0,0,0,0,0,202058,12617,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d446Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 
64]]",1,7252480,2208,1,320,4096,2880,0,202058,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,26408,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,37.36425778736798,32.66104915188337,0.7665658192327006,0.05443508191980562,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +448,SkipConnection-Einsum443,"XlaEinsum(a=1x64x64x960,b=960x320,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum443einsum,MXU,1,Compute,70400,70400,17230,0,0,0,0,0,0,0,0,70400,4388,0,0,11100160,"DT_BFLOAT16:[1,64,64,960],DT_BFLOAT16:[960,320]","[DT_BFLOAT16:(1,64,64,320)]",2516582400,SkipConnectionEinsum443einsum,Einsum,614400,[],Einsum,"BHWC,CO->BHWO","[[1, 64, 64, 960], [960, 320], [1, 64, 64, 320]]",1,11100160,768,1,4096,320,960,0,70400,11100160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10602,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,35.74690909090909,146.84417031028053,0.7333842627960275,0.24474028385046756,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +449,SpatialTransformer-Input_GroupNorm449,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm449XnormGroupNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm449XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+450,SpatialTransformer-Proj_in450,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin450einsum,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin450einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +451,BasicTransformerBlock-Input_layernorm451,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm451XnormLayerNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm451XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +452,SelfAttention452-Q-452,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention452Q452MatMulQ,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention452Q452MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +452,SelfAttention452-K-452,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention452K452MatMulK,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention452K452MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +452,SelfAttention452-V-452,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention452V452MatMulV,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention452V452MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +453,SelfAttention452-FlashAttention-453,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",SelfAttention452FlashAttention453FlashAttention,MXU,1,Compute,1498332,1498332,16277,0,0,0,0,0,0,0,0,1498332,280867,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention452FlashAttention453FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,2762752,16384,8,4096,4096,40,187245,1498332,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,188994,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.224811462346129,6.517664309378696,0.06616029293481915,0.010862773848964493,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +454,SelfAttention452-Attention_output-454,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention452Attentionoutput454MatMulattnOutputattnAvgWo,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention452Attentionoutput454MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+455,SelfAttention452-Attention_layernorm-455,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention452Attentionlayernorm455YnormLayerNormy,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention452Attentionlayernorm455YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +456,GatedSelfAttention-Linear456,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear456XLinearcontext,MXU,1,Memory,790,171,790,0,0,0,0,0,0,0,0,0,171,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear456XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 320]]",1,508928,18,1,8,320,768,0,171,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,82,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.977417721518988,599.9697914606409,0.10211679608552283,0.9999496524344015,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +457,GatedSelfAttention-Attn456-Q-457,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456Q457MatMulQ,MXU,1,Compute,27338,27338,8472,0,0,0,0,0,0,0,0,27338,1697,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn456Q457MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 
40]]",1,5457920,297,1,4104,320,320,0,27338,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,30.744721632891945,185.93474674245783,0.6307592903281731,0.30989124457076306,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +457,GatedSelfAttention-Attn456-K-457,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456K457MatMulK,MXU,1,Compute,27338,27338,8472,0,0,0,0,0,0,0,0,27338,1697,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn456K457MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,27338,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,30.744721632891945,185.93474674245783,0.6307592903281731,0.30989124457076306,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +457,GatedSelfAttention-Attn456-V-457,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456V457MatMulV,MXU,1,Compute,27338,27338,8472,0,0,0,0,0,0,0,0,27338,1697,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn456V457MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,27338,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,30.744721632891945,185.93474674245783,0.6307592903281731,0.30989124457076306,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +458,GatedSelfAttention-Attn456-FlashAttention-458,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",GatedSelfAttentionAttn456FlashAttention458FlashAttention,MXU,1,Compute,1593418,1593418,16308,0,0,0,0,0,0,0,0,1593418,287542,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn456FlashAttention458FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 128]",,2768128,17424,8,4104,4104,40,187978,1593418,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,200883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.044230081497761,6.140697849734423,0.0624554819109802,0.010234496416224037,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +459,GatedSelfAttention-Attn456-Attention_output-459,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456Attentionoutput459MatMulattnOutputattnAvgWo,MXU,1,Compute,27338,27338,8472,0,0,0,0,0,0,0,0,27338,1697,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn456Attentionoutput459MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,297,1,4104,320,320,0,27338,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,30.744721632891945,185.93474674245783,0.6307592903281731,0.30989124457076306,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+460,GatedSelfAttention-Attn456-Attention_layernorm-460,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456Attentionlayernorm460YnormLayerNormy,VPU,1,Memory,8154,3665,8154,0,0,0,0,0,0,0,0,0,3665,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn456Attentionlayernorm460YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3665,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1769,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.288476821192053,599.9937752224752,0.44938505203405865,0.999989625370792,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +461,GatedSelfAttention-FFN456Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN456FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,87955,87955,21617,0,0,0,0,0,0,0,0,87955,5485,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN456FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,87955,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.149544653516,147.4614371340458,0.7826767794264543,0.24576906189007636,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +462,GatedSelfAttention-FFN456Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN456FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,87955,87955,21617,0,0,0,0,0,0,0,0,87955,5485,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN456FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 
4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,87955,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.149544653516,147.4614371340458,0.7826767794264543,0.24576906189007636,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +463,GatedSelfAttention-FFN456Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN456FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,8139,458,8139,0,0,0,0,0,0,0,0,0,458,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN456FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,458,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,966,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16104189703894828,599.9278166850964,0.05616695627753498,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +464,BasicTransformerBlock-Fuser_output_layernorm464,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm464XnormLayerNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm464XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+465,CrossAttention465-Q-465,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention465Q465MatMulQ,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention465Q465MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +465,CrossAttention465-K-465,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention465K465MatMulK,MXU,1,Compute,6766,6766,2493,0,0,0,0,0,0,0,0,6766,411,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention465K465MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,6766,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1106,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,37.194537392846584,221.01113333210168,0.7630838324097006,0.3683518888868361,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +465,CrossAttention465-V-465,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention465V465MatMulV,MXU,1,Compute,6766,6766,2493,0,0,0,0,0,0,0,0,6766,411,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention465V465MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 
40]]",1,1605632,72,1,512,320,768,0,6766,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1106,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,37.194537392846584,221.01113333210168,0.7630838324097006,0.3683518888868361,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +466,CrossAttention465-FlashAttention-466,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention465FlashAttention466FlashAttention,MXU,1,Compute,187612,187612,9156,0,0,0,0,0,0,0,0,187612,35107,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention465FlashAttention466FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,354304,2048,8,512,4096,40,23405,187612,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,24409,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.2193024753214083,29.279385447092938,0.06604727045285846,0.0487989757451549,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +467,CrossAttention465-Attention_output-467,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention465Attentionoutput467MatMulattnOutputattnAvgWo,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention465Attentionoutput467MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+468,CrossAttention465-Attention_layernorm-468,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention465Attentionlayernorm468YnormLayerNormy,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention465Attentionlayernorm468YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +469,BasicTransformerBlock-Attn_output_layernorm469,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm469XnormLayerNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm469XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +470,BasicTransformerBlock-FFN470Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN470FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,87955,87955,21617,0,0,0,0,0,0,0,0,87955,5485,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN470FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 
1280]]",1,13926400,960,1,4096,1280,320,0,87955,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.149544653516,147.4614371340458,0.7826767794264543,0.24576906189007636,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +471,BasicTransformerBlock-FFN470Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN470FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,87955,87955,21617,0,0,0,0,0,0,0,0,87955,5485,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN470FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,87955,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.149544653516,147.4614371340458,0.7826767794264543,0.24576906189007636,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +472,BasicTransformerBlock-FFN470Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN470FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,8139,458,8139,0,0,0,0,0,0,0,0,0,458,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN470FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,458,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,966,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16104189703894828,599.9278166850964,0.05616695627753498,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+473,SpatialTransformer-Proj_out473,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout473einsum,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout473einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +474,Time-Embed-MLP-Einsum474,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum474einsum,MXU,1,Memory,1277,285,1277,0,0,0,0,0,0,0,0,0,285,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum474einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 320]]",1,658048,30,1,1,320,1280,0,285,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,133,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6415035238841034,599.7804897132103,0.013161098425274574,0.9996341495220171,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+475,Conv2d-GroupNorm475,"GroupNorm(x=1x640x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm475XnormGroupNormX,VPU,1,Memory,16277,7315,16277,0,0,0,0,0,0,0,0,0,7315,0,0,10485760,"DT_BFLOAT16:[1,640,64,64]","[DT_BFLOAT16:(1,640,64,64)]",20971520,Conv2dGroupNorm475XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,7315,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3531,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2884143269644281,599.9646740799901,0.4493632557772141,0.9999411234666502,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +476,Conv2d475Conv2d,"Conv2D(a=1x640x64x64,b=640x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d475Conv2dconv2d,MXU,1,Compute,395155,395155,17930,0,0,0,0,0,0,0,0,395155,24685,0,0,11550720,"DT_BFLOAT16:[1,640,64,64],DT_BFLOAT16:[640,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",15099494400,Conv2d475Conv2dconv2d,Conv2D,3686400,[],Conv2D,bf01;io01->bf01,"[[1, 640, 64, 64], [640, 320, 3, 3], [1, 320, 64, 64]]",1,11883520,4320,1,320,4096,5760,0,395155,11550720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,51270,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.21157368627501,27.223358654357153,0.7839493682353559,0.04537226442392859,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+477,Conv2d-GroupNorm477,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm477XnormGroupNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm477XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +478,Conv2d477Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d477Conv2dconv2d,MXU,1,Compute,202058,202058,11000,0,0,0,0,0,0,0,0,202058,12617,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d477Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,2208,1,320,4096,2880,0,202058,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,26408,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,37.36425778736798,32.66104915188337,0.7665658192327006,0.05443508191980562,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +479,SkipConnection-Einsum474,"XlaEinsum(a=1x64x64x640,b=640x320,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum474einsum,MXU,1,Compute,44069,44069,12843,0,0,0,0,0,0,0,0,44069,2742,0,0,8273920,"DT_BFLOAT16:[1,64,64,640],DT_BFLOAT16:[640,320]","[DT_BFLOAT16:(1,64,64,320)]",1677721600,SkipConnectionEinsum474einsum,Einsum,409600,[],Einsum,"BHWC,CO->BHWO","[[1, 64, 64, 640], [640, 320], [1, 64, 64, 
320]]",1,8273920,480,1,4096,320,640,0,44069,8273920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6852,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.070335156232275,174.85507900253012,0.7810517158825228,0.2914251316708835,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +480,SpatialTransformer-Input_GroupNorm480,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm480XnormGroupNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm480XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +481,SpatialTransformer-Proj_in481,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin481einsum,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin481einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+482,BasicTransformerBlock-Input_layernorm482,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm482XnormLayerNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm482XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +483,SelfAttention483-Q-483,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention483Q483MatMulQ,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention483Q483MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +483,SelfAttention483-K-483,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention483K483MatMulK,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention483K483MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +483,SelfAttention483-V-483,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention483V483MatMulV,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention483V483MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +484,SelfAttention483-FlashAttention-484,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention483FlashAttention484FlashAttention,MXU,1,Compute,1498332,1498332,16277,0,0,0,0,0,0,0,0,1498332,280867,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention483FlashAttention484FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,2762752,16384,8,4096,4096,40,187245,1498332,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,188994,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.224811462346129,6.517664309378696,0.06616029293481915,0.010862773848964493,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+485,SelfAttention483-Attention_output-485,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention483Attentionoutput485MatMulattnOutputattnAvgWo,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention483Attentionoutput485MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +486,SelfAttention483-Attention_layernorm-486,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention483Attentionlayernorm486YnormLayerNormy,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention483Attentionlayernorm486YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +487,GatedSelfAttention-Linear487,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear487XLinearcontext,MXU,1,Memory,790,171,790,0,0,0,0,0,0,0,0,0,171,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear487XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 
320]]",1,508928,18,1,8,320,768,0,171,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,82,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.977417721518988,599.9697914606409,0.10211679608552283,0.9999496524344015,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +488,GatedSelfAttention-Attn487-Q-488,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487Q488MatMulQ,MXU,1,Compute,27338,27338,8472,0,0,0,0,0,0,0,0,27338,1697,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn487Q488MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,27338,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,30.744721632891945,185.93474674245783,0.6307592903281731,0.30989124457076306,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +488,GatedSelfAttention-Attn487-K-488,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487K488MatMulK,MXU,1,Compute,27338,27338,8472,0,0,0,0,0,0,0,0,27338,1697,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn487K488MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,27338,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,30.744721632891945,185.93474674245783,0.6307592903281731,0.30989124457076306,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+488,GatedSelfAttention-Attn487-V-488,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487V488MatMulV,MXU,1,Compute,27338,27338,8472,0,0,0,0,0,0,0,0,27338,1697,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn487V488MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,27338,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,30.744721632891945,185.93474674245783,0.6307592903281731,0.30989124457076306,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +489,GatedSelfAttention-Attn487-FlashAttention-489,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn487FlashAttention489FlashAttention,MXU,1,Compute,1593418,1593418,16308,0,0,0,0,0,0,0,0,1593418,287542,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn487FlashAttention489FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 128]",,2768128,17424,8,4104,4104,40,187978,1593418,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,200883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.044230081497761,6.140697849734423,0.0624554819109802,0.010234496416224037,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+490,GatedSelfAttention-Attn487-Attention_output-490,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487Attentionoutput490MatMulattnOutputattnAvgWo,MXU,1,Compute,27338,27338,8472,0,0,0,0,0,0,0,0,27338,1697,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn487Attentionoutput490MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,297,1,4104,320,320,0,27338,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,30.744721632891945,185.93474674245783,0.6307592903281731,0.30989124457076306,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +491,GatedSelfAttention-Attn487-Attention_layernorm-491,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487Attentionlayernorm491YnormLayerNormy,VPU,1,Memory,8154,3665,8154,0,0,0,0,0,0,0,0,0,3665,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn487Attentionlayernorm491YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3665,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1769,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.288476821192053,599.9937752224752,0.44938505203405865,0.999989625370792,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+492,GatedSelfAttention-FFN487Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN487FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,87955,87955,21617,0,0,0,0,0,0,0,0,87955,5485,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN487FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,87955,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.149544653516,147.4614371340458,0.7826767794264543,0.24576906189007636,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +493,GatedSelfAttention-FFN487Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN487FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,87955,87955,21617,0,0,0,0,0,0,0,0,87955,5485,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN487FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,87955,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.149544653516,147.4614371340458,0.7826767794264543,0.24576906189007636,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+494,GatedSelfAttention-FFN487Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN487FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,8139,458,8139,0,0,0,0,0,0,0,0,0,458,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN487FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,458,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,966,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16104189703894828,599.9278166850964,0.05616695627753498,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +495,BasicTransformerBlock-Fuser_output_layernorm495,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm495XnormLayerNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm495XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +496,CrossAttention496-Q-496,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention496Q496MatMulQ,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention496Q496MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +496,CrossAttention496-K-496,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention496K496MatMulK,MXU,1,Compute,6766,6766,2493,0,0,0,0,0,0,0,0,6766,411,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention496K496MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,6766,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1106,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,37.194537392846584,221.01113333210168,0.7630838324097006,0.3683518888868361,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +496,CrossAttention496-V-496,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention496V496MatMulV,MXU,1,Compute,6766,6766,2493,0,0,0,0,0,0,0,0,6766,411,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention496V496MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,6766,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1106,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,37.194537392846584,221.01113333210168,0.7630838324097006,0.3683518888868361,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +497,CrossAttention496-FlashAttention-497,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention496FlashAttention497FlashAttention,MXU,1,Compute,187612,187612,9156,0,0,0,0,0,0,0,0,187612,35107,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention496FlashAttention497FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,354304,2048,8,512,4096,40,23405,187612,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,24409,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.2193024753214083,29.279385447092938,0.06604727045285846,0.0487989757451549,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +498,CrossAttention496-Attention_output-498,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention496Attentionoutput498MatMulattnOutputattnAvgWo,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention496Attentionoutput498MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+499,CrossAttention496-Attention_layernorm-499,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention496Attentionlayernorm499YnormLayerNormy,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention496Attentionlayernorm499YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +500,BasicTransformerBlock-Attn_output_layernorm500,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm500XnormLayerNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm500XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +501,BasicTransformerBlock-FFN501Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN501FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,87955,87955,21617,0,0,0,0,0,0,0,0,87955,5485,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN501FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 
1280]]",1,13926400,960,1,4096,1280,320,0,87955,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.149544653516,147.4614371340458,0.7826767794264543,0.24576906189007636,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +502,BasicTransformerBlock-FFN501Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN501FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,87955,87955,21617,0,0,0,0,0,0,0,0,87955,5485,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN501FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,87955,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.149544653516,147.4614371340458,0.7826767794264543,0.24576906189007636,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +503,BasicTransformerBlock-FFN501Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN501FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,8139,458,8139,0,0,0,0,0,0,0,0,0,458,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN501FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,458,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,966,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16104189703894828,599.9278166850964,0.05616695627753498,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+504,SpatialTransformer-Proj_out504,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout504einsum,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout504einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +505,Time-Embed-MLP-Einsum505,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum505einsum,MXU,1,Memory,1277,285,1277,0,0,0,0,0,0,0,0,0,285,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum505einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 320]]",1,658048,30,1,1,320,1280,0,285,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,133,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6415035238841034,599.7804897132103,0.013161098425274574,0.9996341495220171,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+506,Conv2d-GroupNorm506,"GroupNorm(x=1x640x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm506XnormGroupNormX,VPU,1,Memory,16277,7315,16277,0,0,0,0,0,0,0,0,0,7315,0,0,10485760,"DT_BFLOAT16:[1,640,64,64]","[DT_BFLOAT16:(1,640,64,64)]",20971520,Conv2dGroupNorm506XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,7315,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3531,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2884143269644281,599.9646740799901,0.4493632557772141,0.9999411234666502,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +507,Conv2d506Conv2d,"Conv2D(a=1x640x64x64,b=640x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d506Conv2dconv2d,MXU,1,Compute,395155,395155,17930,0,0,0,0,0,0,0,0,395155,24685,0,0,11550720,"DT_BFLOAT16:[1,640,64,64],DT_BFLOAT16:[640,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",15099494400,Conv2d506Conv2dconv2d,Conv2D,3686400,[],Conv2D,bf01;io01->bf01,"[[1, 640, 64, 64], [640, 320, 3, 3], [1, 320, 64, 64]]",1,11883520,4320,1,320,4096,5760,0,395155,11550720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,51270,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.21157368627501,27.223358654357153,0.7839493682353559,0.04537226442392859,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+508,Conv2d-GroupNorm508,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm508XnormGroupNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm508XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +509,Conv2d508Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d508Conv2dconv2d,MXU,1,Compute,202058,202058,11000,0,0,0,0,0,0,0,0,202058,12617,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d508Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,2208,1,320,4096,2880,0,202058,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,26408,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,37.36425778736798,32.66104915188337,0.7665658192327006,0.05443508191980562,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +510,SkipConnection-Einsum505,"XlaEinsum(a=1x64x64x640,b=640x320,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum505einsum,MXU,1,Compute,44069,44069,12843,0,0,0,0,0,0,0,0,44069,2742,0,0,8273920,"DT_BFLOAT16:[1,64,64,640],DT_BFLOAT16:[640,320]","[DT_BFLOAT16:(1,64,64,320)]",1677721600,SkipConnectionEinsum505einsum,Einsum,409600,[],Einsum,"BHWC,CO->BHWO","[[1, 64, 64, 640], [640, 320], [1, 64, 64, 
320]]",1,8273920,480,1,4096,320,640,0,44069,8273920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6852,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.070335156232275,174.85507900253012,0.7810517158825228,0.2914251316708835,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +511,SpatialTransformer-Input_GroupNorm511,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm511XnormGroupNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm511XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +512,SpatialTransformer-Proj_in512,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin512einsum,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin512einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+513,BasicTransformerBlock-Input_layernorm513,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm513XnormLayerNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm513XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +514,SelfAttention514-Q-514,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention514Q514MatMulQ,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention514Q514MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +514,SelfAttention514-K-514,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention514K514MatMulK,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention514K514MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +514,SelfAttention514-V-514,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention514V514MatMulV,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention514V514MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +515,SelfAttention514-FlashAttention-515,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention514FlashAttention515FlashAttention,MXU,1,Compute,1498332,1498332,16277,0,0,0,0,0,0,0,0,1498332,280867,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention514FlashAttention515FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,2762752,16384,8,4096,4096,40,187245,1498332,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,188994,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.224811462346129,6.517664309378696,0.06616029293481915,0.010862773848964493,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+516,SelfAttention514-Attention_output-516,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention514Attentionoutput516MatMulattnOutputattnAvgWo,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention514Attentionoutput516MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +517,SelfAttention514-Attention_layernorm-517,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention514Attentionlayernorm517YnormLayerNormy,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention514Attentionlayernorm517YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +518,GatedSelfAttention-Linear518,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear518XLinearcontext,MXU,1,Memory,790,171,790,0,0,0,0,0,0,0,0,0,171,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear518XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 
320]]",1,508928,18,1,8,320,768,0,171,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,82,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.977417721518988,599.9697914606409,0.10211679608552283,0.9999496524344015,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +519,GatedSelfAttention-Attn518-Q-519,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518Q519MatMulQ,MXU,1,Compute,27338,27338,8472,0,0,0,0,0,0,0,0,27338,1697,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn518Q519MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,27338,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,30.744721632891945,185.93474674245783,0.6307592903281731,0.30989124457076306,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +519,GatedSelfAttention-Attn518-K-519,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518K519MatMulK,MXU,1,Compute,27338,27338,8472,0,0,0,0,0,0,0,0,27338,1697,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn518K519MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,27338,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,30.744721632891945,185.93474674245783,0.6307592903281731,0.30989124457076306,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+519,GatedSelfAttention-Attn518-V-519,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518V519MatMulV,MXU,1,Compute,27338,27338,8472,0,0,0,0,0,0,0,0,27338,1697,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn518V519MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,27338,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,30.744721632891945,185.93474674245783,0.6307592903281731,0.30989124457076306,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +520,GatedSelfAttention-Attn518-FlashAttention-520,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn518FlashAttention520FlashAttention,MXU,1,Compute,1593418,1593418,16308,0,0,0,0,0,0,0,0,1593418,287542,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn518FlashAttention520FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 128]",,2768128,17424,8,4104,4104,40,187978,1593418,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,200883,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.044230081497761,6.140697849734423,0.0624554819109802,0.010234496416224037,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+521,GatedSelfAttention-Attn518-Attention_output-521,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518Attentionoutput521MatMulattnOutputattnAvgWo,MXU,1,Compute,27338,27338,8472,0,0,0,0,0,0,0,0,27338,1697,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn518Attentionoutput521MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,297,1,4104,320,320,0,27338,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,30.744721632891945,185.93474674245783,0.6307592903281731,0.30989124457076306,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +522,GatedSelfAttention-Attn518-Attention_layernorm-522,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518Attentionlayernorm522YnormLayerNormy,VPU,1,Memory,8154,3665,8154,0,0,0,0,0,0,0,0,0,3665,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn518Attentionlayernorm522YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3665,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1769,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.288476821192053,599.9937752224752,0.44938505203405865,0.999989625370792,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+523,GatedSelfAttention-FFN518Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN518FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,87955,87955,21617,0,0,0,0,0,0,0,0,87955,5485,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN518FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,87955,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.149544653516,147.4614371340458,0.7826767794264543,0.24576906189007636,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +524,GatedSelfAttention-FFN518Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN518FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,87955,87955,21617,0,0,0,0,0,0,0,0,87955,5485,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN518FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,87955,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.149544653516,147.4614371340458,0.7826767794264543,0.24576906189007636,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+525,GatedSelfAttention-FFN518Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN518FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,8139,458,8139,0,0,0,0,0,0,0,0,0,458,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN518FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,458,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,966,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16104189703894828,599.9278166850964,0.05616695627753498,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +526,BasicTransformerBlock-Fuser_output_layernorm526,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm526XnormLayerNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm526XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +527,CrossAttention527-Q-527,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention527Q527MatMulQ,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention527Q527MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +527,CrossAttention527-K-527,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention527K527MatMulK,MXU,1,Compute,6766,6766,2493,0,0,0,0,0,0,0,0,6766,411,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention527K527MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,6766,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1106,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,37.194537392846584,221.01113333210168,0.7630838324097006,0.3683518888868361,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +527,CrossAttention527-V-527,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention527V527MatMulV,MXU,1,Compute,6766,6766,2493,0,0,0,0,0,0,0,0,6766,411,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention527V527MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,6766,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1106,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,37.194537392846584,221.01113333210168,0.7630838324097006,0.3683518888868361,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +528,CrossAttention527-FlashAttention-528,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention527FlashAttention528FlashAttention,MXU,1,Compute,187612,187612,9156,0,0,0,0,0,0,0,0,187612,35107,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention527FlashAttention528FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,354304,2048,8,512,4096,40,23405,187612,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,24409,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.2193024753214083,29.279385447092938,0.06604727045285846,0.0487989757451549,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +529,CrossAttention527-Attention_output-529,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention527Attentionoutput529MatMulattnOutputattnAvgWo,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention527Attentionoutput529MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+530,CrossAttention527-Attention_layernorm-530,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention527Attentionlayernorm530YnormLayerNormy,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention527Attentionlayernorm530YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +531,BasicTransformerBlock-Attn_output_layernorm531,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm531XnormLayerNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm531XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +532,BasicTransformerBlock-FFN532Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN532FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,87955,87955,21617,0,0,0,0,0,0,0,0,87955,5485,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN532FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 
1280]]",1,13926400,960,1,4096,1280,320,0,87955,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.149544653516,147.4614371340458,0.7826767794264543,0.24576906189007636,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +533,BasicTransformerBlock-FFN532Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN532FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,87955,87955,21617,0,0,0,0,0,0,0,0,87955,5485,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN532FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,87955,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,38.149544653516,147.4614371340458,0.7826767794264543,0.24576906189007636,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +534,BasicTransformerBlock-FFN532Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN532FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,8139,458,8139,0,0,0,0,0,0,0,0,0,458,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN532FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,458,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,966,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16104189703894828,599.9278166850964,0.05616695627753498,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+535,SpatialTransformer-Proj_out535,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout535einsum,MXU,1,Compute,26515,26515,8456,0,0,0,0,0,0,0,0,26515,1645,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout535einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,26515,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.637216669809543,191.3463082512257,0.6490697353804807,0.3189105137520428,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +536,Out536-GroupNorm,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Out536GroupNormXnormGroupNormX,VPU,1,Memory,8139,3658,8139,0,0,0,0,0,0,0,0,0,3658,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Out536GroupNormXnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,3658,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2883351763115862,599.9278166850964,0.4493356502202798,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +537,Out537-Conv2d,"Conv2D(a=1x320x64x64,b=320x3x3x3,c=1x3x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Out537Conv2dconv2d,MXU,1,Compute,67475,67475,4134,0,0,0,0,0,0,0,0,67475,4205,0,0,2663296,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,3,3,3]","[DT_BFLOAT16:(1,3,64,64)]",70778880,Out537Conv2dconv2d,Conv2D,17280,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 3, 3, 3], [1, 3, 64, 
64]]",1,2829696,736,1,3,4096,2880,0,67475,2663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8866,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.0489645053723602,36.76009911349545,0.02152057562558184,0.06126683185582575,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.json b/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.json new file mode 100644 index 0000000..a41a955 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.json @@ -0,0 +1,184 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 37940976, + "overlapped_compute_time_non_pp_ns": 5578806, + "compute_only_time_non_pp_ns": 31695553, + "memory_only_time_non_pp_ns": 666617, + "ici_bound_time_non_pp_ns": 0, + "total_execution_time_chip_ns": 37940976, + "overlapped_compute_time_chip_ns": 5578806, + "compute_only_time_chip_ns": 31695553, + "memory_only_time_chip_ns": 666617, + "ici_bound_time_chip_ns": 0, + "bounded_by_pp_chip": false, + "throughput_requests_per_sec": 0.6589182102221092, + "throughput_step_per_sec_per_request": 26.35672840888437, + "latency_sec": 1.5176390400000002, + "latency_step_sec": 0.0009485244, + "mem_footprint_GB": 15.999999046325684, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "2", + "num_sa": 2, + "num_vu": 4, + "num_vu_ports": 2, + "hbm_bw_GBps": 600.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 32, + "freq_GHz": 0.7, + "sa_dim": 128, + "hbm_size_GB": 16, + "ici_bw_GBps": 125.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 280.0, + "min_power_W": 1.0, + "avg_power_W": 229.0, + "max_power_W": 
280.0, + "HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 28.869, + "ICI_topology": "TORUS_2D", + "embodied_carbon_kgCO2": 296.2083333, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 2.12, + "static_power_W_per_vu": 0.74127482825, + "static_power_vmem_W": 12.93490069, + "static_power_ici_W": 6.36, + "static_power_hbm_mc_W": 1.908, + "static_power_hbm_phy_W": 2.862, + "static_power_other_W": 21.73, + "dynamic_power_W_per_SA": 22.55530667, + "dynamic_power_W_per_VU": 2.121728, + "dynamic_power_vmem_W": 22.2144, + "dynamic_power_ici_W_per_GBps": 0.0247047779, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "gligen", + "model_type": "gligen", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 2, + "num_tensor_parallel_axes": 0, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv", + "num_diffusion_steps": 1, + "total_num_diffusion_steps": 40, + "image_resolution": [ + 512, + 512 + ], + "image_num_channels": 3, + "use_flash_attention": true, + "fourier_embedder_config": { + "num_freqs": 64 + }, + "text_embedder_config": { + "d_model": 512, + "num_heads": 8, + "d_head": 64, + "d_ff": 2048, + "num_layers": 12, + "ffn_type": "default" + }, + "image_embedder_config": { + "model_type": "vit", + "patch_size": 2, + "d_model": 1024, + "num_heads": 16, + "d_head": 64, + "d_ff": 4096, + "num_layers": 24, + "ffn_type": "default" + }, + "spatial_condition_embedder_config": { + "model_type": "convnext", + "stem": { + "in_channels": 3, + "out_channels": 96, + "kernel_size": 4, + 
"stride": 4 + }, + "depths": [ + 3, + 3, + 9, + 3 + ], + "dims": [ + 96, + 192, + 384, + 768 + ] + }, + "grounding_input_config": { + "text": { + "input_seqlen": 512, + "feature_dim": 768 + }, + "bbox": { + "input_seqlen": 8, + "feature_dim": 4, + "grounding_token_feature_dim": 768 + }, + "image": { + "resolution": [ + 1024, + 1024 + ], + "image_num_channels": 3 + }, + "keypoint": { + "num_persons": 10, + "num_keypoints": 17, + "feature_dim": 256 + }, + "spatial_condition": { + "resolution": [ + 256, + 256 + ], + "num_channels": 1 + } + }, + "unet_config": { + "noisy_latent_resolution": [ + 64, + 64 + ], + "model_channels": 320, + "attention_resolutions": [ + 4, + 2, + 1 + ], + "num_res_blocks": 2, + "channel_mult": [ + 1, + 2, + 4, + 4 + ], + "num_heads": 8, + "context_dim": 768 + }, + "output_dir": "./llava_ops" + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv b/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv new file mode 100644 index 0000000..1d5b694 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv @@ -0,0 +1,635 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM 
Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,Time-Embed-MLP-FFi2,"XlaEinsum(a=1x320,b=320x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPFFi2einsum,MXU,1,Memory,852,212,852,0,0,0,0,0,0,0,0,0,212,0,0,822400,"DT_BFLOAT16:[1,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,1280)]",819200,TimeEmbedMLPFFi2einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 320], [320, 1280], [1, 1280]]",1,822400,30,1,1,1280,320,0,212,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,99,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9615023474178404,898.9667668588844,0.007567433399018655,0.9988519631765382,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+3,Time-Embed-MLP-FFo2,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPFFo2einsum,MXU,1,Memory,3397,851,3397,0,0,0,0,0,0,0,0,0,851,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPFFo2einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,851,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,397,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9646158375036797,899.7722060883224,0.00759193789339287,0.9997468956536916,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,Conv2d5Conv2d,"Conv2D(a=1x3x64x64,b=3x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d5Conv2dconv2d,MXU,1,Compute,3337,3337,2756,0,0,0,0,0,0,0,0,3337,408,0,0,2663296,"DT_BFLOAT16:[1,3,64,64],DT_BFLOAT16:[3,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",70778880,Conv2d5Conv2dconv2d,Conv2D,17280,[],Conv2D,bf01;io01->bf01,"[[1, 3, 64, 64], [3, 320, 3, 3], [1, 320, 64, 64]]",1,2664856,96,1,320,4096,27,0,3337,2663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,21.21033263410249,743.2986777593964,0.16693436059792643,0.8258874197326627,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,Time-Embed-MLP-Einsum6,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum6einsum,MXU,1,Memory,852,212,852,0,0,0,0,0,0,0,0,0,212,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum6einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 
320]]",1,658048,30,1,1,320,1280,0,212,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,99,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9615023474178404,898.9667668588844,0.007567433399018655,0.9988519631765382,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Conv2d-GroupNorm7,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm7XnormGroupNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm7XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Conv2d7Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d7Conv2dconv2d,MXU,1,Compute,75235,75235,7333,0,0,0,0,0,0,0,0,75235,9395,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d7Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,2208,1,320,4096,2880,0,75235,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,19665,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,100.34886954210143,87.71750208721008,0.7897883858172826,0.0974638912080112,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+9,Conv2d-GroupNorm9,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm9XnormGroupNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm9XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,Conv2d9Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d9Conv2dconv2d,MXU,1,Compute,75235,75235,7333,0,0,0,0,0,0,0,0,75235,9395,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d9Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,2208,1,320,4096,2880,0,75235,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,19665,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,100.34886954210143,87.71750208721008,0.7897883858172826,0.0974638912080112,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+11,SpatialTransformer-Input_GroupNorm11,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm11XnormGroupNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm11XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +12,SpatialTransformer-Proj_in12,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin12einsum,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin12einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+13,BasicTransformerBlock-Input_layernorm13,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm13XnormLayerNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm13XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,SelfAttention14-Q-14,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention14Q14MatMulQ,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention14Q14MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,SelfAttention14-K-14,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention14K14MatMulK,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention14K14MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,SelfAttention14-V-14,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention14V14MatMulV,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention14V14MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,SelfAttention14-FlashAttention-15,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention14FlashAttention15FlashAttention,MXU,1,Compute,557890,557890,10851,0,0,0,0,0,0,0,0,557890,209156,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention14FlashAttention15FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,2762752,16384,8,4096,4096,40,139438,557890,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,140740,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.660915607019305,17.504570793525605,0.06816509830335099,0.01944952310391734,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+16,SelfAttention14-Attention_output-16,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention14Attentionoutput16MatMulattnOutputattnAvgWo,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention14Attentionoutput16MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,SelfAttention14-Attention_layernorm-17,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention14Attentionlayernorm17YnormLayerNormy,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention14Attentionlayernorm17YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,GatedSelfAttention-Linear18,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear18XLinearcontext,MXU,1,Memory,527,127,527,0,0,0,0,0,0,0,0,0,127,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear18XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 
320]]",1,508928,18,1,8,320,768,0,127,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.461404174573055,899.3854558897651,0.05872443193287797,0.9993171732108501,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,GatedSelfAttention-Attn18-Q-19,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18Q19MatMulQ,MXU,1,Compute,10179,10179,5648,0,0,0,0,0,0,0,0,10179,1263,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn18Q19MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,10179,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,82.57188328912467,499.3696931373723,0.6498759250043183,0.5548552145970803,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,GatedSelfAttention-Attn18-K-19,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18K19MatMulK,MXU,1,Compute,10179,10179,5648,0,0,0,0,0,0,0,0,10179,1263,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn18K19MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,10179,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,82.57188328912467,499.3696931373723,0.6498759250043183,0.5548552145970803,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+19,GatedSelfAttention-Attn18-V-19,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18V19MatMulV,MXU,1,Compute,10179,10179,5648,0,0,0,0,0,0,0,0,10179,1263,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn18V19MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,10179,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,82.57188328912467,499.3696931373723,0.6498759250043183,0.5548552145970803,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,GatedSelfAttention-Attn18-FlashAttention-20,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn18FlashAttention20FlashAttention,MXU,1,Compute,593294,593294,10872,0,0,0,0,0,0,0,0,593294,214128,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn18FlashAttention20FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 128]",,2768128,17424,8,4104,4104,40,139984,593294,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,149594,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.175931339268558,16.492158164970697,0.06434806534900428,0.018324620183300774,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+21,GatedSelfAttention-Attn18-Attention_output-21,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18Attentionoutput21MatMulattnOutputattnAvgWo,MXU,1,Compute,10179,10179,5648,0,0,0,0,0,0,0,0,10179,1263,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn18Attentionoutput21MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,297,1,4104,320,320,0,10179,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,82.57188328912467,499.3696931373723,0.6498759250043183,0.5548552145970803,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,GatedSelfAttention-Attn18-Attention_layernorm-22,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18Attentionlayernorm22YnormLayerNormy,VPU,1,Memory,5436,2729,5436,0,0,0,0,0,0,0,0,0,2729,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn18Attentionlayernorm22YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2729,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1317,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9327152317880796,899.9906628337128,0.5019726645061294,0.9999896253707921,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,GatedSelfAttention-FFN18Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN18FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,32749,32749,14412,0,0,0,0,0,0,0,0,32749,4085,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN18FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 
320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,32749,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9871,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.45940944761672,396.0417326674097,0.8063992346767264,0.44004636963045524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,GatedSelfAttention-FFN18Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN18FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,32749,32749,14412,0,0,0,0,0,0,0,0,32749,4085,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN18FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,32749,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9871,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.45940944761672,396.0417326674097,0.8063992346767264,0.44004636963045524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +25,GatedSelfAttention-FFN18Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN18FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,5426,341,5426,0,0,0,0,0,0,0,0,0,341,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN18FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,341,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,719,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24156284555842242,899.8917250276447,0.06273968520362949,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+26,BasicTransformerBlock-Fuser_output_layernorm26,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm26XnormLayerNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm26XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,CrossAttention27-Q-27,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention27Q27MatMulQ,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention27Q27MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,CrossAttention27-K-27,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention27K27MatMulK,MXU,1,Compute,2520,2520,1662,0,0,0,0,0,0,0,0,2520,306,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention27K27MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 
40]]",1,1605632,72,1,512,320,768,0,2520,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,824,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,99.86438095238096,593.3973524305555,0.7859752540603605,0.6593303915895061,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,CrossAttention27-V-27,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention27V27MatMulV,MXU,1,Compute,2520,2520,1662,0,0,0,0,0,0,0,0,2520,306,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention27V27MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,2520,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,824,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,99.86438095238096,593.3973524305555,0.7859752540603605,0.6593303915895061,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,CrossAttention27-FlashAttention-28,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention27FlashAttention28FlashAttention,MXU,1,Compute,69856,69856,6104,0,0,0,0,0,0,0,0,69856,26143,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention27FlashAttention28FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,354304,2048,8,512,4096,40,17429,69856,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,18177,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.646068712780577,78.63553685438616,0.06804824691589928,0.08737281872709574,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+29,CrossAttention27-Attention_output-29,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention27Attentionoutput29MatMulattnOutputattnAvgWo,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention27Attentionoutput29MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +30,CrossAttention27-Attention_layernorm-30,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention27Attentionlayernorm30YnormLayerNormy,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention27Attentionlayernorm30YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+31,BasicTransformerBlock-Attn_output_layernorm31,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm31XnormLayerNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm31XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +32,BasicTransformerBlock-FFN32Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN32FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,32749,32749,14412,0,0,0,0,0,0,0,0,32749,4085,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN32FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,32749,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9871,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.45940944761672,396.0417326674097,0.8063992346767264,0.44004636963045524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +33,BasicTransformerBlock-FFN32Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN32FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,32749,32749,14412,0,0,0,0,0,0,0,0,32749,4085,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN32FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 
4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,32749,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9871,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.45940944761672,396.0417326674097,0.8063992346767264,0.44004636963045524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +34,BasicTransformerBlock-FFN32Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN32FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,5426,341,5426,0,0,0,0,0,0,0,0,0,341,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN32FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,341,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,719,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24156284555842242,899.8917250276447,0.06273968520362949,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +35,SpatialTransformer-Proj_out35,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout35einsum,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout35einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+36,Time-Embed-MLP-Einsum36,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum36einsum,MXU,1,Memory,852,212,852,0,0,0,0,0,0,0,0,0,212,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum36einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 320]]",1,658048,30,1,1,320,1280,0,212,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,99,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9615023474178404,898.9667668588844,0.007567433399018655,0.9988519631765382,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +37,Conv2d-GroupNorm37,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm37XnormGroupNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm37XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +38,Conv2d37Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d37Conv2dconv2d,MXU,1,Compute,75235,75235,7333,0,0,0,0,0,0,0,0,75235,9395,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d37Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 
64]]",1,7252480,2208,1,320,4096,2880,0,75235,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,19665,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,100.34886954210143,87.71750208721008,0.7897883858172826,0.0974638912080112,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +39,Conv2d-GroupNorm39,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm39XnormGroupNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm39XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +40,Conv2d39Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d39Conv2dconv2d,MXU,1,Compute,75235,75235,7333,0,0,0,0,0,0,0,0,75235,9395,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d39Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,2208,1,320,4096,2880,0,75235,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,19665,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,100.34886954210143,87.71750208721008,0.7897883858172826,0.0974638912080112,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+41,SpatialTransformer-Input_GroupNorm41,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm41XnormGroupNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm41XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +42,SpatialTransformer-Proj_in42,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin42einsum,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin42einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+43,BasicTransformerBlock-Input_layernorm43,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm43XnormLayerNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm43XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +44,SelfAttention44-Q-44,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention44Q44MatMulQ,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention44Q44MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +44,SelfAttention44-K-44,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention44K44MatMulK,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention44K44MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +44,SelfAttention44-V-44,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention44V44MatMulV,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention44V44MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +45,SelfAttention44-FlashAttention-45,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention44FlashAttention45FlashAttention,MXU,1,Compute,557890,557890,10851,0,0,0,0,0,0,0,0,557890,209156,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention44FlashAttention45FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,2762752,16384,8,4096,4096,40,139438,557890,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,140740,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.660915607019305,17.504570793525605,0.06816509830335099,0.01944952310391734,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+46,SelfAttention44-Attention_output-46,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention44Attentionoutput46MatMulattnOutputattnAvgWo,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention44Attentionoutput46MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +47,SelfAttention44-Attention_layernorm-47,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention44Attentionlayernorm47YnormLayerNormy,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention44Attentionlayernorm47YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +48,GatedSelfAttention-Linear48,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear48XLinearcontext,MXU,1,Memory,527,127,527,0,0,0,0,0,0,0,0,0,127,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear48XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 
320]]",1,508928,18,1,8,320,768,0,127,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.461404174573055,899.3854558897651,0.05872443193287797,0.9993171732108501,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +49,GatedSelfAttention-Attn48-Q-49,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48Q49MatMulQ,MXU,1,Compute,10179,10179,5648,0,0,0,0,0,0,0,0,10179,1263,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn48Q49MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,10179,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,82.57188328912467,499.3696931373723,0.6498759250043183,0.5548552145970803,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +49,GatedSelfAttention-Attn48-K-49,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48K49MatMulK,MXU,1,Compute,10179,10179,5648,0,0,0,0,0,0,0,0,10179,1263,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn48K49MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,10179,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,82.57188328912467,499.3696931373723,0.6498759250043183,0.5548552145970803,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+49,GatedSelfAttention-Attn48-V-49,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48V49MatMulV,MXU,1,Compute,10179,10179,5648,0,0,0,0,0,0,0,0,10179,1263,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn48V49MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,10179,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,82.57188328912467,499.3696931373723,0.6498759250043183,0.5548552145970803,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +50,GatedSelfAttention-Attn48-FlashAttention-50,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn48FlashAttention50FlashAttention,MXU,1,Compute,593294,593294,10872,0,0,0,0,0,0,0,0,593294,214128,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn48FlashAttention50FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 128]",,2768128,17424,8,4104,4104,40,139984,593294,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,149594,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.175931339268558,16.492158164970697,0.06434806534900428,0.018324620183300774,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+51,GatedSelfAttention-Attn48-Attention_output-51,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48Attentionoutput51MatMulattnOutputattnAvgWo,MXU,1,Compute,10179,10179,5648,0,0,0,0,0,0,0,0,10179,1263,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn48Attentionoutput51MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,297,1,4104,320,320,0,10179,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,82.57188328912467,499.3696931373723,0.6498759250043183,0.5548552145970803,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +52,GatedSelfAttention-Attn48-Attention_layernorm-52,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48Attentionlayernorm52YnormLayerNormy,VPU,1,Memory,5436,2729,5436,0,0,0,0,0,0,0,0,0,2729,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn48Attentionlayernorm52YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2729,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1317,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9327152317880796,899.9906628337128,0.5019726645061294,0.9999896253707921,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +53,GatedSelfAttention-FFN48Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN48FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,32749,32749,14412,0,0,0,0,0,0,0,0,32749,4085,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN48FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 
320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,32749,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9871,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.45940944761672,396.0417326674097,0.8063992346767264,0.44004636963045524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +54,GatedSelfAttention-FFN48Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN48FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,32749,32749,14412,0,0,0,0,0,0,0,0,32749,4085,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN48FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,32749,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9871,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.45940944761672,396.0417326674097,0.8063992346767264,0.44004636963045524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +55,GatedSelfAttention-FFN48Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN48FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,5426,341,5426,0,0,0,0,0,0,0,0,0,341,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN48FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,341,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,719,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24156284555842242,899.8917250276447,0.06273968520362949,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+56,BasicTransformerBlock-Fuser_output_layernorm56,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm56XnormLayerNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm56XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +57,CrossAttention57-Q-57,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention57Q57MatMulQ,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention57Q57MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +57,CrossAttention57-K-57,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention57K57MatMulK,MXU,1,Compute,2520,2520,1662,0,0,0,0,0,0,0,0,2520,306,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention57K57MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 
40]]",1,1605632,72,1,512,320,768,0,2520,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,824,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,99.86438095238096,593.3973524305555,0.7859752540603605,0.6593303915895061,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +57,CrossAttention57-V-57,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention57V57MatMulV,MXU,1,Compute,2520,2520,1662,0,0,0,0,0,0,0,0,2520,306,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention57V57MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,2520,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,824,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,99.86438095238096,593.3973524305555,0.7859752540603605,0.6593303915895061,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +58,CrossAttention57-FlashAttention-58,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention57FlashAttention58FlashAttention,MXU,1,Compute,69856,69856,6104,0,0,0,0,0,0,0,0,69856,26143,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention57FlashAttention58FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,354304,2048,8,512,4096,40,17429,69856,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,18177,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.646068712780577,78.63553685438616,0.06804824691589928,0.08737281872709574,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+59,CrossAttention57-Attention_output-59,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention57Attentionoutput59MatMulattnOutputattnAvgWo,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention57Attentionoutput59MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +60,CrossAttention57-Attention_layernorm-60,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention57Attentionlayernorm60YnormLayerNormy,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention57Attentionlayernorm60YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+61,BasicTransformerBlock-Attn_output_layernorm61,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm61XnormLayerNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm61XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +62,BasicTransformerBlock-FFN62Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN62FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,32749,32749,14412,0,0,0,0,0,0,0,0,32749,4085,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN62FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,32749,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9871,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.45940944761672,396.0417326674097,0.8063992346767264,0.44004636963045524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +63,BasicTransformerBlock-FFN62Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN62FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,32749,32749,14412,0,0,0,0,0,0,0,0,32749,4085,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN62FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 
4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,32749,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9871,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.45940944761672,396.0417326674097,0.8063992346767264,0.44004636963045524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +64,BasicTransformerBlock-FFN62Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN62FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,5426,341,5426,0,0,0,0,0,0,0,0,0,341,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN62FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,341,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,719,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24156284555842242,899.8917250276447,0.06273968520362949,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +65,SpatialTransformer-Proj_out65,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout65einsum,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout65einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +66,Downsample-Conv2d66Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=2x2 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",DownsampleConv2d66Conv2dconv2d,MXU,1,Compute,18860,18860,5299,0,0,0,0,0,0,0,0,18860,2348,0,0,5120000,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,32,32)]",1887436800,DownsampleConv2d66Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 32, 32]]",1,5286400,552,1,320,1024,2880,0,18860,5120000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5334,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,100.07618239660658,252.82988239826352,0.7876422217253878,0.28092209155362613,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +67,Time-Embed-MLP-Einsum67,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum67einsum,MXU,1,Memory,1700,425,1700,0,0,0,0,0,0,0,0,0,425,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum67einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1314048,50,1,1,640,1280,0,425,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,198,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9637647058823529,899.6795205508961,0.007585239124663404,0.9996439117232179,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +68,Conv2d-GroupNorm68,"GroupNorm(x=1x320x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm68XnormGroupNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,320,32,32]","[DT_BFLOAT16:(1,320,32,32)]",2621440,Conv2dGroupNorm68XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+69,Conv2d68Conv2d,"Conv2D(a=1x320x32x32,b=320x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d68Conv2dconv2d,MXU,1,Compute,31388,31388,5850,0,0,0,0,0,0,0,0,31388,3914,0,0,5652480,"DT_BFLOAT16:[1,320,32,32],DT_BFLOAT16:[320,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",3774873600,Conv2d68Conv2dconv2d,Conv2D,3686400,[],Conv2D,bf01;io01->bf01,"[[1, 320, 32, 32], [320, 640, 3, 3], [1, 640, 32, 32]]",1,5736960,920,1,640,1024,2880,0,31388,5652480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8530,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,120.2648655537148,167.71639564682363,0.9465357653715314,0.18635155071869292,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +70,Conv2d-GroupNorm70,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm70XnormGroupNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm70XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +71,Conv2d70Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d70Conv2dconv2d,MXU,1,Compute,61345,61345,10343,0,0,0,0,0,0,0,0,61345,7659,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d70Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 
32]]",1,10163200,1800,1,640,1024,5760,0,61345,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16545,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.07029423750917,151.72974697408102,0.9686156851734168,0.1685886077489789,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +72,SkipConnection-Einsum67,"XlaEinsum(a=1x32x32x320,b=320x640,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum67einsum,MXU,1,Compute,4154,4154,2459,0,0,0,0,0,0,0,0,4154,510,0,0,2375680,"DT_BFLOAT16:[1,32,32,320],DT_BFLOAT16:[320,640]","[DT_BFLOAT16:(1,32,32,640)]",419430400,SkipConnectionEinsum67einsum,Einsum,409600,[],Einsum,"BHWC,CO->BHWO","[[1, 32, 32, 320], [320, 640], [1, 32, 32, 640]]",1,2375680,120,1,1024,640,320,0,4154,2375680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1325,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,100.97024554646123,532.625039495065,0.7946788798877018,0.5918055994389612,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +73,SpatialTransformer-Input_GroupNorm73,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm73XnormGroupNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm73XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+74,SpatialTransformer-Proj_in74,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin74einsum,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin74einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +75,BasicTransformerBlock-Input_layernorm75,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm75XnormLayerNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm75XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +76,SelfAttention76-Q-76,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention76Q76MatMulQ,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention76Q76MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +76,SelfAttention76-K-76,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention76K76MatMulK,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention76K76MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +76,SelfAttention76-V-76,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention76V76MatMulV,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention76V76MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +77,SelfAttention76-FlashAttention-77,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",SelfAttention76FlashAttention77FlashAttention,MXU,1,Compute,34996,34996,5426,0,0,0,0,0,0,0,0,34996,13070,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention76FlashAttention77FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 128]",,872448,1024,8,1024,1024,80,8714,34996,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9383,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.629268716424734,139.52487427134528,0.06791602378210453,0.15502763807927253,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +78,SelfAttention76-Attention_output-78,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention76Attentionoutput78MatMulattnOutputattnAvgWo,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention76Attentionoutput78MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+79,SelfAttention76-Attention_layernorm-79,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention76Attentionlayernorm79YnormLayerNormy,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention76Attentionlayernorm79YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +80,GatedSelfAttention-Linear80,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear80XLinearcontext,MXU,1,Memory,1041,255,1041,0,0,0,0,0,0,0,0,0,255,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear80XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 640]]",1,1005568,30,1,8,640,768,0,255,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,121,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.554582132564842,899.623610673331,0.05945778218756329,0.9995817896370344,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +81,GatedSelfAttention-Attn80-Q-81,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80Q81MatMulQ,MXU,1,Compute,7728,7728,3582,0,0,0,0,0,0,0,0,7728,957,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn80Q81MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 
80]]",1,3461120,225,1,1032,640,640,0,7728,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,109.39627329192547,417.1091083660876,0.8609953105790294,0.46345456485120845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +81,GatedSelfAttention-Attn80-K-81,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80K81MatMulK,MXU,1,Compute,7728,7728,3582,0,0,0,0,0,0,0,0,7728,957,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn80K81MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,7728,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,109.39627329192547,417.1091083660876,0.8609953105790294,0.46345456485120845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +81,GatedSelfAttention-Attn80-V-81,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80V81MatMulV,MXU,1,Compute,7728,7728,3582,0,0,0,0,0,0,0,0,7728,957,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn80V81MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,7728,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,109.39627329192547,417.1091083660876,0.8609953105790294,0.46345456485120845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +82,GatedSelfAttention-Attn80-FlashAttention-82,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",GatedSelfAttentionAttn80FlashAttention82FlashAttention,MXU,1,Compute,44256,44256,5468,0,0,0,0,0,0,0,0,44256,14366,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn80FlashAttention82FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 128]",,879104,1296,8,1032,1032,80,8852,44256,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11703,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,6.930741865509762,111.19304665257253,0.05454789331912377,0.12354782961396948,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +83,GatedSelfAttention-Attn80-Attention_output-83,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80Attentionoutput83MatMulattnOutputattnAvgWo,MXU,1,Compute,7728,7728,3582,0,0,0,0,0,0,0,0,7728,957,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn80Attentionoutput83MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,225,1,1032,640,640,0,7728,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,109.39627329192547,417.1091083660876,0.8609953105790294,0.46345456485120845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+84,GatedSelfAttention-Attn80-Attention_layernorm-84,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80Attentionlayernorm84YnormLayerNormy,VPU,1,Memory,2734,1373,2734,0,0,0,0,0,0,0,0,0,1373,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn80Attentionlayernorm84YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1373,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,662,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9326408193123628,899.9560118244788,0.5019533377951407,0.9999511242494209,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +85,GatedSelfAttention-FFN80Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN80FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,27303,27303,10173,0,0,0,0,0,0,0,0,27303,3404,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN80FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,27303,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,335.3211528952862,0.9672478678690296,0.3725790587725402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +86,GatedSelfAttention-FFN80Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN80FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,27303,27303,10173,0,0,0,0,0,0,0,0,27303,3404,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN80FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 
640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,27303,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,335.3211528952862,0.9672478678690296,0.3725790587725402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +87,GatedSelfAttention-FFN80Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN80FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2713,171,2713,0,0,0,0,0,0,0,0,0,171,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN80FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,171,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,359,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24156284555842242,899.8917250276447,0.06273968520362949,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +88,BasicTransformerBlock-Fuser_output_layernorm88,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm88XnormLayerNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm88XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+89,CrossAttention89-Q-89,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention89Q89MatMulQ,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention89Q89MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +89,CrossAttention89-K-89,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention89K89MatMulK,MXU,1,Compute,4154,4154,2510,0,0,0,0,0,0,0,0,4154,510,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention89K89MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,4154,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.16429465575348,543.6448678984111,0.9536146558652423,0.6040498532204568,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +89,CrossAttention89-V-89,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention89V89MatMulV,MXU,1,Compute,4154,4154,2510,0,0,0,0,0,0,0,0,4154,510,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention89V89MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 
80]]",1,2424832,120,1,512,640,768,0,4154,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.16429465575348,543.6448678984111,0.9536146558652423,0.6040498532204568,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +90,CrossAttention89-FlashAttention-90,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention89FlashAttention90FlashAttention,MXU,1,Compute,17566,17566,4070,0,0,0,0,0,0,0,0,17566,6535,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention89FlashAttention90FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,446464,512,8,512,1024,80,4357,17566,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.595863827849254,208.4771362290789,0.06765311306724724,0.23164126247675432,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +91,CrossAttention89-Attention_output-91,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention89Attentionoutput91MatMulattnOutputattnAvgWo,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention89Attentionoutput91MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+92,CrossAttention89-Attention_layernorm-92,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention89Attentionlayernorm92YnormLayerNormy,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention89Attentionlayernorm92YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +93,BasicTransformerBlock-Attn_output_layernorm93,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm93XnormLayerNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm93XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +94,BasicTransformerBlock-FFN94Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN94FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,27303,27303,10173,0,0,0,0,0,0,0,0,27303,3404,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN94FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 
2560]]",1,9830400,800,1,1024,2560,640,0,27303,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,335.3211528952862,0.9672478678690296,0.3725790587725402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +95,BasicTransformerBlock-FFN94Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN94FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,27303,27303,10173,0,0,0,0,0,0,0,0,27303,3404,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN94FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,27303,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,335.3211528952862,0.9672478678690296,0.3725790587725402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +96,BasicTransformerBlock-FFN94Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN94FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2713,171,2713,0,0,0,0,0,0,0,0,0,171,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN94FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,171,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,359,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24156284555842242,899.8917250276447,0.06273968520362949,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+97,SpatialTransformer-Proj_out97,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout97einsum,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout97einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +98,Time-Embed-MLP-Einsum98,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum98einsum,MXU,1,Memory,1700,425,1700,0,0,0,0,0,0,0,0,0,425,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum98einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1314048,50,1,1,640,1280,0,425,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,198,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9637647058823529,899.6795205508961,0.007585239124663404,0.9996439117232179,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+99,Conv2d-GroupNorm99,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm99XnormGroupNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm99XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +100,Conv2d99Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d99Conv2dconv2d,MXU,1,Compute,61345,61345,10343,0,0,0,0,0,0,0,0,61345,7659,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d99Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,1800,1,640,1024,5760,0,61345,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16545,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.07029423750917,151.72974697408102,0.9686156851734168,0.1685886077489789,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +101,Conv2d-GroupNorm101,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm101XnormGroupNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm101XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+102,Conv2d101Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d101Conv2dconv2d,MXU,1,Compute,61345,61345,10343,0,0,0,0,0,0,0,0,61345,7659,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d101Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,1800,1,640,1024,5760,0,61345,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16545,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.07029423750917,151.72974697408102,0.9686156851734168,0.1685886077489789,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +103,SpatialTransformer-Input_GroupNorm103,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm103XnormGroupNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm103XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +104,SpatialTransformer-Proj_in104,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin104einsum,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin104einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 
640]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +105,BasicTransformerBlock-Input_layernorm105,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm105XnormLayerNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm105XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +106,SelfAttention106-Q-106,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention106Q106MatMulQ,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention106Q106MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+106,SelfAttention106-K-106,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention106K106MatMulK,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention106K106MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +106,SelfAttention106-V-106,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention106V106MatMulV,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention106V106MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +107,SelfAttention106-FlashAttention-107,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention106FlashAttention107FlashAttention,MXU,1,Compute,34996,34996,5426,0,0,0,0,0,0,0,0,34996,13070,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention106FlashAttention107FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 
128]",,872448,1024,8,1024,1024,80,8714,34996,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9383,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.629268716424734,139.52487427134528,0.06791602378210453,0.15502763807927253,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +108,SelfAttention106-Attention_output-108,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention106Attentionoutput108MatMulattnOutputattnAvgWo,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention106Attentionoutput108MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +109,SelfAttention106-Attention_layernorm-109,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention106Attentionlayernorm109YnormLayerNormy,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention106Attentionlayernorm109YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+110,GatedSelfAttention-Linear110,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear110XLinearcontext,MXU,1,Memory,1041,255,1041,0,0,0,0,0,0,0,0,0,255,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear110XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 640]]",1,1005568,30,1,8,640,768,0,255,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,121,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.554582132564842,899.623610673331,0.05945778218756329,0.9995817896370344,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +111,GatedSelfAttention-Attn110-Q-111,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110Q111MatMulQ,MXU,1,Compute,7728,7728,3582,0,0,0,0,0,0,0,0,7728,957,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn110Q111MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,7728,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,109.39627329192547,417.1091083660876,0.8609953105790294,0.46345456485120845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +111,GatedSelfAttention-Attn110-K-111,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110K111MatMulK,MXU,1,Compute,7728,7728,3582,0,0,0,0,0,0,0,0,7728,957,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn110K111MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 
80]]",1,3461120,225,1,1032,640,640,0,7728,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,109.39627329192547,417.1091083660876,0.8609953105790294,0.46345456485120845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +111,GatedSelfAttention-Attn110-V-111,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110V111MatMulV,MXU,1,Compute,7728,7728,3582,0,0,0,0,0,0,0,0,7728,957,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn110V111MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,7728,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,109.39627329192547,417.1091083660876,0.8609953105790294,0.46345456485120845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +112,GatedSelfAttention-Attn110-FlashAttention-112,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn110FlashAttention112FlashAttention,MXU,1,Compute,44256,44256,5468,0,0,0,0,0,0,0,0,44256,14366,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn110FlashAttention112FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 128]",,879104,1296,8,1032,1032,80,8852,44256,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11703,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,6.930741865509762,111.19304665257253,0.05454789331912377,0.12354782961396948,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+113,GatedSelfAttention-Attn110-Attention_output-113,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110Attentionoutput113MatMulattnOutputattnAvgWo,MXU,1,Compute,7728,7728,3582,0,0,0,0,0,0,0,0,7728,957,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn110Attentionoutput113MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,225,1,1032,640,640,0,7728,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,109.39627329192547,417.1091083660876,0.8609953105790294,0.46345456485120845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +114,GatedSelfAttention-Attn110-Attention_layernorm-114,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110Attentionlayernorm114YnormLayerNormy,VPU,1,Memory,2734,1373,2734,0,0,0,0,0,0,0,0,0,1373,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn110Attentionlayernorm114YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1373,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,662,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9326408193123628,899.9560118244788,0.5019533377951407,0.9999511242494209,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+115,GatedSelfAttention-FFN110Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN110FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,27303,27303,10173,0,0,0,0,0,0,0,0,27303,3404,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN110FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,27303,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,335.3211528952862,0.9672478678690296,0.3725790587725402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +116,GatedSelfAttention-FFN110Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN110FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,27303,27303,10173,0,0,0,0,0,0,0,0,27303,3404,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN110FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,27303,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,335.3211528952862,0.9672478678690296,0.3725790587725402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+117,GatedSelfAttention-FFN110Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN110FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2713,171,2713,0,0,0,0,0,0,0,0,0,171,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN110FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,171,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,359,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24156284555842242,899.8917250276447,0.06273968520362949,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +118,BasicTransformerBlock-Fuser_output_layernorm118,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm118XnormLayerNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm118XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +119,CrossAttention119-Q-119,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention119Q119MatMulQ,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention119Q119MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +119,CrossAttention119-K-119,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention119K119MatMulK,MXU,1,Compute,4154,4154,2510,0,0,0,0,0,0,0,0,4154,510,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention119K119MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,4154,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.16429465575348,543.6448678984111,0.9536146558652423,0.6040498532204568,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +119,CrossAttention119-V-119,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention119V119MatMulV,MXU,1,Compute,4154,4154,2510,0,0,0,0,0,0,0,0,4154,510,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention119V119MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,4154,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.16429465575348,543.6448678984111,0.9536146558652423,0.6040498532204568,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +120,CrossAttention119-FlashAttention-120,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention119FlashAttention120FlashAttention,MXU,1,Compute,17566,17566,4070,0,0,0,0,0,0,0,0,17566,6535,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention119FlashAttention120FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,446464,512,8,512,1024,80,4357,17566,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.595863827849254,208.4771362290789,0.06765311306724724,0.23164126247675432,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +121,CrossAttention119-Attention_output-121,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention119Attentionoutput121MatMulattnOutputattnAvgWo,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention119Attentionoutput121MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+122,CrossAttention119-Attention_layernorm-122,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention119Attentionlayernorm122YnormLayerNormy,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention119Attentionlayernorm122YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +123,BasicTransformerBlock-Attn_output_layernorm123,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm123XnormLayerNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm123XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +124,BasicTransformerBlock-FFN124Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN124FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,27303,27303,10173,0,0,0,0,0,0,0,0,27303,3404,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN124FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 
2560]]",1,9830400,800,1,1024,2560,640,0,27303,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,335.3211528952862,0.9672478678690296,0.3725790587725402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +125,BasicTransformerBlock-FFN124Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN124FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,27303,27303,10173,0,0,0,0,0,0,0,0,27303,3404,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN124FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,27303,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,335.3211528952862,0.9672478678690296,0.3725790587725402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +126,BasicTransformerBlock-FFN124Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN124FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2713,171,2713,0,0,0,0,0,0,0,0,0,171,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN124FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,171,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,359,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24156284555842242,899.8917250276447,0.06273968520362949,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+127,SpatialTransformer-Proj_out127,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout127einsum,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout127einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +128,Downsample-Conv2d128Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=2x2 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",DownsampleConv2d128Conv2dconv2d,MXU,1,Compute,15388,15388,9325,0,0,0,0,0,0,0,0,15388,1914,0,0,9011200,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,16,16)]",1887436800,DownsampleConv2d128Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 16, 16]]",1,9180160,450,1,640,256,5760,0,15388,9011200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4936,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.65640759033012,545.3817250048739,0.9653582208045758,0.6059796944498599,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +129,Time-Embed-MLP-Einsum129,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum129einsum,MXU,1,Memory,3397,851,3397,0,0,0,0,0,0,0,0,0,851,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum129einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 
1280]]",1,2626048,100,1,1,1280,1280,0,851,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,397,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9646158375036797,899.7722060883224,0.00759193789339287,0.9997468956536916,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +130,Conv2d-GroupNorm130,"GroupNorm(x=1x640x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm130XnormGroupNormX,VPU,1,Memory,679,341,679,0,0,0,0,0,0,0,0,0,341,0,0,655360,"DT_BFLOAT16:[1,640,16,16]","[DT_BFLOAT16:(1,640,16,16)]",1310720,Conv2dGroupNorm130XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,341,655360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9303681885125183,898.897735640648,0.501363080876132,0.9987752618229422,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +131,Conv2d130Conv2d,"Conv2D(a=1x640x16x16,b=640x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d130Conv2dconv2d,MXU,1,Compute,30707,30707,16277,0,0,0,0,0,0,0,0,30707,3829,0,0,15728640,"DT_BFLOAT16:[1,640,16,16],DT_BFLOAT16:[640,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",3774873600,Conv2d130Conv2dconv2d,Conv2D,14745600,[],Conv2D,bf01;io01->bf01,"[[1, 640, 16, 16], [640, 1280, 3, 3], [1, 1280, 16, 16]]",1,15815680,900,1,1280,256,5760,0,30707,15728640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9579,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.93202201452438,477.03903018855635,0.9675274238278446,0.5300433668761737,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+132,Conv2d-GroupNorm132,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm132XnormGroupNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm132XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +133,Conv2d132Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d132Conv2dconv2d,MXU,1,Compute,61345,61345,31874,0,0,0,0,0,0,0,0,61345,7659,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d132Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,61345,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,19061,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.07029423750917,467.62610542831527,0.9686156851734168,0.5195845615870169,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +134,SkipConnection-Einsum129,"XlaEinsum(a=1x16x16x640,b=640x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum129einsum,MXU,1,Compute,3473,3473,2713,0,0,0,0,0,0,0,0,3473,425,0,0,2621440,"DT_BFLOAT16:[1,16,16,640],DT_BFLOAT16:[640,1280]","[DT_BFLOAT16:(1,16,16,1280)]",419430400,SkipConnectionEinsum129einsum,Einsum,1638400,[],Einsum,"BHWC,CO->BHWO","[[1, 16, 16, 640], [640, 1280], [1, 16, 16, 
1280]]",1,2621440,100,1,256,1280,640,0,3473,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1185,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,120.76890296573568,702.9675352720991,0.9505027546943605,0.7810750391912212,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +135,SpatialTransformer-Input_GroupNorm135,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm135XnormGroupNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm135XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +136,SpatialTransformer-Proj_in136,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin136einsum,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin136einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+137,BasicTransformerBlock-Input_layernorm137,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm137XnormLayerNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm137XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +138,SelfAttention138-Q-138,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention138Q138MatMulQ,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention138Q138MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +138,SelfAttention138-K-138,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention138K138MatMulK,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention138K138MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +138,SelfAttention138-V-138,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention138V138MatMulV,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention138V138MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +139,SelfAttention138-FlashAttention-139,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention138FlashAttention139FlashAttention,MXU,1,Compute,4494,4494,2713,0,0,0,0,0,0,0,0,4494,1088,0,0,2621440,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention138FlashAttention139FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 128]",,335872,128,8,256,256,160,544,4494,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1440,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.199903871829106,543.2590676457498,0.0330550340492675,0.6036211862730554,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+140,SelfAttention138-Attention_output-140,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention138Attentionoutput140MatMulattnOutputattnAvgWo,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention138Attentionoutput140MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +141,SelfAttention138-Attention_layernorm-141,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention138Attentionlayernorm141YnormLayerNormy,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention138Attentionlayernorm141YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +142,GatedSelfAttention-Linear142,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear142XLinearcontext,MXU,1,Memory,2069,510,2069,0,0,0,0,0,0,0,0,0,510,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear142XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 
1280]]",1,1998848,60,1,8,1280,768,0,510,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,241,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.602049299178347,899.7449326365394,0.05983136902586118,0.9997165918183771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +143,GatedSelfAttention-Attn142-Q-143,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142Q143MatMulQ,MXU,1,Compute,10281,10281,4790,0,0,0,0,0,0,0,0,10281,1276,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn142Q143MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,10281,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.14309892033849,419.27904971853417,0.6622420618906597,0.4658656107983713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +143,GatedSelfAttention-Attn142-K-143,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142K143MatMulK,MXU,1,Compute,10281,10281,4790,0,0,0,0,0,0,0,0,10281,1276,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn142K143MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,10281,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.14309892033849,419.27904971853417,0.6622420618906597,0.4658656107983713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+143,GatedSelfAttention-Attn142-V-143,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142V143MatMulV,MXU,1,Compute,10281,10281,4790,0,0,0,0,0,0,0,0,10281,1276,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn142V143MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,10281,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.14309892033849,419.27904971853417,0.6622420618906597,0.4658656107983713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +144,GatedSelfAttention-Attn142-FlashAttention-144,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn142FlashAttention144FlashAttention,MXU,1,Compute,9942,9942,2798,0,0,0,0,0,0,0,0,9942,1803,0,0,2703360,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn142FlashAttention144FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 128]",,345088,288,8,264,264,160,579,9942,2703360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2812,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.018954737477369,253.23880459791792,0.015890034540761953,0.28137644955324215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+145,GatedSelfAttention-Attn142-Attention_output-145,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142Attentionoutput145MatMulattnOutputattnAvgWo,MXU,1,Compute,10281,10281,4790,0,0,0,0,0,0,0,0,10281,1276,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn142Attentionoutput145MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,3837952,300,1,264,1280,1280,0,10281,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.14309892033849,419.27904971853417,0.6622420618906597,0.4658656107983713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +146,GatedSelfAttention-Attn142-Attention_layernorm-146,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142Attentionlayernorm146YnormLayerNormy,VPU,1,Memory,1399,703,1399,0,0,0,0,0,0,0,0,0,703,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn142Attentionlayernorm146YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,703,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,339,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9323516797712652,899.8213707335597,0.5018782412969751,0.9998015230372885,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+147,GatedSelfAttention-FFN142Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN142FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,27303,27303,16955,0,0,0,0,0,0,0,0,27303,3404,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN142FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,27303,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8807,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,558.8685881588104,0.9672478678690296,0.6209650979542338,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +148,GatedSelfAttention-FFN142Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN142FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,27303,27303,16955,0,0,0,0,0,0,0,0,27303,3404,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN142FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,27303,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8807,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,558.8685881588104,0.9672478678690296,0.6209650979542338,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+149,GatedSelfAttention-FFN142Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN142FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1357,86,1357,0,0,0,0,0,0,0,0,0,86,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN142FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,86,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24147383935151068,899.5601510685335,0.06271656814939086,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +150,BasicTransformerBlock-Fuser_output_layernorm150,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm150XnormLayerNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm150XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +151,CrossAttention151-Q-151,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention151Q151MatMulQ,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention151Q151MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +151,CrossAttention151-K-151,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention151K151MatMulK,MXU,1,Compute,8239,8239,4205,0,0,0,0,0,0,0,0,8239,1021,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention151K151MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,8239,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2551,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.17902172593762,459.30084810049766,0.9616009905241454,0.5103342756672197,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +151,CrossAttention151-V-151,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention151V151MatMulV,MXU,1,Compute,8239,8239,4205,0,0,0,0,0,0,0,0,8239,1021,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention151V151MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,8239,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2551,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.17902172593762,459.30084810049766,0.9616009905241454,0.5103342756672197,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +152,CrossAttention151-FlashAttention-152,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention151FlashAttention152FlashAttention,MXU,1,Compute,8852,8852,4070,0,0,0,0,0,0,0,0,8852,2177,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention151FlashAttention152FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,256,8,512,256,160,1089,8852,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2688,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.264430185268866,413.7041770221419,0.03356288364604793,0.45967130780237986,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +153,CrossAttention151-Attention_output-153,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention151Attentionoutput153MatMulattnOutputattnAvgWo,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention151Attentionoutput153MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+154,CrossAttention151-Attention_layernorm-154,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention151Attentionlayernorm154YnormLayerNormy,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention151Attentionlayernorm154YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +155,BasicTransformerBlock-Attn_output_layernorm155,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm155XnormLayerNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm155XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +156,BasicTransformerBlock-FFN156Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN156FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,27303,27303,16955,0,0,0,0,0,0,0,0,27303,3404,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN156FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 
5120]]",1,13631488,800,1,256,5120,1280,0,27303,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8807,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,558.8685881588104,0.9672478678690296,0.6209650979542338,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +157,BasicTransformerBlock-FFN156Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN156FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,27303,27303,16955,0,0,0,0,0,0,0,0,27303,3404,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN156FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,27303,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8807,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,558.8685881588104,0.9672478678690296,0.6209650979542338,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +158,BasicTransformerBlock-FFN156Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN156FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1357,86,1357,0,0,0,0,0,0,0,0,0,86,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN156FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,86,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24147383935151068,899.5601510685335,0.06271656814939086,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+159,SpatialTransformer-Proj_out159,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout159einsum,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout159einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +160,Time-Embed-MLP-Einsum160,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum160einsum,MXU,1,Memory,3397,851,3397,0,0,0,0,0,0,0,0,0,851,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum160einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,851,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,397,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9646158375036797,899.7722060883224,0.00759193789339287,0.9997468956536916,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+161,Conv2d-GroupNorm161,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm161XnormGroupNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm161XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +162,Conv2d161Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d161Conv2dconv2d,MXU,1,Compute,61345,61345,31874,0,0,0,0,0,0,0,0,61345,7659,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d161Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,61345,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,19061,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.07029423750917,467.62610542831527,0.9686156851734168,0.5195845615870169,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+163,Conv2d-GroupNorm163,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm163XnormGroupNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm163XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +164,Conv2d163Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d163Conv2dconv2d,MXU,1,Compute,61345,61345,31874,0,0,0,0,0,0,0,0,61345,7659,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d163Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,61345,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,19061,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.07029423750917,467.62610542831527,0.9686156851734168,0.5195845615870169,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+165,SpatialTransformer-Input_GroupNorm165,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm165XnormGroupNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm165XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +166,SpatialTransformer-Proj_in166,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin166einsum,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin166einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+167,BasicTransformerBlock-Input_layernorm167,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm167XnormLayerNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm167XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +168,SelfAttention168-Q-168,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention168Q168MatMulQ,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention168Q168MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +168,SelfAttention168-K-168,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention168K168MatMulK,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention168K168MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +168,SelfAttention168-V-168,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention168V168MatMulV,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention168V168MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +169,SelfAttention168-FlashAttention-169,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention168FlashAttention169FlashAttention,MXU,1,Compute,4494,4494,2713,0,0,0,0,0,0,0,0,4494,1088,0,0,2621440,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention168FlashAttention169FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 128]",,335872,128,8,256,256,160,544,4494,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1440,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.199903871829106,543.2590676457498,0.0330550340492675,0.6036211862730554,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+170,SelfAttention168-Attention_output-170,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention168Attentionoutput170MatMulattnOutputattnAvgWo,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention168Attentionoutput170MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +171,SelfAttention168-Attention_layernorm-171,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention168Attentionlayernorm171YnormLayerNormy,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention168Attentionlayernorm171YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +172,GatedSelfAttention-Linear172,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear172XLinearcontext,MXU,1,Memory,2069,510,2069,0,0,0,0,0,0,0,0,0,510,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear172XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 
1280]]",1,1998848,60,1,8,1280,768,0,510,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,241,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.602049299178347,899.7449326365394,0.05983136902586118,0.9997165918183771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +173,GatedSelfAttention-Attn172-Q-173,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172Q173MatMulQ,MXU,1,Compute,10281,10281,4790,0,0,0,0,0,0,0,0,10281,1276,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn172Q173MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,10281,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.14309892033849,419.27904971853417,0.6622420618906597,0.4658656107983713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +173,GatedSelfAttention-Attn172-K-173,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172K173MatMulK,MXU,1,Compute,10281,10281,4790,0,0,0,0,0,0,0,0,10281,1276,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn172K173MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,10281,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.14309892033849,419.27904971853417,0.6622420618906597,0.4658656107983713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+173,GatedSelfAttention-Attn172-V-173,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172V173MatMulV,MXU,1,Compute,10281,10281,4790,0,0,0,0,0,0,0,0,10281,1276,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn172V173MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,10281,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.14309892033849,419.27904971853417,0.6622420618906597,0.4658656107983713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +174,GatedSelfAttention-Attn172-FlashAttention-174,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn172FlashAttention174FlashAttention,MXU,1,Compute,9942,9942,2798,0,0,0,0,0,0,0,0,9942,1803,0,0,2703360,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn172FlashAttention174FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 128]",,345088,288,8,264,264,160,579,9942,2703360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2812,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.018954737477369,253.23880459791792,0.015890034540761953,0.28137644955324215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+175,GatedSelfAttention-Attn172-Attention_output-175,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172Attentionoutput175MatMulattnOutputattnAvgWo,MXU,1,Compute,10281,10281,4790,0,0,0,0,0,0,0,0,10281,1276,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn172Attentionoutput175MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,3837952,300,1,264,1280,1280,0,10281,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.14309892033849,419.27904971853417,0.6622420618906597,0.4658656107983713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +176,GatedSelfAttention-Attn172-Attention_layernorm-176,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172Attentionlayernorm176YnormLayerNormy,VPU,1,Memory,1399,703,1399,0,0,0,0,0,0,0,0,0,703,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn172Attentionlayernorm176YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,703,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,339,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9323516797712652,899.8213707335597,0.5018782412969751,0.9998015230372885,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+177,GatedSelfAttention-FFN172Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN172FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,27303,27303,16955,0,0,0,0,0,0,0,0,27303,3404,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN172FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,27303,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8807,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,558.8685881588104,0.9672478678690296,0.6209650979542338,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +178,GatedSelfAttention-FFN172Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN172FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,27303,27303,16955,0,0,0,0,0,0,0,0,27303,3404,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN172FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,27303,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8807,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,558.8685881588104,0.9672478678690296,0.6209650979542338,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+179,GatedSelfAttention-FFN172Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN172FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1357,86,1357,0,0,0,0,0,0,0,0,0,86,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN172FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,86,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24147383935151068,899.5601510685335,0.06271656814939086,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +180,BasicTransformerBlock-Fuser_output_layernorm180,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm180XnormLayerNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm180XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +181,CrossAttention181-Q-181,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention181Q181MatMulQ,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention181Q181MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +181,CrossAttention181-K-181,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention181K181MatMulK,MXU,1,Compute,8239,8239,4205,0,0,0,0,0,0,0,0,8239,1021,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention181K181MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,8239,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2551,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.17902172593762,459.30084810049766,0.9616009905241454,0.5103342756672197,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +181,CrossAttention181-V-181,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention181V181MatMulV,MXU,1,Compute,8239,8239,4205,0,0,0,0,0,0,0,0,8239,1021,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention181V181MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,8239,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2551,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.17902172593762,459.30084810049766,0.9616009905241454,0.5103342756672197,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +182,CrossAttention181-FlashAttention-182,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention181FlashAttention182FlashAttention,MXU,1,Compute,8852,8852,4070,0,0,0,0,0,0,0,0,8852,2177,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention181FlashAttention182FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,256,8,512,256,160,1089,8852,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2688,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.264430185268866,413.7041770221419,0.03356288364604793,0.45967130780237986,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +183,CrossAttention181-Attention_output-183,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention181Attentionoutput183MatMulattnOutputattnAvgWo,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention181Attentionoutput183MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+184,CrossAttention181-Attention_layernorm-184,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention181Attentionlayernorm184YnormLayerNormy,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention181Attentionlayernorm184YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +185,BasicTransformerBlock-Attn_output_layernorm185,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm185XnormLayerNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm185XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +186,BasicTransformerBlock-FFN186Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN186FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,27303,27303,16955,0,0,0,0,0,0,0,0,27303,3404,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN186FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 
5120]]",1,13631488,800,1,256,5120,1280,0,27303,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8807,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,558.8685881588104,0.9672478678690296,0.6209650979542338,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +187,BasicTransformerBlock-FFN186Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN186FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,27303,27303,16955,0,0,0,0,0,0,0,0,27303,3404,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN186FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,27303,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8807,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,558.8685881588104,0.9672478678690296,0.6209650979542338,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +188,BasicTransformerBlock-FFN186Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN186FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1357,86,1357,0,0,0,0,0,0,0,0,0,86,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN186FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,86,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24147383935151068,899.5601510685335,0.06271656814939086,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+189,SpatialTransformer-Proj_out189,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout189einsum,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout189einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +190,Downsample-Conv2d190Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=2x2 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",DownsampleConv2d190Conv2dconv2d,MXU,1,Memory,31366,30707,31366,0,0,0,0,0,0,0,0,30707,3829,0,0,30310400,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,DownsampleConv2d190Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,24420352,900,1,1280,64,11520,0,30707,30310400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11342,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,60.17460944972263,899.9795882683479,0.47359983108272696,0.9999773202981643,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +191,Time-Embed-MLP-Einsum191,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum191einsum,MXU,1,Memory,3397,851,3397,0,0,0,0,0,0,0,0,0,851,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum191einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 
1280]]",1,2626048,100,1,1,1280,1280,0,851,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,397,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9646158375036797,899.7722060883224,0.00759193789339287,0.9997468956536916,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +192,Conv2d-GroupNorm192,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm192XnormGroupNormX,VPU,1,Memory,500,171,500,0,0,0,0,0,0,0,0,0,171,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm192XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,171,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.34042553191489366,0.6781684027777778,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +193,Conv2d192Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d192Conv2dconv2d,MXU,1,Memory,30857,30707,30857,0,0,0,0,0,0,0,0,30707,3829,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d192Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,30707,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11283,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,61.16721651489127,899.990151140746,0.48141207187156276,0.9999890568230511,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+194,Conv2d-GroupNorm194,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm194XnormGroupNormX,VPU,1,Memory,500,171,500,0,0,0,0,0,0,0,0,0,171,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm194XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,171,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.34042553191489366,0.6781684027777778,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +195,Conv2d194Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d194Conv2dconv2d,MXU,1,Memory,30857,30707,30857,0,0,0,0,0,0,0,0,30707,3829,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d194Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,30707,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11283,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,61.16721651489127,899.990151140746,0.48141207187156276,0.9999890568230511,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +196,Time-Embed-MLP-Einsum196,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum196einsum,MXU,1,Memory,3397,851,3397,0,0,0,0,0,0,0,0,0,851,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum196einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 
1280]]",1,2626048,100,1,1,1280,1280,0,851,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,397,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9646158375036797,899.7722060883224,0.00759193789339287,0.9997468956536916,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +197,Conv2d-GroupNorm197,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm197XnormGroupNormX,VPU,1,Memory,500,171,500,0,0,0,0,0,0,0,0,0,171,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm197XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,171,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.34042553191489366,0.6781684027777778,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +198,Conv2d197Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d197Conv2dconv2d,MXU,1,Memory,30857,30707,30857,0,0,0,0,0,0,0,0,30707,3829,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d197Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,30707,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11283,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,61.16721651489127,899.990151140746,0.48141207187156276,0.9999890568230511,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+199,Conv2d-GroupNorm199,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm199XnormGroupNormX,VPU,1,Memory,500,171,500,0,0,0,0,0,0,0,0,0,171,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm199XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,171,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.34042553191489366,0.6781684027777778,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +200,Conv2d199Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d199Conv2dconv2d,MXU,1,Memory,30857,30707,30857,0,0,0,0,0,0,0,0,30707,3829,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d199Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,30707,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11283,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,61.16721651489127,899.990151140746,0.48141207187156276,0.9999890568230511,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +201,Time-Embed-MLP-Einsum201,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum201einsum,MXU,1,Memory,3397,851,3397,0,0,0,0,0,0,0,0,0,851,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum201einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 
1280]]",1,2626048,100,1,1,1280,1280,0,851,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,397,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9646158375036797,899.7722060883224,0.00759193789339287,0.9997468956536916,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +202,Conv2d-GroupNorm202,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm202XnormGroupNormX,VPU,1,Memory,500,171,500,0,0,0,0,0,0,0,0,0,171,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm202XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,171,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.34042553191489366,0.6781684027777778,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +203,Conv2d202Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d202Conv2dconv2d,MXU,1,Memory,30857,30707,30857,0,0,0,0,0,0,0,0,30707,3829,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d202Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,30707,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11283,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,61.16721651489127,899.990151140746,0.48141207187156276,0.9999890568230511,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+204,Conv2d-GroupNorm204,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm204XnormGroupNormX,VPU,1,Memory,500,171,500,0,0,0,0,0,0,0,0,0,171,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm204XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,171,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.34042553191489366,0.6781684027777778,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +205,Conv2d204Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d204Conv2dconv2d,MXU,1,Memory,30857,30707,30857,0,0,0,0,0,0,0,0,30707,3829,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d204Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,30707,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11283,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,61.16721651489127,899.990151140746,0.48141207187156276,0.9999890568230511,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +206,SpatialTransformer-Input_GroupNorm206,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm206XnormGroupNormX,VPU,1,Memory,500,171,500,0,0,0,0,0,0,0,0,0,171,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,SpatialTransformerInputGroupNorm206XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,171,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.34042553191489366,0.6781684027777778,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+207,SpatialTransformer-Proj_in207,"XlaEinsum(a=1x64x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin207einsum,MXU,1,Memory,3730,3473,3730,0,0,0,0,0,0,0,0,3473,425,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,64,1280)]",209715200,SpatialTransformerProjin207einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 64, 1280], [1280, 1280], [1, 64, 1280]]",1,2916352,100,1,64,1280,1280,0,3473,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1304,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,56.22391420911528,899.9821967158177,0.4425061752082458,0.9999802185731308,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +208,BasicTransformerBlock-Input_layernorm208,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm208XnormLayerNormX,VPU,1,Memory,500,171,500,0,0,0,0,0,0,0,0,0,171,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,BasicTransformerBlockInputlayernorm208XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,171,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.34042553191489366,0.6781684027777778,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +209,SelfAttention209-Q-209,"XlaEinsum(a=1x64x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention209Q209MatMulQ,MXU,1,Memory,3730,3473,3730,0,0,0,0,0,0,0,0,3473,425,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,64,8,160)]",209715200,SelfAttention209Q209MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 64, 1280], [1280, 8, 160], [1, 64, 8, 
160]]",1,2916352,100,1,64,1280,1280,0,3473,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1304,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,56.22391420911528,899.9821967158177,0.4425061752082458,0.9999802185731308,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +209,SelfAttention209-K-209,"XlaEinsum(a=1x64x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention209K209MatMulK,MXU,1,Memory,3730,3473,3730,0,0,0,0,0,0,0,0,3473,425,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,64,8,160)]",209715200,SelfAttention209K209MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 64, 1280], [1280, 8, 160], [1, 64, 8, 160]]",1,2916352,100,1,64,1280,1280,0,3473,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1304,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,56.22391420911528,899.9821967158177,0.4425061752082458,0.9999802185731308,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +209,SelfAttention209-V-209,"XlaEinsum(a=1x64x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention209V209MatMulV,MXU,1,Memory,3730,3473,3730,0,0,0,0,0,0,0,0,3473,425,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,64,8,160)]",209715200,SelfAttention209V209MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 64, 1280], [1280, 8, 160], [1, 64, 8, 160]]",1,2916352,100,1,64,1280,1280,0,3473,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1304,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,56.22391420911528,899.9821967158177,0.4425061752082458,0.9999802185731308,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +210,SelfAttention209-FlashAttention-210,"FlashAttention(q=1x64x8x160,k=1x64x8x160,v=1x64x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",SelfAttention209FlashAttention210FlashAttention,MXU,1,Memory,11529,1226,11529,0,0,0,0,0,0,0,0,1226,170,0,0,11141120,"DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[1,64,8,160]","[DT_BFLOAT16:(1,64,8,8)]",1179648,SelfAttention209FlashAttention210FlashAttention,FlashAttention,0,[],FlashAttention,,"[64, 64]",,11141120,32,8,64,64,160,34,1226,11141120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1653,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.10232006245120999,899.9892933038425,0.000805302514406107,0.999988103670936,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +211,SelfAttention209-Attention_output-211,"XlaEinsum(a=1x64x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention209Attentionoutput211MatMulattnOutputattnAvgWo,MXU,1,Memory,3730,3473,3730,0,0,0,0,0,0,0,0,3473,425,0,0,3604480,"DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,64,1280)]",209715200,SelfAttention209Attentionoutput211MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 64, 8, 160], [8, 160, 1280], [1, 64, 1280]]",1,2916352,100,1,64,1280,1280,0,3473,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1304,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,56.22391420911528,899.9821967158177,0.4425061752082458,0.9999802185731308,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+212,SelfAttention209-Attention_layernorm-212,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention209Attentionlayernorm212YnormLayerNormy,VPU,1,Memory,500,171,500,0,0,0,0,0,0,0,0,0,171,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,SelfAttention209Attentionlayernorm212YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,171,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.34042553191489366,0.6781684027777778,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +213,GatedSelfAttention-Linear213,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear213XLinearcontext,MXU,1,Memory,2069,510,2069,0,0,0,0,0,0,0,0,0,510,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear213XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 1280]]",1,1998848,60,1,8,1280,768,0,510,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,241,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.602049299178347,899.7449326365394,0.05983136902586118,0.9997165918183771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +214,GatedSelfAttention-Attn213-Q-214,"XlaEinsum(a=1x72x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213Q214MatMulQ,MXU,1,Memory,3773,3473,3773,0,0,0,0,0,0,0,0,3473,425,0,0,3645440,"DT_BFLOAT16:[1,72,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,72,8,160)]",235929600,GatedSelfAttentionAttn213Q214MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 72, 1280], [1280, 8, 160], [1, 72, 8, 
160]]",1,2953216,100,1,72,1280,1280,0,3473,3645440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1309,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,62.531036310628146,899.8358246504771,0.4921459151120068,0.9998175829449746,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +214,GatedSelfAttention-Attn213-K-214,"XlaEinsum(a=1x72x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213K214MatMulK,MXU,1,Memory,3773,3473,3773,0,0,0,0,0,0,0,0,3473,425,0,0,3645440,"DT_BFLOAT16:[1,72,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,72,8,160)]",235929600,GatedSelfAttentionAttn213K214MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 72, 1280], [1280, 8, 160], [1, 72, 8, 160]]",1,2953216,100,1,72,1280,1280,0,3473,3645440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1309,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,62.531036310628146,899.8358246504771,0.4921459151120068,0.9998175829449746,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +214,GatedSelfAttention-Attn213-V-214,"XlaEinsum(a=1x72x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213V214MatMulV,MXU,1,Memory,3773,3473,3773,0,0,0,0,0,0,0,0,3473,425,0,0,3645440,"DT_BFLOAT16:[1,72,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,72,8,160)]",235929600,GatedSelfAttentionAttn213V214MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 72, 1280], [1280, 8, 160], [1, 72, 8, 160]]",1,2953216,100,1,72,1280,1280,0,3473,3645440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1309,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,62.531036310628146,899.8358246504771,0.4921459151120068,0.9998175829449746,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +215,GatedSelfAttention-Attn213-FlashAttention-215,"FlashAttention(q=1x72x8x160,k=1x72x8x160,v=1x72x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",GatedSelfAttentionAttn213FlashAttention215FlashAttention,MXU,1,Memory,14496,1226,14496,0,0,0,0,0,0,0,0,1226,179,0,0,14008320,"DT_BFLOAT16:[1,72,8,160],DT_BFLOAT16:[1,72,8,160],DT_BFLOAT16:[1,72,8,160]","[DT_BFLOAT16:(1,72,8,8)]",1492992,GatedSelfAttentionAttn213FlashAttention215FlashAttention,FlashAttention,0,[],FlashAttention,,"[72, 72]",,14008320,32,8,72,72,160,43,1226,14008320,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.10299337748344371,899.9906628337128,0.0008106017907694674,0.9999896253707921,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +216,GatedSelfAttention-Attn213-Attention_output-216,"XlaEinsum(a=1x72x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213Attentionoutput216MatMulattnOutputattnAvgWo,MXU,1,Memory,3773,3473,3773,0,0,0,0,0,0,0,0,3473,425,0,0,3645440,"DT_BFLOAT16:[1,72,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,72,1280)]",235929600,GatedSelfAttentionAttn213Attentionoutput216MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 72, 8, 160], [8, 160, 1280], [1, 72, 1280]]",1,2953216,100,1,72,1280,1280,0,3473,3645440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1309,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,62.531036310628146,899.8358246504771,0.4921459151120068,0.9998175829449746,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+217,GatedSelfAttention-Attn213-Attention_layernorm-217,"LayerNorm(x=1x72x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213Attentionlayernorm217YnormLayerNormy,VPU,1,Memory,500,192,500,0,0,0,0,0,0,0,0,0,192,0,0,368640,"DT_BFLOAT16:[1,72,1280]","[DT_BFLOAT16:(1,72,1280)]",737280,GatedSelfAttentionAttn213Attentionlayernorm217YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,192,368640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,106,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.47456,686.6455078125,0.3829787234042554,0.762939453125,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +218,GatedSelfAttention-FFN213Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x64x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN213FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,14412,13686,14412,0,0,0,0,0,0,0,0,13686,1702,0,0,13926400,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,64,5120)]",838860800,GatedSelfAttentionFFN213FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 64, 1280], [1280, 5120], [1, 64, 5120]]",1,11272192,400,1,64,5120,1280,0,13686,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,58.20571745767416,899.9424578909936,0.4581038116921335,0.9999360643233263,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +219,GatedSelfAttention-FFN213Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x64x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN213FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,14412,13686,14412,0,0,0,0,0,0,0,0,13686,1702,0,0,13926400,"DT_BFLOAT16:[1,64,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,64,1280)]",838860800,GatedSelfAttentionFFN213FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 64, 5120], [5120, 1280], [1, 64, 
1280]]",1,2916352,400,1,64,1280,5120,0,13686,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,58.20571745767416,899.9424578909936,0.4581038116921335,0.9999360643233263,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +220,GatedSelfAttention-FFN213Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x64x1280,b=1x64x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN213FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,22,500,0,0,0,0,0,0,0,0,0,22,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",81920,GatedSelfAttentionFFN213FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,22,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,63,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16384,610.3515625,0.04255319148936171,0.6781684027777778,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +221,BasicTransformerBlock-Fuser_output_layernorm221,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm221XnormLayerNormX,VPU,1,Memory,500,171,500,0,0,0,0,0,0,0,0,0,171,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,BasicTransformerBlockFuseroutputlayernorm221XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,171,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.34042553191489366,0.6781684027777778,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+222,CrossAttention222-Q-222,"XlaEinsum(a=1x64x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention222Q222MatMulQ,MXU,1,Memory,3730,3473,3730,0,0,0,0,0,0,0,0,3473,425,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,64,8,160)]",209715200,CrossAttention222Q222MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 64, 1280], [1280, 8, 160], [1, 64, 8, 160]]",1,2916352,100,1,64,1280,1280,0,3473,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1304,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,56.22391420911528,899.9821967158177,0.4425061752082458,0.9999802185731308,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +222,CrossAttention222-K-222,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention222K222MatMulK,MXU,1,Compute,8239,8239,4205,0,0,0,0,0,0,0,0,8239,1021,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention222K222MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,8239,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2551,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.17902172593762,459.30084810049766,0.9616009905241454,0.5103342756672197,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +222,CrossAttention222-V-222,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention222V222MatMulV,MXU,1,Compute,8239,8239,4205,0,0,0,0,0,0,0,0,8239,1021,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention222V222MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 
160]]",1,4063232,240,1,512,1280,768,0,8239,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2551,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.17902172593762,459.30084810049766,0.9616009905241454,0.5103342756672197,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +223,CrossAttention222-FlashAttention-223,"FlashAttention(q=1x64x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention222FlashAttention223FlashAttention,MXU,1,Compute,4494,4494,3052,0,0,0,0,0,0,0,0,4494,816,0,0,2949120,"DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,64,8,8)]",9437184,CrossAttention222FlashAttention223FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,128,8,512,64,160,272,4494,2949120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1480,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.099951935914553,611.1664511014686,0.01652751702463375,0.6790738345571874,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +224,CrossAttention222-Attention_output-224,"XlaEinsum(a=1x64x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention222Attentionoutput224MatMulattnOutputattnAvgWo,MXU,1,Memory,3730,3473,3730,0,0,0,0,0,0,0,0,3473,425,0,0,3604480,"DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,64,1280)]",209715200,CrossAttention222Attentionoutput224MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 64, 8, 160], [8, 160, 1280], [1, 64, 1280]]",1,2916352,100,1,64,1280,1280,0,3473,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1304,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,56.22391420911528,899.9821967158177,0.4425061752082458,0.9999802185731308,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+225,CrossAttention222-Attention_layernorm-225,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention222Attentionlayernorm225YnormLayerNormy,VPU,1,Memory,500,171,500,0,0,0,0,0,0,0,0,0,171,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,CrossAttention222Attentionlayernorm225YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,171,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.34042553191489366,0.6781684027777778,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +226,BasicTransformerBlock-Attn_output_layernorm226,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm226XnormLayerNormX,VPU,1,Memory,500,171,500,0,0,0,0,0,0,0,0,0,171,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,BasicTransformerBlockAttnoutputlayernorm226XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,171,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.34042553191489366,0.6781684027777778,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +227,BasicTransformerBlock-FFN227Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x64x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN227FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,14412,13686,14412,0,0,0,0,0,0,0,0,13686,1702,0,0,13926400,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,64,5120)]",838860800,BasicTransformerBlockFFN227FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 64, 1280], [1280, 5120], [1, 64, 
5120]]",1,11272192,400,1,64,5120,1280,0,13686,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,58.20571745767416,899.9424578909936,0.4581038116921335,0.9999360643233263,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +228,BasicTransformerBlock-FFN227Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x64x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN227FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,14412,13686,14412,0,0,0,0,0,0,0,0,13686,1702,0,0,13926400,"DT_BFLOAT16:[1,64,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,64,1280)]",838860800,BasicTransformerBlockFFN227FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 64, 5120], [5120, 1280], [1, 64, 1280]]",1,2916352,400,1,64,1280,5120,0,13686,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,58.20571745767416,899.9424578909936,0.4581038116921335,0.9999360643233263,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +229,BasicTransformerBlock-FFN227Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x64x1280,b=1x64x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN227FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,22,500,0,0,0,0,0,0,0,0,0,22,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",81920,BasicTransformerBlockFFN227FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,22,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,63,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16384,610.3515625,0.04255319148936171,0.6781684027777778,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+230,SpatialTransformer-Proj_out230,"XlaEinsum(a=1x64x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout230einsum,MXU,1,Memory,3730,3473,3730,0,0,0,0,0,0,0,0,3473,425,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,64,1280)]",209715200,SpatialTransformerProjout230einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 64, 1280], [1280, 1280], [1, 64, 1280]]",1,2916352,100,1,64,1280,1280,0,3473,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1304,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,56.22391420911528,899.9821967158177,0.4425061752082458,0.9999802185731308,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +231,Time-Embed-MLP-Einsum231,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum231einsum,MXU,1,Memory,3397,851,3397,0,0,0,0,0,0,0,0,0,851,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum231einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,851,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,397,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9646158375036797,899.7722060883224,0.00759193789339287,0.9997468956536916,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+232,Conv2d-GroupNorm232,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm232XnormGroupNormX,VPU,1,Memory,500,171,500,0,0,0,0,0,0,0,0,0,171,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm232XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,171,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.34042553191489366,0.6781684027777778,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +233,Conv2d232Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d232Conv2dconv2d,MXU,1,Memory,30857,30707,30857,0,0,0,0,0,0,0,0,30707,3829,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d232Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,30707,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11283,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,61.16721651489127,899.990151140746,0.48141207187156276,0.9999890568230511,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +234,Conv2d-GroupNorm234,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm234XnormGroupNormX,VPU,1,Memory,500,171,500,0,0,0,0,0,0,0,0,0,171,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm234XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,171,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.34042553191489366,0.6781684027777778,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+235,Conv2d234Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d234Conv2dconv2d,MXU,1,Memory,30857,30707,30857,0,0,0,0,0,0,0,0,30707,3829,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d234Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,30707,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11283,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,61.16721651489127,899.990151140746,0.48141207187156276,0.9999890568230511,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +236,Time-Embed-MLP-Einsum236,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum236einsum,MXU,1,Memory,3397,851,3397,0,0,0,0,0,0,0,0,0,851,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum236einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,851,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,397,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9646158375036797,899.7722060883224,0.00759193789339287,0.9997468956536916,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+237,Conv2d-GroupNorm237,"GroupNorm(x=1x2560x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm237XnormGroupNormX,VPU,1,Memory,679,341,679,0,0,0,0,0,0,0,0,0,341,0,0,655360,"DT_BFLOAT16:[1,2560,8,8]","[DT_BFLOAT16:(1,2560,8,8)]",1310720,Conv2dGroupNorm237XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,341,655360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9303681885125183,898.897735640648,0.501363080876132,0.9987752618229422,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +238,Conv2d237Conv2d,"Conv2D(a=1x2560x8x8,b=2560x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d237Conv2dconv2d,MXU,1,Memory,61544,61345,61544,0,0,0,0,0,0,0,0,61345,7659,0,0,59473920,"DT_BFLOAT16:[1,2560,8,8],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",3774873600,Conv2d237Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 16, 10, 10], [16, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,1800,1,1280,64,23040,0,61345,59473920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,22529,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,61.33617574418302,899.9968201104089,0.4827418530398028,0.9999964667893432,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +239,Conv2d-GroupNorm239,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm239XnormGroupNormX,VPU,1,Memory,500,171,500,0,0,0,0,0,0,0,0,0,171,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm239XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,171,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.34042553191489366,0.6781684027777778,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+240,Conv2d239Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d239Conv2dconv2d,MXU,1,Memory,30857,30707,30857,0,0,0,0,0,0,0,0,30707,3829,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d239Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,30707,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11283,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,61.16721651489127,899.990151140746,0.48141207187156276,0.9999890568230511,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +241,SkipConnection-Einsum236,"XlaEinsum(a=1x8x8x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum236einsum,MXU,1,Memory,7291,6877,7291,0,0,0,0,0,0,0,0,6877,851,0,0,7045120,"DT_BFLOAT16:[1,8,8,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,8,8,1280)]",419430400,SkipConnectionEinsum236einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 8, 8, 2560], [2560, 1280], [1, 8, 8, 1280]]",1,2916352,200,1,64,1280,2560,0,6877,7045120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2571,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,57.52714305307914,899.9148672164312,0.45276314182602034,0.9999054080182569,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +242,Time-Embed-MLP-Einsum242,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum242einsum,MXU,1,Memory,3397,851,3397,0,0,0,0,0,0,0,0,0,851,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum242einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 
1280]]",1,2626048,100,1,1,1280,1280,0,851,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,397,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9646158375036797,899.7722060883224,0.00759193789339287,0.9997468956536916,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +243,Conv2d-GroupNorm243,"GroupNorm(x=1x2560x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm243XnormGroupNormX,VPU,1,Memory,679,341,679,0,0,0,0,0,0,0,0,0,341,0,0,655360,"DT_BFLOAT16:[1,2560,8,8]","[DT_BFLOAT16:(1,2560,8,8)]",1310720,Conv2dGroupNorm243XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,341,655360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9303681885125183,898.897735640648,0.501363080876132,0.9987752618229422,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +244,Conv2d243Conv2d,"Conv2D(a=1x2560x8x8,b=2560x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d243Conv2dconv2d,MXU,1,Memory,61544,61345,61544,0,0,0,0,0,0,0,0,61345,7659,0,0,59473920,"DT_BFLOAT16:[1,2560,8,8],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",3774873600,Conv2d243Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 16, 10, 10], [16, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,1800,1,1280,64,23040,0,61345,59473920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,22529,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,61.33617574418302,899.9968201104089,0.4827418530398028,0.9999964667893432,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+245,Conv2d-GroupNorm245,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm245XnormGroupNormX,VPU,1,Memory,500,171,500,0,0,0,0,0,0,0,0,0,171,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm245XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,171,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.34042553191489366,0.6781684027777778,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +246,Conv2d245Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d245Conv2dconv2d,MXU,1,Memory,30857,30707,30857,0,0,0,0,0,0,0,0,30707,3829,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d245Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,30707,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11283,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,61.16721651489127,899.990151140746,0.48141207187156276,0.9999890568230511,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +247,SkipConnection-Einsum242,"XlaEinsum(a=1x8x8x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum242einsum,MXU,1,Memory,7291,6877,7291,0,0,0,0,0,0,0,0,6877,851,0,0,7045120,"DT_BFLOAT16:[1,8,8,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,8,8,1280)]",419430400,SkipConnectionEinsum242einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 8, 8, 2560], [2560, 1280], [1, 8, 8, 
1280]]",1,2916352,200,1,64,1280,2560,0,6877,7045120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2571,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,57.52714305307914,899.9148672164312,0.45276314182602034,0.9999054080182569,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +248,Time-Embed-MLP-Einsum248,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum248einsum,MXU,1,Memory,3397,851,3397,0,0,0,0,0,0,0,0,0,851,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum248einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,851,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,397,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9646158375036797,899.7722060883224,0.00759193789339287,0.9997468956536916,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +249,Conv2d-GroupNorm249,"GroupNorm(x=1x2560x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm249XnormGroupNormX,VPU,1,Memory,679,341,679,0,0,0,0,0,0,0,0,0,341,0,0,655360,"DT_BFLOAT16:[1,2560,8,8]","[DT_BFLOAT16:(1,2560,8,8)]",1310720,Conv2dGroupNorm249XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,341,655360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9303681885125183,898.897735640648,0.501363080876132,0.9987752618229422,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +250,Conv2d249Conv2d,"Conv2D(a=1x2560x8x8,b=2560x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d249Conv2dconv2d,MXU,1,Memory,61544,61345,61544,0,0,0,0,0,0,0,0,61345,7659,0,0,59473920,"DT_BFLOAT16:[1,2560,8,8],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",3774873600,Conv2d249Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 16, 10, 10], [16, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,1800,1,1280,64,23040,0,61345,59473920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,22529,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,61.33617574418302,899.9968201104089,0.4827418530398028,0.9999964667893432,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +251,Conv2d-GroupNorm251,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm251XnormGroupNormX,VPU,1,Memory,500,171,500,0,0,0,0,0,0,0,0,0,171,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm251XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,171,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.34042553191489366,0.6781684027777778,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +252,Conv2d251Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d251Conv2dconv2d,MXU,1,Memory,30857,30707,30857,0,0,0,0,0,0,0,0,30707,3829,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d251Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 
8]]",1,23961600,900,1,1280,64,11520,0,30707,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11283,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,61.16721651489127,899.990151140746,0.48141207187156276,0.9999890568230511,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +253,SkipConnection-Einsum248,"XlaEinsum(a=1x8x8x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum248einsum,MXU,1,Memory,7291,6877,7291,0,0,0,0,0,0,0,0,6877,851,0,0,7045120,"DT_BFLOAT16:[1,8,8,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,8,8,1280)]",419430400,SkipConnectionEinsum248einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 8, 8, 2560], [2560, 1280], [1, 8, 8, 1280]]",1,2916352,200,1,64,1280,2560,0,6877,7045120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2571,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,57.52714305307914,899.9148672164312,0.45276314182602034,0.9999054080182569,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +254,Upsample254,"Upsample(a=1x1280x8x8,scale_factor=2,memory_placements=0_0_0,type=DT_BFLOAT16)",Upsample254Upsample,VPU,1,Memory,848,0,848,0,0,0,0,0,0,0,0,0,0,0,0,819200,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,16,16)]",0,Upsample254Upsample,Upsample,0,[],Upsample,,,,,0,,,,,0,0,819200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,99,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,899.692751326651,0.0,0.9996586125851677,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +254,Upsample-Conv2d254Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",UpsampleConv2d254Conv2dconv2d,MXU,1,Memory,30857,30707,30857,0,0,0,0,0,0,0,0,30707,3829,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,UpsampleConv2d254Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,30707,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11283,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,61.16721651489127,899.990151140746,0.48141207187156276,0.9999890568230511,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +255,Time-Embed-MLP-Einsum255,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum255einsum,MXU,1,Memory,3397,851,3397,0,0,0,0,0,0,0,0,0,851,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum255einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,851,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,397,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9646158375036797,899.7722060883224,0.00759193789339287,0.9997468956536916,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +256,Conv2d-GroupNorm256,"GroupNorm(x=1x2560x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm256XnormGroupNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,2560,16,16]","[DT_BFLOAT16:(1,2560,16,16)]",5242880,Conv2dGroupNorm256XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+257,Conv2d256Conv2d,"Conv2D(a=1x2560x16x16,b=2560x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d256Conv2dconv2d,MXU,1,Compute,122622,122622,63070,0,0,0,0,0,0,0,0,122622,15319,0,0,60948480,"DT_BFLOAT16:[1,2560,16,16],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",15099494400,Conv2d256Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 1, 18, 18], [1, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,3600,1,1280,256,23040,0,122622,60948480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,38026,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.13854283896853,462.9079228238,0.9691528307638637,0.5143421364708889,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +258,Conv2d-GroupNorm258,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm258XnormGroupNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm258XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +259,Conv2d258Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d258Conv2dconv2d,MXU,1,Compute,61345,61345,31874,0,0,0,0,0,0,0,0,61345,7659,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d258Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 
16]]",1,24911872,1800,1,1280,256,11520,0,61345,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,19061,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.07029423750917,467.62610542831527,0.9686156851734168,0.5195845615870169,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +260,SkipConnection-Einsum255,"XlaEinsum(a=1x16x16x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum255einsum,MXU,1,Compute,13686,13686,8817,0,0,0,0,0,0,0,0,13686,1702,0,0,8519680,"DT_BFLOAT16:[1,16,16,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,16,16,1280)]",1677721600,SkipConnectionEinsum255einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 16, 16, 2560], [2560, 1280], [1, 16, 16, 1280]]",1,3801088,400,1,256,1280,2560,0,13686,8519680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4451,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.58670173900336,579.7581698450972,0.9648096060363917,0.6441757442723303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +261,SpatialTransformer-Input_GroupNorm261,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm261XnormGroupNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm261XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+262,SpatialTransformer-Proj_in262,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin262einsum,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin262einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +263,BasicTransformerBlock-Input_layernorm263,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm263XnormLayerNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm263XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +264,SelfAttention264-Q-264,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention264Q264MatMulQ,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention264Q264MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +264,SelfAttention264-K-264,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention264K264MatMulK,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention264K264MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +264,SelfAttention264-V-264,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention264V264MatMulV,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention264V264MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +265,SelfAttention264-FlashAttention-265,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",SelfAttention264FlashAttention265FlashAttention,MXU,1,Compute,4494,4494,2713,0,0,0,0,0,0,0,0,4494,1088,0,0,2621440,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention264FlashAttention265FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 128]",,335872,128,8,256,256,160,544,4494,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1440,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.199903871829106,543.2590676457498,0.0330550340492675,0.6036211862730554,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +266,SelfAttention264-Attention_output-266,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention264Attentionoutput266MatMulattnOutputattnAvgWo,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention264Attentionoutput266MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+267,SelfAttention264-Attention_layernorm-267,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention264Attentionlayernorm267YnormLayerNormy,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention264Attentionlayernorm267YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +268,GatedSelfAttention-Linear268,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear268XLinearcontext,MXU,1,Memory,2069,510,2069,0,0,0,0,0,0,0,0,0,510,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear268XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 1280]]",1,1998848,60,1,8,1280,768,0,510,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,241,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.602049299178347,899.7449326365394,0.05983136902586118,0.9997165918183771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +269,GatedSelfAttention-Attn268-Q-269,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268Q269MatMulQ,MXU,1,Compute,10281,10281,4790,0,0,0,0,0,0,0,0,10281,1276,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn268Q269MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 
160]]",1,3837952,300,1,264,1280,1280,0,10281,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.14309892033849,419.27904971853417,0.6622420618906597,0.4658656107983713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +269,GatedSelfAttention-Attn268-K-269,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268K269MatMulK,MXU,1,Compute,10281,10281,4790,0,0,0,0,0,0,0,0,10281,1276,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn268K269MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,10281,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.14309892033849,419.27904971853417,0.6622420618906597,0.4658656107983713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +269,GatedSelfAttention-Attn268-V-269,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268V269MatMulV,MXU,1,Compute,10281,10281,4790,0,0,0,0,0,0,0,0,10281,1276,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn268V269MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,10281,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.14309892033849,419.27904971853417,0.6622420618906597,0.4658656107983713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +270,GatedSelfAttention-Attn268-FlashAttention-270,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",GatedSelfAttentionAttn268FlashAttention270FlashAttention,MXU,1,Compute,9942,9942,2798,0,0,0,0,0,0,0,0,9942,1803,0,0,2703360,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn268FlashAttention270FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 128]",,345088,288,8,264,264,160,579,9942,2703360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2812,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.018954737477369,253.23880459791792,0.015890034540761953,0.28137644955324215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +271,GatedSelfAttention-Attn268-Attention_output-271,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268Attentionoutput271MatMulattnOutputattnAvgWo,MXU,1,Compute,10281,10281,4790,0,0,0,0,0,0,0,0,10281,1276,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn268Attentionoutput271MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,3837952,300,1,264,1280,1280,0,10281,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.14309892033849,419.27904971853417,0.6622420618906597,0.4658656107983713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+272,GatedSelfAttention-Attn268-Attention_layernorm-272,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268Attentionlayernorm272YnormLayerNormy,VPU,1,Memory,1399,703,1399,0,0,0,0,0,0,0,0,0,703,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn268Attentionlayernorm272YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,703,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,339,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9323516797712652,899.8213707335597,0.5018782412969751,0.9998015230372885,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +273,GatedSelfAttention-FFN268Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN268FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,27303,27303,16955,0,0,0,0,0,0,0,0,27303,3404,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN268FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,27303,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8807,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,558.8685881588104,0.9672478678690296,0.6209650979542338,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +274,GatedSelfAttention-FFN268Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN268FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,27303,27303,16955,0,0,0,0,0,0,0,0,27303,3404,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN268FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 
256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,27303,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8807,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,558.8685881588104,0.9672478678690296,0.6209650979542338,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +275,GatedSelfAttention-FFN268Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN268FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1357,86,1357,0,0,0,0,0,0,0,0,0,86,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN268FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,86,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24147383935151068,899.5601510685335,0.06271656814939086,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +276,BasicTransformerBlock-Fuser_output_layernorm276,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm276XnormLayerNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm276XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+277,CrossAttention277-Q-277,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention277Q277MatMulQ,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention277Q277MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +277,CrossAttention277-K-277,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention277K277MatMulK,MXU,1,Compute,8239,8239,4205,0,0,0,0,0,0,0,0,8239,1021,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention277K277MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,8239,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2551,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.17902172593762,459.30084810049766,0.9616009905241454,0.5103342756672197,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +277,CrossAttention277-V-277,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention277V277MatMulV,MXU,1,Compute,8239,8239,4205,0,0,0,0,0,0,0,0,8239,1021,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention277V277MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 
160]]",1,4063232,240,1,512,1280,768,0,8239,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2551,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.17902172593762,459.30084810049766,0.9616009905241454,0.5103342756672197,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +278,CrossAttention277-FlashAttention-278,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention277FlashAttention278FlashAttention,MXU,1,Compute,8852,8852,4070,0,0,0,0,0,0,0,0,8852,2177,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention277FlashAttention278FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,256,8,512,256,160,1089,8852,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2688,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.264430185268866,413.7041770221419,0.03356288364604793,0.45967130780237986,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +279,CrossAttention277-Attention_output-279,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention277Attentionoutput279MatMulattnOutputattnAvgWo,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention277Attentionoutput279MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+280,CrossAttention277-Attention_layernorm-280,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention277Attentionlayernorm280YnormLayerNormy,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention277Attentionlayernorm280YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +281,BasicTransformerBlock-Attn_output_layernorm281,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm281XnormLayerNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm281XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +282,BasicTransformerBlock-FFN282Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN282FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,27303,27303,16955,0,0,0,0,0,0,0,0,27303,3404,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN282FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 
5120]]",1,13631488,800,1,256,5120,1280,0,27303,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8807,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,558.8685881588104,0.9672478678690296,0.6209650979542338,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +283,BasicTransformerBlock-FFN282Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN282FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,27303,27303,16955,0,0,0,0,0,0,0,0,27303,3404,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN282FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,27303,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8807,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,558.8685881588104,0.9672478678690296,0.6209650979542338,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +284,BasicTransformerBlock-FFN282Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN282FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1357,86,1357,0,0,0,0,0,0,0,0,0,86,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN282FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,86,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24147383935151068,899.5601510685335,0.06271656814939086,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+285,SpatialTransformer-Proj_out285,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout285einsum,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout285einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +286,Time-Embed-MLP-Einsum286,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum286einsum,MXU,1,Memory,3397,851,3397,0,0,0,0,0,0,0,0,0,851,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum286einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,851,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,397,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9646158375036797,899.7722060883224,0.00759193789339287,0.9997468956536916,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+287,Conv2d-GroupNorm287,"GroupNorm(x=1x2560x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm287XnormGroupNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,2560,16,16]","[DT_BFLOAT16:(1,2560,16,16)]",5242880,Conv2dGroupNorm287XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +288,Conv2d287Conv2d,"Conv2D(a=1x2560x16x16,b=2560x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d287Conv2dconv2d,MXU,1,Compute,122622,122622,63070,0,0,0,0,0,0,0,0,122622,15319,0,0,60948480,"DT_BFLOAT16:[1,2560,16,16],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",15099494400,Conv2d287Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 1, 18, 18], [1, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,3600,1,1280,256,23040,0,122622,60948480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,38026,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.13854283896853,462.9079228238,0.9691528307638637,0.5143421364708889,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+289,Conv2d-GroupNorm289,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm289XnormGroupNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm289XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +290,Conv2d289Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d289Conv2dconv2d,MXU,1,Compute,61345,61345,31874,0,0,0,0,0,0,0,0,61345,7659,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d289Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,61345,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,19061,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.07029423750917,467.62610542831527,0.9686156851734168,0.5195845615870169,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +291,SkipConnection-Einsum286,"XlaEinsum(a=1x16x16x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum286einsum,MXU,1,Compute,13686,13686,8817,0,0,0,0,0,0,0,0,13686,1702,0,0,8519680,"DT_BFLOAT16:[1,16,16,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,16,16,1280)]",1677721600,SkipConnectionEinsum286einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 16, 16, 2560], [2560, 1280], [1, 16, 16, 
1280]]",1,3801088,400,1,256,1280,2560,0,13686,8519680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4451,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.58670173900336,579.7581698450972,0.9648096060363917,0.6441757442723303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +292,SpatialTransformer-Input_GroupNorm292,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm292XnormGroupNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm292XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +293,SpatialTransformer-Proj_in293,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin293einsum,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin293einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+294,BasicTransformerBlock-Input_layernorm294,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm294XnormLayerNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm294XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +295,SelfAttention295-Q-295,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention295Q295MatMulQ,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention295Q295MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +295,SelfAttention295-K-295,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention295K295MatMulK,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention295K295MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +295,SelfAttention295-V-295,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention295V295MatMulV,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention295V295MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +296,SelfAttention295-FlashAttention-296,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention295FlashAttention296FlashAttention,MXU,1,Compute,4494,4494,2713,0,0,0,0,0,0,0,0,4494,1088,0,0,2621440,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention295FlashAttention296FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 128]",,335872,128,8,256,256,160,544,4494,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1440,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.199903871829106,543.2590676457498,0.0330550340492675,0.6036211862730554,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+297,SelfAttention295-Attention_output-297,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention295Attentionoutput297MatMulattnOutputattnAvgWo,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention295Attentionoutput297MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +298,SelfAttention295-Attention_layernorm-298,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention295Attentionlayernorm298YnormLayerNormy,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention295Attentionlayernorm298YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +299,GatedSelfAttention-Linear299,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear299XLinearcontext,MXU,1,Memory,2069,510,2069,0,0,0,0,0,0,0,0,0,510,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear299XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 
1280]]",1,1998848,60,1,8,1280,768,0,510,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,241,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.602049299178347,899.7449326365394,0.05983136902586118,0.9997165918183771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +300,GatedSelfAttention-Attn299-Q-300,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299Q300MatMulQ,MXU,1,Compute,10281,10281,4790,0,0,0,0,0,0,0,0,10281,1276,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn299Q300MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,10281,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.14309892033849,419.27904971853417,0.6622420618906597,0.4658656107983713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +300,GatedSelfAttention-Attn299-K-300,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299K300MatMulK,MXU,1,Compute,10281,10281,4790,0,0,0,0,0,0,0,0,10281,1276,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn299K300MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,10281,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.14309892033849,419.27904971853417,0.6622420618906597,0.4658656107983713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+300,GatedSelfAttention-Attn299-V-300,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299V300MatMulV,MXU,1,Compute,10281,10281,4790,0,0,0,0,0,0,0,0,10281,1276,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn299V300MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,10281,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.14309892033849,419.27904971853417,0.6622420618906597,0.4658656107983713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +301,GatedSelfAttention-Attn299-FlashAttention-301,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn299FlashAttention301FlashAttention,MXU,1,Compute,9942,9942,2798,0,0,0,0,0,0,0,0,9942,1803,0,0,2703360,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn299FlashAttention301FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 128]",,345088,288,8,264,264,160,579,9942,2703360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2812,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.018954737477369,253.23880459791792,0.015890034540761953,0.28137644955324215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+302,GatedSelfAttention-Attn299-Attention_output-302,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299Attentionoutput302MatMulattnOutputattnAvgWo,MXU,1,Compute,10281,10281,4790,0,0,0,0,0,0,0,0,10281,1276,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn299Attentionoutput302MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,3837952,300,1,264,1280,1280,0,10281,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.14309892033849,419.27904971853417,0.6622420618906597,0.4658656107983713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +303,GatedSelfAttention-Attn299-Attention_layernorm-303,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299Attentionlayernorm303YnormLayerNormy,VPU,1,Memory,1399,703,1399,0,0,0,0,0,0,0,0,0,703,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn299Attentionlayernorm303YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,703,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,339,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9323516797712652,899.8213707335597,0.5018782412969751,0.9998015230372885,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+304,GatedSelfAttention-FFN299Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN299FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,27303,27303,16955,0,0,0,0,0,0,0,0,27303,3404,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN299FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,27303,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8807,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,558.8685881588104,0.9672478678690296,0.6209650979542338,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +305,GatedSelfAttention-FFN299Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN299FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,27303,27303,16955,0,0,0,0,0,0,0,0,27303,3404,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN299FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,27303,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8807,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,558.8685881588104,0.9672478678690296,0.6209650979542338,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+306,GatedSelfAttention-FFN299Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN299FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1357,86,1357,0,0,0,0,0,0,0,0,0,86,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN299FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,86,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24147383935151068,899.5601510685335,0.06271656814939086,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +307,BasicTransformerBlock-Fuser_output_layernorm307,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm307XnormLayerNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm307XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +308,CrossAttention308-Q-308,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention308Q308MatMulQ,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention308Q308MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +308,CrossAttention308-K-308,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention308K308MatMulK,MXU,1,Compute,8239,8239,4205,0,0,0,0,0,0,0,0,8239,1021,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention308K308MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,8239,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2551,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.17902172593762,459.30084810049766,0.9616009905241454,0.5103342756672197,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +308,CrossAttention308-V-308,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention308V308MatMulV,MXU,1,Compute,8239,8239,4205,0,0,0,0,0,0,0,0,8239,1021,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention308V308MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,8239,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2551,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.17902172593762,459.30084810049766,0.9616009905241454,0.5103342756672197,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +309,CrossAttention308-FlashAttention-309,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention308FlashAttention309FlashAttention,MXU,1,Compute,8852,8852,4070,0,0,0,0,0,0,0,0,8852,2177,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention308FlashAttention309FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,256,8,512,256,160,1089,8852,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2688,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.264430185268866,413.7041770221419,0.03356288364604793,0.45967130780237986,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +310,CrossAttention308-Attention_output-310,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention308Attentionoutput310MatMulattnOutputattnAvgWo,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention308Attentionoutput310MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+311,CrossAttention308-Attention_layernorm-311,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention308Attentionlayernorm311YnormLayerNormy,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention308Attentionlayernorm311YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +312,BasicTransformerBlock-Attn_output_layernorm312,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm312XnormLayerNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm312XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +313,BasicTransformerBlock-FFN313Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN313FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,27303,27303,16955,0,0,0,0,0,0,0,0,27303,3404,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN313FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 
5120]]",1,13631488,800,1,256,5120,1280,0,27303,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8807,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,558.8685881588104,0.9672478678690296,0.6209650979542338,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +314,BasicTransformerBlock-FFN313Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN313FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,27303,27303,16955,0,0,0,0,0,0,0,0,27303,3404,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN313FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,27303,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8807,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,558.8685881588104,0.9672478678690296,0.6209650979542338,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +315,BasicTransformerBlock-FFN313Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN313FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1357,86,1357,0,0,0,0,0,0,0,0,0,86,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN313FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,86,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24147383935151068,899.5601510685335,0.06271656814939086,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+316,SpatialTransformer-Proj_out316,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout316einsum,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout316einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +317,Time-Embed-MLP-Einsum317,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum317einsum,MXU,1,Memory,3397,851,3397,0,0,0,0,0,0,0,0,0,851,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum317einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,851,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,397,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9646158375036797,899.7722060883224,0.00759193789339287,0.9997468956536916,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+318,Conv2d-GroupNorm318,"GroupNorm(x=1x1920x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm318XnormGroupNormX,VPU,1,Memory,2035,1022,2035,0,0,0,0,0,0,0,0,0,1022,0,0,1966080,"DT_BFLOAT16:[1,1920,16,16]","[DT_BFLOAT16:(1,1920,16,16)]",3932160,Conv2dGroupNorm318XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1022,1966080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,493,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9322653562653562,899.7811732186732,0.5018558210047572,0.9997568591318591,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +319,Conv2d318Conv2d,"Conv2D(a=1x1920x16x16,b=1920x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d318Conv2dconv2d,MXU,1,Compute,91983,91983,47472,0,0,0,0,0,0,0,0,91983,11489,0,0,45875200,"DT_BFLOAT16:[1,1920,16,16],DT_BFLOAT16:[1920,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",11324620800,Conv2d318Conv2dconv2d,Conv2D,44236800,[],Conv2D,bf01;io01->bf01,"[[1, 1, 18, 18], [1, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,2700,1,1280,256,17280,0,91983,45875200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,28544,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.11645412739311,464.483756509355,0.9689789831865114,0.5160930627881722,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+320,Conv2d-GroupNorm320,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm320XnormGroupNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm320XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +321,Conv2d320Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d320Conv2dconv2d,MXU,1,Compute,61345,61345,31874,0,0,0,0,0,0,0,0,61345,7659,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d320Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,61345,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,19061,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.07029423750917,467.62610542831527,0.9686156851734168,0.5195845615870169,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +322,SkipConnection-Einsum317,"XlaEinsum(a=1x16x16x1920,b=1920x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum317einsum,MXU,1,Compute,10281,10281,6782,0,0,0,0,0,0,0,0,10281,1276,0,0,6553600,"DT_BFLOAT16:[1,16,16,1920],DT_BFLOAT16:[1920,1280]","[DT_BFLOAT16:(1,16,16,1280)]",1258291200,SkipConnectionEinsum317einsum,Einsum,4915200,[],Einsum,"BHWC,CO->BHWO","[[1, 16, 16, 1920], [1920, 1280], [1, 16, 16, 
1280]]",1,3801088,300,1,256,1280,1920,0,10281,6553600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3362,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.38996206594689,593.669450928898,0.9632611809318686,0.6596327232543311,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +323,SpatialTransformer-Input_GroupNorm323,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm323XnormGroupNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm323XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +324,SpatialTransformer-Proj_in324,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin324einsum,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin324einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+325,BasicTransformerBlock-Input_layernorm325,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm325XnormLayerNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm325XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +326,SelfAttention326-Q-326,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention326Q326MatMulQ,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention326Q326MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +326,SelfAttention326-K-326,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention326K326MatMulK,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention326K326MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +326,SelfAttention326-V-326,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention326V326MatMulV,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention326V326MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +327,SelfAttention326-FlashAttention-327,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention326FlashAttention327FlashAttention,MXU,1,Compute,4494,4494,2713,0,0,0,0,0,0,0,0,4494,1088,0,0,2621440,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention326FlashAttention327FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 128]",,335872,128,8,256,256,160,544,4494,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1440,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.199903871829106,543.2590676457498,0.0330550340492675,0.6036211862730554,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+328,SelfAttention326-Attention_output-328,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention326Attentionoutput328MatMulattnOutputattnAvgWo,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention326Attentionoutput328MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +329,SelfAttention326-Attention_layernorm-329,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention326Attentionlayernorm329YnormLayerNormy,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention326Attentionlayernorm329YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +330,GatedSelfAttention-Linear330,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear330XLinearcontext,MXU,1,Memory,2069,510,2069,0,0,0,0,0,0,0,0,0,510,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear330XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 
1280]]",1,1998848,60,1,8,1280,768,0,510,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,241,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.602049299178347,899.7449326365394,0.05983136902586118,0.9997165918183771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +331,GatedSelfAttention-Attn330-Q-331,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330Q331MatMulQ,MXU,1,Compute,10281,10281,4790,0,0,0,0,0,0,0,0,10281,1276,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn330Q331MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,10281,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.14309892033849,419.27904971853417,0.6622420618906597,0.4658656107983713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +331,GatedSelfAttention-Attn330-K-331,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330K331MatMulK,MXU,1,Compute,10281,10281,4790,0,0,0,0,0,0,0,0,10281,1276,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn330K331MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,10281,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.14309892033849,419.27904971853417,0.6622420618906597,0.4658656107983713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+331,GatedSelfAttention-Attn330-V-331,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330V331MatMulV,MXU,1,Compute,10281,10281,4790,0,0,0,0,0,0,0,0,10281,1276,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn330V331MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,10281,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.14309892033849,419.27904971853417,0.6622420618906597,0.4658656107983713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +332,GatedSelfAttention-Attn330-FlashAttention-332,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn330FlashAttention332FlashAttention,MXU,1,Compute,9942,9942,2798,0,0,0,0,0,0,0,0,9942,1803,0,0,2703360,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn330FlashAttention332FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 128]",,345088,288,8,264,264,160,579,9942,2703360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2812,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.018954737477369,253.23880459791792,0.015890034540761953,0.28137644955324215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+333,GatedSelfAttention-Attn330-Attention_output-333,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330Attentionoutput333MatMulattnOutputattnAvgWo,MXU,1,Compute,10281,10281,4790,0,0,0,0,0,0,0,0,10281,1276,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn330Attentionoutput333MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,3837952,300,1,264,1280,1280,0,10281,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.14309892033849,419.27904971853417,0.6622420618906597,0.4658656107983713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +334,GatedSelfAttention-Attn330-Attention_layernorm-334,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330Attentionlayernorm334YnormLayerNormy,VPU,1,Memory,1399,703,1399,0,0,0,0,0,0,0,0,0,703,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn330Attentionlayernorm334YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,703,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,339,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9323516797712652,899.8213707335597,0.5018782412969751,0.9998015230372885,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+335,GatedSelfAttention-FFN330Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN330FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,27303,27303,16955,0,0,0,0,0,0,0,0,27303,3404,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN330FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,27303,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8807,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,558.8685881588104,0.9672478678690296,0.6209650979542338,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +336,GatedSelfAttention-FFN330Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN330FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,27303,27303,16955,0,0,0,0,0,0,0,0,27303,3404,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN330FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,27303,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8807,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,558.8685881588104,0.9672478678690296,0.6209650979542338,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+337,GatedSelfAttention-FFN330Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN330FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1357,86,1357,0,0,0,0,0,0,0,0,0,86,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN330FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,86,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24147383935151068,899.5601510685335,0.06271656814939086,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +338,BasicTransformerBlock-Fuser_output_layernorm338,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm338XnormLayerNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm338XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +339,CrossAttention339-Q-339,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention339Q339MatMulQ,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention339Q339MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +339,CrossAttention339-K-339,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention339K339MatMulK,MXU,1,Compute,8239,8239,4205,0,0,0,0,0,0,0,0,8239,1021,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention339K339MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,8239,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2551,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.17902172593762,459.30084810049766,0.9616009905241454,0.5103342756672197,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +339,CrossAttention339-V-339,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention339V339MatMulV,MXU,1,Compute,8239,8239,4205,0,0,0,0,0,0,0,0,8239,1021,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention339V339MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,8239,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2551,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.17902172593762,459.30084810049766,0.9616009905241454,0.5103342756672197,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +340,CrossAttention339-FlashAttention-340,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention339FlashAttention340FlashAttention,MXU,1,Compute,8852,8852,4070,0,0,0,0,0,0,0,0,8852,2177,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention339FlashAttention340FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,256,8,512,256,160,1089,8852,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2688,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.264430185268866,413.7041770221419,0.03356288364604793,0.45967130780237986,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +341,CrossAttention339-Attention_output-341,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention339Attentionoutput341MatMulattnOutputattnAvgWo,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention339Attentionoutput341MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+342,CrossAttention339-Attention_layernorm-342,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention339Attentionlayernorm342YnormLayerNormy,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention339Attentionlayernorm342YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +343,BasicTransformerBlock-Attn_output_layernorm343,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm343XnormLayerNormX,VPU,1,Memory,1357,681,1357,0,0,0,0,0,0,0,0,0,681,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm343XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,681,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9317907148120854,899.5601510685335,0.5017325451951269,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +344,BasicTransformerBlock-FFN344Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN344FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,27303,27303,16955,0,0,0,0,0,0,0,0,27303,3404,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN344FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 
5120]]",1,13631488,800,1,256,5120,1280,0,27303,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8807,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,558.8685881588104,0.9672478678690296,0.6209650979542338,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +345,BasicTransformerBlock-FFN344Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN344FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,27303,27303,16955,0,0,0,0,0,0,0,0,27303,3404,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN344FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,27303,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8807,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,558.8685881588104,0.9672478678690296,0.6209650979542338,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +346,BasicTransformerBlock-FFN344Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN344FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1357,86,1357,0,0,0,0,0,0,0,0,0,86,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN344FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,86,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24147383935151068,899.5601510685335,0.06271656814939086,0.9995112789650372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+347,SpatialTransformer-Proj_out347,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout347einsum,MXU,1,Compute,6877,6877,4748,0,0,0,0,0,0,0,0,6877,851,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout347einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,6877,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,621.2681310891377,0.9600395716310932,0.6902979234323752,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +348,Upsample348,"Upsample(a=1x1280x16x16,scale_factor=2,memory_placements=0_0_0,type=DT_BFLOAT16)",Upsample348Upsample,VPU,1,Memory,3391,0,3391,0,0,0,0,0,0,0,0,0,0,0,0,3276800,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,32,32)]",0,Upsample348Upsample,Upsample,0,[],Upsample,,,,,0,,,,,0,0,3276800,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,396,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,899.958069153642,0.0,0.9999534101707133,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +348,Upsample-Conv2d348Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",UpsampleConv2d348Conv2dconv2d,MXU,1,Compute,61345,61345,31874,0,0,0,0,0,0,0,0,61345,7659,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,UpsampleConv2d348Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 
16]]",1,24911872,1800,1,1280,256,11520,0,61345,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,19061,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.07029423750917,467.62610542831527,0.9686156851734168,0.5195845615870169,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +349,Time-Embed-MLP-Einsum349,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum349einsum,MXU,1,Memory,1700,425,1700,0,0,0,0,0,0,0,0,0,425,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum349einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1314048,50,1,1,640,1280,0,425,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,198,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9637647058823529,899.6795205508961,0.007585239124663404,0.9996439117232179,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +350,Conv2d-GroupNorm350,"GroupNorm(x=1x1920x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm350XnormGroupNormX,VPU,1,Memory,8139,4086,8139,0,0,0,0,0,0,0,0,0,4086,0,0,7864320,"DT_BFLOAT16:[1,1920,32,32]","[DT_BFLOAT16:(1,1920,32,32)]",15728640,Conv2dGroupNorm350XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,4086,7864320,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1972,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +351,Conv2d350Conv2d,"Conv2D(a=1x1920x32x32,b=1920x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d350Conv2dconv2d,MXU,1,Compute,183898,183898,28314,0,0,0,0,0,0,0,0,183898,22978,0,0,27361280,"DT_BFLOAT16:[1,1920,32,32],DT_BFLOAT16:[1920,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",22649241600,Conv2d350Conv2dconv2d,Conv2D,22118400,[],Conv2D,bf01;io01->bf01,"[[1, 1920, 32, 32], [1920, 640, 3, 3], [1, 640, 32, 32]]",1,15474688,5400,1,640,1024,17280,0,183898,27361280,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,49283,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.16197892309867,138.56691064815823,0.9693372827376576,0.15396323405350915,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +352,Conv2d-GroupNorm352,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm352XnormGroupNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm352XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +353,Conv2d352Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d352Conv2dconv2d,MXU,1,Compute,61345,61345,10343,0,0,0,0,0,0,0,0,61345,7659,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d352Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 
32]]",1,10163200,1800,1,640,1024,5760,0,61345,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16545,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.07029423750917,151.72974697408102,0.9686156851734168,0.1685886077489789,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +354,SkipConnection-Einsum349,"XlaEinsum(a=1x32x32x1920,b=1920x640,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum349einsum,MXU,1,Compute,20494,20494,7969,0,0,0,0,0,0,0,0,20494,2553,0,0,7700480,"DT_BFLOAT16:[1,32,32,1920],DT_BFLOAT16:[1920,640]","[DT_BFLOAT16:(1,32,32,640)]",2516582400,SkipConnectionEinsum349einsum,Einsum,2457600,[],Einsum,"BHWC,CO->BHWO","[[1, 32, 32, 1920], [1920, 640], [1, 32, 32, 640]]",1,4718592,600,1,1024,640,1920,0,20494,7700480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6054,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.79605738264858,349.93807257611985,0.9664573242081138,0.3888200806401332,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +355,SpatialTransformer-Input_GroupNorm355,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm355XnormGroupNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm355XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+356,SpatialTransformer-Proj_in356,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin356einsum,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin356einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +357,BasicTransformerBlock-Input_layernorm357,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm357XnormLayerNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm357XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +358,SelfAttention358-Q-358,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention358Q358MatMulQ,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention358Q358MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +358,SelfAttention358-K-358,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention358K358MatMulK,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention358K358MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +358,SelfAttention358-V-358,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention358V358MatMulV,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention358V358MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +359,SelfAttention358-FlashAttention-359,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",SelfAttention358FlashAttention359FlashAttention,MXU,1,Compute,34996,34996,5426,0,0,0,0,0,0,0,0,34996,13070,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention358FlashAttention359FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 128]",,872448,1024,8,1024,1024,80,8714,34996,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9383,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.629268716424734,139.52487427134528,0.06791602378210453,0.15502763807927253,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +360,SelfAttention358-Attention_output-360,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention358Attentionoutput360MatMulattnOutputattnAvgWo,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention358Attentionoutput360MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+361,SelfAttention358-Attention_layernorm-361,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention358Attentionlayernorm361YnormLayerNormy,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention358Attentionlayernorm361YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +362,GatedSelfAttention-Linear362,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear362XLinearcontext,MXU,1,Memory,1041,255,1041,0,0,0,0,0,0,0,0,0,255,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear362XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 640]]",1,1005568,30,1,8,640,768,0,255,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,121,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.554582132564842,899.623610673331,0.05945778218756329,0.9995817896370344,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +363,GatedSelfAttention-Attn362-Q-363,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362Q363MatMulQ,MXU,1,Compute,7728,7728,3582,0,0,0,0,0,0,0,0,7728,957,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn362Q363MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 
80]]",1,3461120,225,1,1032,640,640,0,7728,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,109.39627329192547,417.1091083660876,0.8609953105790294,0.46345456485120845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +363,GatedSelfAttention-Attn362-K-363,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362K363MatMulK,MXU,1,Compute,7728,7728,3582,0,0,0,0,0,0,0,0,7728,957,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn362K363MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,7728,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,109.39627329192547,417.1091083660876,0.8609953105790294,0.46345456485120845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +363,GatedSelfAttention-Attn362-V-363,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362V363MatMulV,MXU,1,Compute,7728,7728,3582,0,0,0,0,0,0,0,0,7728,957,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn362V363MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,7728,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,109.39627329192547,417.1091083660876,0.8609953105790294,0.46345456485120845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +364,GatedSelfAttention-Attn362-FlashAttention-364,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",GatedSelfAttentionAttn362FlashAttention364FlashAttention,MXU,1,Compute,44256,44256,5468,0,0,0,0,0,0,0,0,44256,14366,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn362FlashAttention364FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 128]",,879104,1296,8,1032,1032,80,8852,44256,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11703,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,6.930741865509762,111.19304665257253,0.05454789331912377,0.12354782961396948,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +365,GatedSelfAttention-Attn362-Attention_output-365,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362Attentionoutput365MatMulattnOutputattnAvgWo,MXU,1,Compute,7728,7728,3582,0,0,0,0,0,0,0,0,7728,957,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn362Attentionoutput365MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,225,1,1032,640,640,0,7728,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,109.39627329192547,417.1091083660876,0.8609953105790294,0.46345456485120845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+366,GatedSelfAttention-Attn362-Attention_layernorm-366,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362Attentionlayernorm366YnormLayerNormy,VPU,1,Memory,2734,1373,2734,0,0,0,0,0,0,0,0,0,1373,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn362Attentionlayernorm366YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1373,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,662,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9326408193123628,899.9560118244788,0.5019533377951407,0.9999511242494209,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +367,GatedSelfAttention-FFN362Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN362FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,27303,27303,10173,0,0,0,0,0,0,0,0,27303,3404,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN362FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,27303,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,335.3211528952862,0.9672478678690296,0.3725790587725402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +368,GatedSelfAttention-FFN362Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN362FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,27303,27303,10173,0,0,0,0,0,0,0,0,27303,3404,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN362FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 
1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,27303,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,335.3211528952862,0.9672478678690296,0.3725790587725402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +369,GatedSelfAttention-FFN362Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN362FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2713,171,2713,0,0,0,0,0,0,0,0,0,171,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN362FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,171,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,359,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24156284555842242,899.8917250276447,0.06273968520362949,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +370,BasicTransformerBlock-Fuser_output_layernorm370,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm370XnormLayerNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm370XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+371,CrossAttention371-Q-371,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention371Q371MatMulQ,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention371Q371MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +371,CrossAttention371-K-371,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention371K371MatMulK,MXU,1,Compute,4154,4154,2510,0,0,0,0,0,0,0,0,4154,510,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention371K371MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,4154,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.16429465575348,543.6448678984111,0.9536146558652423,0.6040498532204568,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +371,CrossAttention371-V-371,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention371V371MatMulV,MXU,1,Compute,4154,4154,2510,0,0,0,0,0,0,0,0,4154,510,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention371V371MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 
80]]",1,2424832,120,1,512,640,768,0,4154,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.16429465575348,543.6448678984111,0.9536146558652423,0.6040498532204568,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +372,CrossAttention371-FlashAttention-372,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention371FlashAttention372FlashAttention,MXU,1,Compute,17566,17566,4070,0,0,0,0,0,0,0,0,17566,6535,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention371FlashAttention372FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,446464,512,8,512,1024,80,4357,17566,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.595863827849254,208.4771362290789,0.06765311306724724,0.23164126247675432,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +373,CrossAttention371-Attention_output-373,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention371Attentionoutput373MatMulattnOutputattnAvgWo,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention371Attentionoutput373MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+374,CrossAttention371-Attention_layernorm-374,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention371Attentionlayernorm374YnormLayerNormy,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention371Attentionlayernorm374YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +375,BasicTransformerBlock-Attn_output_layernorm375,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm375XnormLayerNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm375XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +376,BasicTransformerBlock-FFN376Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN376FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,27303,27303,10173,0,0,0,0,0,0,0,0,27303,3404,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN376FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 
2560]]",1,9830400,800,1,1024,2560,640,0,27303,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,335.3211528952862,0.9672478678690296,0.3725790587725402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +377,BasicTransformerBlock-FFN376Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN376FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,27303,27303,10173,0,0,0,0,0,0,0,0,27303,3404,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN376FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,27303,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,335.3211528952862,0.9672478678690296,0.3725790587725402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +378,BasicTransformerBlock-FFN376Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN376FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2713,171,2713,0,0,0,0,0,0,0,0,0,171,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN376FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,171,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,359,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24156284555842242,899.8917250276447,0.06273968520362949,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+379,SpatialTransformer-Proj_out379,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout379einsum,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout379einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +380,Time-Embed-MLP-Einsum380,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum380einsum,MXU,1,Memory,1700,425,1700,0,0,0,0,0,0,0,0,0,425,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum380einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1314048,50,1,1,640,1280,0,425,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,198,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9637647058823529,899.6795205508961,0.007585239124663404,0.9996439117232179,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+381,Conv2d-GroupNorm381,"GroupNorm(x=1x1280x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm381XnormGroupNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,1280,32,32]","[DT_BFLOAT16:(1,1280,32,32)]",10485760,Conv2dGroupNorm381XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +382,Conv2d381Conv2d,"Conv2D(a=1x1280x32x32,b=1280x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d381Conv2dconv2d,MXU,1,Compute,122622,122622,19328,0,0,0,0,0,0,0,0,122622,15319,0,0,18677760,"DT_BFLOAT16:[1,1280,32,32],DT_BFLOAT16:[1280,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",15099494400,Conv2d381Conv2dconv2d,Conv2D,14745600,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 32, 32], [1280, 640, 3, 3], [1, 640, 32, 32]]",1,15474688,3600,1,640,1024,11520,0,122622,18677760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,32914,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.13854283896853,141.85887957503547,0.9691528307638637,0.15762097730559496,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+383,Conv2d-GroupNorm383,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm383XnormGroupNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm383XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +384,Conv2d383Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d383Conv2dconv2d,MXU,1,Compute,61345,61345,10343,0,0,0,0,0,0,0,0,61345,7659,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d383Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,1800,1,640,1024,5760,0,61345,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16545,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.07029423750917,151.72974697408102,0.9686156851734168,0.1685886077489789,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +385,SkipConnection-Einsum380,"XlaEinsum(a=1x32x32x1280,b=1280x640,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum380einsum,MXU,1,Compute,13686,13686,5765,0,0,0,0,0,0,0,0,13686,1702,0,0,5570560,"DT_BFLOAT16:[1,32,32,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,32,32,640)]",1677721600,SkipConnectionEinsum380einsum,Einsum,1638400,[],Einsum,"BHWC,CO->BHWO","[[1, 32, 32, 1280], [1280, 640], [1, 32, 32, 
640]]",1,4718592,400,1,1024,640,1280,0,13686,5570560,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4095,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.58670173900336,379.072649514102,0.9648096060363917,0.4211918327934467,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +386,SpatialTransformer-Input_GroupNorm386,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm386XnormGroupNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm386XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +387,SpatialTransformer-Proj_in387,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin387einsum,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin387einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+388,BasicTransformerBlock-Input_layernorm388,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm388XnormLayerNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm388XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +389,SelfAttention389-Q-389,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention389Q389MatMulQ,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention389Q389MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +389,SelfAttention389-K-389,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention389K389MatMulK,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention389K389MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +389,SelfAttention389-V-389,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention389V389MatMulV,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention389V389MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +390,SelfAttention389-FlashAttention-390,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention389FlashAttention390FlashAttention,MXU,1,Compute,34996,34996,5426,0,0,0,0,0,0,0,0,34996,13070,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention389FlashAttention390FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 128]",,872448,1024,8,1024,1024,80,8714,34996,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9383,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.629268716424734,139.52487427134528,0.06791602378210453,0.15502763807927253,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+391,SelfAttention389-Attention_output-391,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention389Attentionoutput391MatMulattnOutputattnAvgWo,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention389Attentionoutput391MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +392,SelfAttention389-Attention_layernorm-392,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention389Attentionlayernorm392YnormLayerNormy,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention389Attentionlayernorm392YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +393,GatedSelfAttention-Linear393,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear393XLinearcontext,MXU,1,Memory,1041,255,1041,0,0,0,0,0,0,0,0,0,255,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear393XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 
640]]",1,1005568,30,1,8,640,768,0,255,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,121,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.554582132564842,899.623610673331,0.05945778218756329,0.9995817896370344,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +394,GatedSelfAttention-Attn393-Q-394,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393Q394MatMulQ,MXU,1,Compute,7728,7728,3582,0,0,0,0,0,0,0,0,7728,957,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn393Q394MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,7728,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,109.39627329192547,417.1091083660876,0.8609953105790294,0.46345456485120845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +394,GatedSelfAttention-Attn393-K-394,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393K394MatMulK,MXU,1,Compute,7728,7728,3582,0,0,0,0,0,0,0,0,7728,957,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn393K394MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,7728,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,109.39627329192547,417.1091083660876,0.8609953105790294,0.46345456485120845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+394,GatedSelfAttention-Attn393-V-394,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393V394MatMulV,MXU,1,Compute,7728,7728,3582,0,0,0,0,0,0,0,0,7728,957,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn393V394MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,7728,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,109.39627329192547,417.1091083660876,0.8609953105790294,0.46345456485120845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +395,GatedSelfAttention-Attn393-FlashAttention-395,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn393FlashAttention395FlashAttention,MXU,1,Compute,44256,44256,5468,0,0,0,0,0,0,0,0,44256,14366,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn393FlashAttention395FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 128]",,879104,1296,8,1032,1032,80,8852,44256,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11703,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,6.930741865509762,111.19304665257253,0.05454789331912377,0.12354782961396948,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+396,GatedSelfAttention-Attn393-Attention_output-396,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393Attentionoutput396MatMulattnOutputattnAvgWo,MXU,1,Compute,7728,7728,3582,0,0,0,0,0,0,0,0,7728,957,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn393Attentionoutput396MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,225,1,1032,640,640,0,7728,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,109.39627329192547,417.1091083660876,0.8609953105790294,0.46345456485120845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +397,GatedSelfAttention-Attn393-Attention_layernorm-397,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393Attentionlayernorm397YnormLayerNormy,VPU,1,Memory,2734,1373,2734,0,0,0,0,0,0,0,0,0,1373,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn393Attentionlayernorm397YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1373,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,662,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9326408193123628,899.9560118244788,0.5019533377951407,0.9999511242494209,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+398,GatedSelfAttention-FFN393Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN393FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,27303,27303,10173,0,0,0,0,0,0,0,0,27303,3404,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN393FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,27303,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,335.3211528952862,0.9672478678690296,0.3725790587725402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +399,GatedSelfAttention-FFN393Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN393FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,27303,27303,10173,0,0,0,0,0,0,0,0,27303,3404,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN393FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,27303,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,335.3211528952862,0.9672478678690296,0.3725790587725402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+400,GatedSelfAttention-FFN393Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN393FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2713,171,2713,0,0,0,0,0,0,0,0,0,171,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN393FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,171,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,359,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24156284555842242,899.8917250276447,0.06273968520362949,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +401,BasicTransformerBlock-Fuser_output_layernorm401,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm401XnormLayerNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm401XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +402,CrossAttention402-Q-402,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention402Q402MatMulQ,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention402Q402MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +402,CrossAttention402-K-402,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention402K402MatMulK,MXU,1,Compute,4154,4154,2510,0,0,0,0,0,0,0,0,4154,510,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention402K402MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,4154,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.16429465575348,543.6448678984111,0.9536146558652423,0.6040498532204568,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +402,CrossAttention402-V-402,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention402V402MatMulV,MXU,1,Compute,4154,4154,2510,0,0,0,0,0,0,0,0,4154,510,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention402V402MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,4154,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.16429465575348,543.6448678984111,0.9536146558652423,0.6040498532204568,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +403,CrossAttention402-FlashAttention-403,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention402FlashAttention403FlashAttention,MXU,1,Compute,17566,17566,4070,0,0,0,0,0,0,0,0,17566,6535,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention402FlashAttention403FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,446464,512,8,512,1024,80,4357,17566,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.595863827849254,208.4771362290789,0.06765311306724724,0.23164126247675432,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +404,CrossAttention402-Attention_output-404,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention402Attentionoutput404MatMulattnOutputattnAvgWo,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention402Attentionoutput404MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+405,CrossAttention402-Attention_layernorm-405,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention402Attentionlayernorm405YnormLayerNormy,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention402Attentionlayernorm405YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +406,BasicTransformerBlock-Attn_output_layernorm406,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm406XnormLayerNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm406XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +407,BasicTransformerBlock-FFN407Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN407FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,27303,27303,10173,0,0,0,0,0,0,0,0,27303,3404,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN407FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 
2560]]",1,9830400,800,1,1024,2560,640,0,27303,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,335.3211528952862,0.9672478678690296,0.3725790587725402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +408,BasicTransformerBlock-FFN407Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN407FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,27303,27303,10173,0,0,0,0,0,0,0,0,27303,3404,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN407FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,27303,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,335.3211528952862,0.9672478678690296,0.3725790587725402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +409,BasicTransformerBlock-FFN407Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN407FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2713,171,2713,0,0,0,0,0,0,0,0,0,171,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN407FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,171,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,359,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24156284555842242,899.8917250276447,0.06273968520362949,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+410,SpatialTransformer-Proj_out410,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout410einsum,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout410einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +411,Time-Embed-MLP-Einsum411,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum411einsum,MXU,1,Memory,1700,425,1700,0,0,0,0,0,0,0,0,0,425,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum411einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1314048,50,1,1,640,1280,0,425,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,198,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9637647058823529,899.6795205508961,0.007585239124663404,0.9996439117232179,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+412,Conv2d-GroupNorm412,"GroupNorm(x=1x960x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm412XnormGroupNormX,VPU,1,Memory,4070,2043,4070,0,0,0,0,0,0,0,0,0,2043,0,0,3932160,"DT_BFLOAT16:[1,960,32,32]","[DT_BFLOAT16:(1,960,32,32)]",7864320,Conv2dGroupNorm412XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2043,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,986,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9322653562653562,899.7811732186732,0.5018558210047572,0.9997568591318591,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +413,Conv2d412Conv2d,"Conv2D(a=1x960x32x32,b=960x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d412Conv2dconv2d,MXU,1,Compute,92664,92664,14835,0,0,0,0,0,0,0,0,92664,11574,0,0,14336000,"DT_BFLOAT16:[1,960,32,32],DT_BFLOAT16:[960,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",11324620800,Conv2d412Conv2dconv2d,Conv2D,11059200,[],Conv2D,bf01;io01->bf01,"[[1, 960, 32, 32], [960, 640, 3, 3], [1, 640, 32, 32]]",1,14589440,2720,1,640,1024,8640,0,92664,14336000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,24899,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.21165501165501,144.084438721483,0.961857828395546,0.16009382080164777,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +414,Conv2d-GroupNorm414,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm414XnormGroupNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm414XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+415,Conv2d414Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d414Conv2dconv2d,MXU,1,Compute,61345,61345,10343,0,0,0,0,0,0,0,0,61345,7659,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d414Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,1800,1,640,1024,5760,0,61345,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16545,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.07029423750917,151.72974697408102,0.9686156851734168,0.1685886077489789,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +416,SkipConnection-Einsum411,"XlaEinsum(a=1x32x32x960,b=960x640,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum411einsum,MXU,1,Compute,10962,10962,4663,0,0,0,0,0,0,0,0,10962,1361,0,0,4505600,"DT_BFLOAT16:[1,32,32,960],DT_BFLOAT16:[960,640]","[DT_BFLOAT16:(1,32,32,640)]",1258291200,SkipConnectionEinsum411einsum,Einsum,1228800,[],Einsum,"BHWC,CO->BHWO","[[1, 32, 32, 960], [960, 640], [1, 32, 32, 640]]",1,4505600,320,1,1024,640,960,0,10962,4505600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3285,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,114.78664477285167,382.7920992690659,0.9034198322532879,0.4253245547434065,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+417,SpatialTransformer-Input_GroupNorm417,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm417XnormGroupNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm417XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +418,SpatialTransformer-Proj_in418,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin418einsum,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin418einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+419,BasicTransformerBlock-Input_layernorm419,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm419XnormLayerNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm419XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +420,SelfAttention420-Q-420,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention420Q420MatMulQ,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention420Q420MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +420,SelfAttention420-K-420,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention420K420MatMulK,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention420K420MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +420,SelfAttention420-V-420,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention420V420MatMulV,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention420V420MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +421,SelfAttention420-FlashAttention-421,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention420FlashAttention421FlashAttention,MXU,1,Compute,34996,34996,5426,0,0,0,0,0,0,0,0,34996,13070,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention420FlashAttention421FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 128]",,872448,1024,8,1024,1024,80,8714,34996,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9383,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.629268716424734,139.52487427134528,0.06791602378210453,0.15502763807927253,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+422,SelfAttention420-Attention_output-422,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention420Attentionoutput422MatMulattnOutputattnAvgWo,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention420Attentionoutput422MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +423,SelfAttention420-Attention_layernorm-423,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention420Attentionlayernorm423YnormLayerNormy,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention420Attentionlayernorm423YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +424,GatedSelfAttention-Linear424,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear424XLinearcontext,MXU,1,Memory,1041,255,1041,0,0,0,0,0,0,0,0,0,255,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear424XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 
640]]",1,1005568,30,1,8,640,768,0,255,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,121,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.554582132564842,899.623610673331,0.05945778218756329,0.9995817896370344,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +425,GatedSelfAttention-Attn424-Q-425,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424Q425MatMulQ,MXU,1,Compute,7728,7728,3582,0,0,0,0,0,0,0,0,7728,957,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn424Q425MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,7728,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,109.39627329192547,417.1091083660876,0.8609953105790294,0.46345456485120845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +425,GatedSelfAttention-Attn424-K-425,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424K425MatMulK,MXU,1,Compute,7728,7728,3582,0,0,0,0,0,0,0,0,7728,957,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn424K425MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,7728,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,109.39627329192547,417.1091083660876,0.8609953105790294,0.46345456485120845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+425,GatedSelfAttention-Attn424-V-425,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424V425MatMulV,MXU,1,Compute,7728,7728,3582,0,0,0,0,0,0,0,0,7728,957,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn424V425MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,7728,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,109.39627329192547,417.1091083660876,0.8609953105790294,0.46345456485120845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +426,GatedSelfAttention-Attn424-FlashAttention-426,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn424FlashAttention426FlashAttention,MXU,1,Compute,44256,44256,5468,0,0,0,0,0,0,0,0,44256,14366,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn424FlashAttention426FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 128]",,879104,1296,8,1032,1032,80,8852,44256,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11703,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,6.930741865509762,111.19304665257253,0.05454789331912377,0.12354782961396948,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+427,GatedSelfAttention-Attn424-Attention_output-427,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424Attentionoutput427MatMulattnOutputattnAvgWo,MXU,1,Compute,7728,7728,3582,0,0,0,0,0,0,0,0,7728,957,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn424Attentionoutput427MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,225,1,1032,640,640,0,7728,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,109.39627329192547,417.1091083660876,0.8609953105790294,0.46345456485120845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +428,GatedSelfAttention-Attn424-Attention_layernorm-428,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424Attentionlayernorm428YnormLayerNormy,VPU,1,Memory,2734,1373,2734,0,0,0,0,0,0,0,0,0,1373,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn424Attentionlayernorm428YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1373,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,662,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9326408193123628,899.9560118244788,0.5019533377951407,0.9999511242494209,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+429,GatedSelfAttention-FFN424Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN424FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,27303,27303,10173,0,0,0,0,0,0,0,0,27303,3404,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN424FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,27303,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,335.3211528952862,0.9672478678690296,0.3725790587725402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +430,GatedSelfAttention-FFN424Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN424FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,27303,27303,10173,0,0,0,0,0,0,0,0,27303,3404,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN424FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,27303,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,335.3211528952862,0.9672478678690296,0.3725790587725402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+431,GatedSelfAttention-FFN424Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN424FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2713,171,2713,0,0,0,0,0,0,0,0,0,171,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN424FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,171,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,359,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24156284555842242,899.8917250276447,0.06273968520362949,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +432,BasicTransformerBlock-Fuser_output_layernorm432,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm432XnormLayerNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm432XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +433,CrossAttention433-Q-433,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention433Q433MatMulQ,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention433Q433MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +433,CrossAttention433-K-433,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention433K433MatMulK,MXU,1,Compute,4154,4154,2510,0,0,0,0,0,0,0,0,4154,510,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention433K433MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,4154,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.16429465575348,543.6448678984111,0.9536146558652423,0.6040498532204568,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +433,CrossAttention433-V-433,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention433V433MatMulV,MXU,1,Compute,4154,4154,2510,0,0,0,0,0,0,0,0,4154,510,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention433V433MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,4154,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.16429465575348,543.6448678984111,0.9536146558652423,0.6040498532204568,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +434,CrossAttention433-FlashAttention-434,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention433FlashAttention434FlashAttention,MXU,1,Compute,17566,17566,4070,0,0,0,0,0,0,0,0,17566,6535,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention433FlashAttention434FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,446464,512,8,512,1024,80,4357,17566,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.595863827849254,208.4771362290789,0.06765311306724724,0.23164126247675432,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +435,CrossAttention433-Attention_output-435,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention433Attentionoutput435MatMulattnOutputattnAvgWo,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention433Attentionoutput435MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+436,CrossAttention433-Attention_layernorm-436,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention433Attentionlayernorm436YnormLayerNormy,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention433Attentionlayernorm436YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +437,BasicTransformerBlock-Attn_output_layernorm437,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm437XnormLayerNormX,VPU,1,Memory,2713,1362,2713,0,0,0,0,0,0,0,0,0,1362,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm437XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1362,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +438,BasicTransformerBlock-FFN438Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN438FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,27303,27303,10173,0,0,0,0,0,0,0,0,27303,3404,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN438FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 
2560]]",1,9830400,800,1,1024,2560,640,0,27303,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,335.3211528952862,0.9672478678690296,0.3725790587725402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +439,BasicTransformerBlock-FFN438Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN438FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,27303,27303,10173,0,0,0,0,0,0,0,0,27303,3404,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN438FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,27303,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,122.89650221587372,335.3211528952862,0.9672478678690296,0.3725790587725402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +440,BasicTransformerBlock-FFN438Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN438FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2713,171,2713,0,0,0,0,0,0,0,0,0,171,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN438FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,171,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,359,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24156284555842242,899.8917250276447,0.06273968520362949,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+441,SpatialTransformer-Proj_out441,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout441einsum,MXU,1,Compute,6877,6877,3561,0,0,0,0,0,0,0,0,6877,851,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout441einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,6877,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,121.9806310891377,465.9510983168533,0.9600395716310932,0.5177234425742814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +442,Upsample442,"Upsample(a=1x640x32x32,scale_factor=2,memory_placements=0_0_0,type=DT_BFLOAT16)",Upsample442Upsample,VPU,1,Memory,6782,0,6782,0,0,0,0,0,0,0,0,0,0,0,0,6553600,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,64,64)]",0,Upsample442Upsample,Upsample,0,[],Upsample,,,,,0,,,,,0,0,6553600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,792,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,899.958069153642,0.0,0.9999534101707133,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +442,Upsample-Conv2d442Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",UpsampleConv2d442Conv2dconv2d,MXU,1,Compute,61345,61345,10343,0,0,0,0,0,0,0,0,61345,7659,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,UpsampleConv2d442Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 
32]]",1,10163200,1800,1,640,1024,5760,0,61345,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16545,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.07029423750917,151.72974697408102,0.9686156851734168,0.1685886077489789,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +443,Time-Embed-MLP-Einsum443,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum443einsum,MXU,1,Memory,852,212,852,0,0,0,0,0,0,0,0,0,212,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum443einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 320]]",1,658048,30,1,1,320,1280,0,212,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,99,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9615023474178404,898.9667668588844,0.007567433399018655,0.9988519631765382,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +444,Conv2d-GroupNorm444,"GroupNorm(x=1x960x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm444XnormGroupNormX,VPU,1,Memory,16277,8171,16277,0,0,0,0,0,0,0,0,0,8171,0,0,15728640,"DT_BFLOAT16:[1,960,64,64]","[DT_BFLOAT16:(1,960,64,64)]",31457280,Conv2dGroupNorm444XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,8171,15728640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3945,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9326214904466426,899.9470111199853,0.5019483176234839,0.9999411234666503,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +445,Conv2d444Conv2d,"Conv2D(a=1x960x64x64,b=960x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d444Conv2dconv2d,MXU,1,Compute,222298,222298,16573,0,0,0,0,0,0,0,0,222298,27778,0,0,16015360,"DT_BFLOAT16:[1,960,64,64],DT_BFLOAT16:[960,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",22649241600,Conv2d444Conv2dconv2d,Conv2D,5529600,[],Conv2D,bf01;io01->bf01,"[[1, 960, 64, 64], [960, 320, 3, 3], [1, 320, 64, 64]]",1,16514560,6528,1,320,4096,8640,0,222298,16015360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,57511,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,101.88684378626887,67.09671840769485,0.8018928988155077,0.07455190934188317,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +446,Conv2d-GroupNorm446,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm446XnormGroupNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm446XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +447,Conv2d446Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d446Conv2dconv2d,MXU,1,Compute,75235,75235,7333,0,0,0,0,0,0,0,0,75235,9395,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d446Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 
64]]",1,7252480,2208,1,320,4096,2880,0,75235,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,19665,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,100.34886954210143,87.71750208721008,0.7897883858172826,0.0974638912080112,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +448,SkipConnection-Einsum443,"XlaEinsum(a=1x64x64x960,b=960x320,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum443einsum,MXU,1,Compute,26213,26213,11487,0,0,0,0,0,0,0,0,26213,3268,0,0,11100160,"DT_BFLOAT16:[1,64,64,960],DT_BFLOAT16:[960,320]","[DT_BFLOAT16:(1,64,64,320)]",2516582400,SkipConnectionEinsum443einsum,Einsum,614400,[],Einsum,"BHWC,CO->BHWO","[[1, 64, 64, 960], [960, 320], [1, 64, 64, 320]]",1,11100160,768,1,4096,320,960,0,26213,11100160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7895,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,96.00512722694846,394.37796474435396,0.7556012818952842,0.43819773860483774,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +449,SpatialTransformer-Input_GroupNorm449,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm449XnormGroupNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm449XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+450,SpatialTransformer-Proj_in450,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin450einsum,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin450einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +451,BasicTransformerBlock-Input_layernorm451,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm451XnormLayerNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm451XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +452,SelfAttention452-Q-452,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention452Q452MatMulQ,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention452Q452MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +452,SelfAttention452-K-452,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention452K452MatMulK,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention452K452MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +452,SelfAttention452-V-452,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention452V452MatMulV,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention452V452MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +453,SelfAttention452-FlashAttention-453,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",SelfAttention452FlashAttention453FlashAttention,MXU,1,Compute,557890,557890,10851,0,0,0,0,0,0,0,0,557890,209156,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention452FlashAttention453FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,2762752,16384,8,4096,4096,40,139438,557890,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,140740,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.660915607019305,17.504570793525605,0.06816509830335099,0.01944952310391734,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +454,SelfAttention452-Attention_output-454,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention452Attentionoutput454MatMulattnOutputattnAvgWo,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention452Attentionoutput454MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+455,SelfAttention452-Attention_layernorm-455,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention452Attentionlayernorm455YnormLayerNormy,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention452Attentionlayernorm455YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +456,GatedSelfAttention-Linear456,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear456XLinearcontext,MXU,1,Memory,527,127,527,0,0,0,0,0,0,0,0,0,127,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear456XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 320]]",1,508928,18,1,8,320,768,0,127,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.461404174573055,899.3854558897651,0.05872443193287797,0.9993171732108501,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +457,GatedSelfAttention-Attn456-Q-457,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456Q457MatMulQ,MXU,1,Compute,10179,10179,5648,0,0,0,0,0,0,0,0,10179,1263,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn456Q457MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 
40]]",1,5457920,297,1,4104,320,320,0,10179,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,82.57188328912467,499.3696931373723,0.6498759250043183,0.5548552145970803,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +457,GatedSelfAttention-Attn456-K-457,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456K457MatMulK,MXU,1,Compute,10179,10179,5648,0,0,0,0,0,0,0,0,10179,1263,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn456K457MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,10179,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,82.57188328912467,499.3696931373723,0.6498759250043183,0.5548552145970803,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +457,GatedSelfAttention-Attn456-V-457,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456V457MatMulV,MXU,1,Compute,10179,10179,5648,0,0,0,0,0,0,0,0,10179,1263,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn456V457MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,10179,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,82.57188328912467,499.3696931373723,0.6498759250043183,0.5548552145970803,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +458,GatedSelfAttention-Attn456-FlashAttention-458,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",GatedSelfAttentionAttn456FlashAttention458FlashAttention,MXU,1,Compute,593294,593294,10872,0,0,0,0,0,0,0,0,593294,214128,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn456FlashAttention458FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 128]",,2768128,17424,8,4104,4104,40,139984,593294,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,149594,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.175931339268558,16.492158164970697,0.06434806534900428,0.018324620183300774,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +459,GatedSelfAttention-Attn456-Attention_output-459,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456Attentionoutput459MatMulattnOutputattnAvgWo,MXU,1,Compute,10179,10179,5648,0,0,0,0,0,0,0,0,10179,1263,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn456Attentionoutput459MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,297,1,4104,320,320,0,10179,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,82.57188328912467,499.3696931373723,0.6498759250043183,0.5548552145970803,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+460,GatedSelfAttention-Attn456-Attention_layernorm-460,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456Attentionlayernorm460YnormLayerNormy,VPU,1,Memory,5436,2729,5436,0,0,0,0,0,0,0,0,0,2729,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn456Attentionlayernorm460YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2729,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1317,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9327152317880796,899.9906628337128,0.5019726645061294,0.9999896253707921,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +461,GatedSelfAttention-FFN456Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN456FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,32749,32749,14412,0,0,0,0,0,0,0,0,32749,4085,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN456FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,32749,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9871,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.45940944761672,396.0417326674097,0.8063992346767264,0.44004636963045524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +462,GatedSelfAttention-FFN456Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN456FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,32749,32749,14412,0,0,0,0,0,0,0,0,32749,4085,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN456FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 
4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,32749,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9871,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.45940944761672,396.0417326674097,0.8063992346767264,0.44004636963045524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +463,GatedSelfAttention-FFN456Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN456FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,5426,341,5426,0,0,0,0,0,0,0,0,0,341,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN456FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,341,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,719,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24156284555842242,899.8917250276447,0.06273968520362949,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +464,BasicTransformerBlock-Fuser_output_layernorm464,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm464XnormLayerNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm464XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+465,CrossAttention465-Q-465,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention465Q465MatMulQ,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention465Q465MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +465,CrossAttention465-K-465,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention465K465MatMulK,MXU,1,Compute,2520,2520,1662,0,0,0,0,0,0,0,0,2520,306,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention465K465MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,2520,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,824,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,99.86438095238096,593.3973524305555,0.7859752540603605,0.6593303915895061,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +465,CrossAttention465-V-465,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention465V465MatMulV,MXU,1,Compute,2520,2520,1662,0,0,0,0,0,0,0,0,2520,306,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention465V465MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 
40]]",1,1605632,72,1,512,320,768,0,2520,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,824,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,99.86438095238096,593.3973524305555,0.7859752540603605,0.6593303915895061,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +466,CrossAttention465-FlashAttention-466,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention465FlashAttention466FlashAttention,MXU,1,Compute,69856,69856,6104,0,0,0,0,0,0,0,0,69856,26143,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention465FlashAttention466FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,354304,2048,8,512,4096,40,17429,69856,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,18177,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.646068712780577,78.63553685438616,0.06804824691589928,0.08737281872709574,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +467,CrossAttention465-Attention_output-467,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention465Attentionoutput467MatMulattnOutputattnAvgWo,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention465Attentionoutput467MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+468,CrossAttention465-Attention_layernorm-468,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention465Attentionlayernorm468YnormLayerNormy,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention465Attentionlayernorm468YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +469,BasicTransformerBlock-Attn_output_layernorm469,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm469XnormLayerNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm469XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +470,BasicTransformerBlock-FFN470Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN470FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,32749,32749,14412,0,0,0,0,0,0,0,0,32749,4085,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN470FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 
1280]]",1,13926400,960,1,4096,1280,320,0,32749,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9871,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.45940944761672,396.0417326674097,0.8063992346767264,0.44004636963045524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +471,BasicTransformerBlock-FFN470Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN470FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,32749,32749,14412,0,0,0,0,0,0,0,0,32749,4085,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN470FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,32749,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9871,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.45940944761672,396.0417326674097,0.8063992346767264,0.44004636963045524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +472,BasicTransformerBlock-FFN470Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN470FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,5426,341,5426,0,0,0,0,0,0,0,0,0,341,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN470FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,341,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,719,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24156284555842242,899.8917250276447,0.06273968520362949,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+473,SpatialTransformer-Proj_out473,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout473einsum,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout473einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +474,Time-Embed-MLP-Einsum474,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum474einsum,MXU,1,Memory,852,212,852,0,0,0,0,0,0,0,0,0,212,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum474einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 320]]",1,658048,30,1,1,320,1280,0,212,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,99,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9615023474178404,898.9667668588844,0.007567433399018655,0.9988519631765382,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+475,Conv2d-GroupNorm475,"GroupNorm(x=1x640x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm475XnormGroupNormX,VPU,1,Memory,10851,5447,10851,0,0,0,0,0,0,0,0,0,5447,0,0,10485760,"DT_BFLOAT16:[1,640,64,64]","[DT_BFLOAT16:(1,640,64,64)]",20971520,Conv2dGroupNorm475XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,5447,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2629,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9326808589070132,899.974656713667,0.5019637370415905,0.9999718407929633,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +476,Conv2d475Conv2d,"Conv2D(a=1x640x64x64,b=640x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d475Conv2dconv2d,MXU,1,Compute,147132,147132,11953,0,0,0,0,0,0,0,0,147132,18382,0,0,11550720,"DT_BFLOAT16:[1,640,64,64],DT_BFLOAT16:[640,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",15099494400,Conv2d475Conv2dconv2d,Conv2D,3686400,[],Conv2D,bf01;io01->bf01,"[[1, 640, 64, 64], [640, 320, 3, 3], [1, 320, 64, 64]]",1,11883520,4320,1,320,4096,5760,0,147132,11550720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,38180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.6254954734524,73.11425311327584,0.8077064025088118,0.08123805901475092,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+477,Conv2d-GroupNorm477,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm477XnormGroupNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm477XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +478,Conv2d477Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d477Conv2dconv2d,MXU,1,Compute,75235,75235,7333,0,0,0,0,0,0,0,0,75235,9395,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d477Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,2208,1,320,4096,2880,0,75235,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,19665,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,100.34886954210143,87.71750208721008,0.7897883858172826,0.0974638912080112,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +479,SkipConnection-Einsum474,"XlaEinsum(a=1x64x64x640,b=640x320,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum474einsum,MXU,1,Compute,16409,16409,8562,0,0,0,0,0,0,0,0,16409,2042,0,0,8273920,"DT_BFLOAT16:[1,64,64,640],DT_BFLOAT16:[640,320]","[DT_BFLOAT16:(1,64,64,320)]",1677721600,SkipConnectionEinsum474einsum,Einsum,409600,[],Einsum,"BHWC,CO->BHWO","[[1, 64, 64, 640], [640, 320], [1, 64, 64, 
320]]",1,8273920,480,1,4096,320,640,0,16409,8273920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5102,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.24398805533548,469.601345393534,0.8047037764771805,0.5217792726594822,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +480,SpatialTransformer-Input_GroupNorm480,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm480XnormGroupNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm480XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +481,SpatialTransformer-Proj_in481,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin481einsum,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin481einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+482,BasicTransformerBlock-Input_layernorm482,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm482XnormLayerNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm482XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +483,SelfAttention483-Q-483,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention483Q483MatMulQ,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention483Q483MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +483,SelfAttention483-K-483,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention483K483MatMulK,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention483K483MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +483,SelfAttention483-V-483,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention483V483MatMulV,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention483V483MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +484,SelfAttention483-FlashAttention-484,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention483FlashAttention484FlashAttention,MXU,1,Compute,557890,557890,10851,0,0,0,0,0,0,0,0,557890,209156,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention483FlashAttention484FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,2762752,16384,8,4096,4096,40,139438,557890,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,140740,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.660915607019305,17.504570793525605,0.06816509830335099,0.01944952310391734,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+485,SelfAttention483-Attention_output-485,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention483Attentionoutput485MatMulattnOutputattnAvgWo,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention483Attentionoutput485MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +486,SelfAttention483-Attention_layernorm-486,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention483Attentionlayernorm486YnormLayerNormy,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention483Attentionlayernorm486YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +487,GatedSelfAttention-Linear487,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear487XLinearcontext,MXU,1,Memory,527,127,527,0,0,0,0,0,0,0,0,0,127,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear487XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 
320]]",1,508928,18,1,8,320,768,0,127,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.461404174573055,899.3854558897651,0.05872443193287797,0.9993171732108501,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +488,GatedSelfAttention-Attn487-Q-488,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487Q488MatMulQ,MXU,1,Compute,10179,10179,5648,0,0,0,0,0,0,0,0,10179,1263,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn487Q488MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,10179,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,82.57188328912467,499.3696931373723,0.6498759250043183,0.5548552145970803,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +488,GatedSelfAttention-Attn487-K-488,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487K488MatMulK,MXU,1,Compute,10179,10179,5648,0,0,0,0,0,0,0,0,10179,1263,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn487K488MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,10179,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,82.57188328912467,499.3696931373723,0.6498759250043183,0.5548552145970803,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+488,GatedSelfAttention-Attn487-V-488,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487V488MatMulV,MXU,1,Compute,10179,10179,5648,0,0,0,0,0,0,0,0,10179,1263,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn487V488MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,10179,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,82.57188328912467,499.3696931373723,0.6498759250043183,0.5548552145970803,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +489,GatedSelfAttention-Attn487-FlashAttention-489,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn487FlashAttention489FlashAttention,MXU,1,Compute,593294,593294,10872,0,0,0,0,0,0,0,0,593294,214128,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn487FlashAttention489FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 128]",,2768128,17424,8,4104,4104,40,139984,593294,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,149594,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.175931339268558,16.492158164970697,0.06434806534900428,0.018324620183300774,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+490,GatedSelfAttention-Attn487-Attention_output-490,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487Attentionoutput490MatMulattnOutputattnAvgWo,MXU,1,Compute,10179,10179,5648,0,0,0,0,0,0,0,0,10179,1263,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn487Attentionoutput490MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,297,1,4104,320,320,0,10179,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,82.57188328912467,499.3696931373723,0.6498759250043183,0.5548552145970803,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +491,GatedSelfAttention-Attn487-Attention_layernorm-491,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487Attentionlayernorm491YnormLayerNormy,VPU,1,Memory,5436,2729,5436,0,0,0,0,0,0,0,0,0,2729,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn487Attentionlayernorm491YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2729,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1317,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9327152317880796,899.9906628337128,0.5019726645061294,0.9999896253707921,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+492,GatedSelfAttention-FFN487Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN487FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,32749,32749,14412,0,0,0,0,0,0,0,0,32749,4085,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN487FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,32749,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9871,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.45940944761672,396.0417326674097,0.8063992346767264,0.44004636963045524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +493,GatedSelfAttention-FFN487Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN487FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,32749,32749,14412,0,0,0,0,0,0,0,0,32749,4085,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN487FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,32749,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9871,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.45940944761672,396.0417326674097,0.8063992346767264,0.44004636963045524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+494,GatedSelfAttention-FFN487Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN487FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,5426,341,5426,0,0,0,0,0,0,0,0,0,341,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN487FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,341,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,719,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24156284555842242,899.8917250276447,0.06273968520362949,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +495,BasicTransformerBlock-Fuser_output_layernorm495,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm495XnormLayerNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm495XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +496,CrossAttention496-Q-496,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention496Q496MatMulQ,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention496Q496MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +496,CrossAttention496-K-496,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention496K496MatMulK,MXU,1,Compute,2520,2520,1662,0,0,0,0,0,0,0,0,2520,306,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention496K496MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,2520,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,824,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,99.86438095238096,593.3973524305555,0.7859752540603605,0.6593303915895061,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +496,CrossAttention496-V-496,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention496V496MatMulV,MXU,1,Compute,2520,2520,1662,0,0,0,0,0,0,0,0,2520,306,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention496V496MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,2520,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,824,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,99.86438095238096,593.3973524305555,0.7859752540603605,0.6593303915895061,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +497,CrossAttention496-FlashAttention-497,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention496FlashAttention497FlashAttention,MXU,1,Compute,69856,69856,6104,0,0,0,0,0,0,0,0,69856,26143,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention496FlashAttention497FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,354304,2048,8,512,4096,40,17429,69856,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,18177,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.646068712780577,78.63553685438616,0.06804824691589928,0.08737281872709574,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +498,CrossAttention496-Attention_output-498,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention496Attentionoutput498MatMulattnOutputattnAvgWo,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention496Attentionoutput498MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+499,CrossAttention496-Attention_layernorm-499,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention496Attentionlayernorm499YnormLayerNormy,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention496Attentionlayernorm499YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +500,BasicTransformerBlock-Attn_output_layernorm500,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm500XnormLayerNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm500XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +501,BasicTransformerBlock-FFN501Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN501FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,32749,32749,14412,0,0,0,0,0,0,0,0,32749,4085,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN501FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 
1280]]",1,13926400,960,1,4096,1280,320,0,32749,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9871,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.45940944761672,396.0417326674097,0.8063992346767264,0.44004636963045524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +502,BasicTransformerBlock-FFN501Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN501FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,32749,32749,14412,0,0,0,0,0,0,0,0,32749,4085,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN501FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,32749,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9871,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.45940944761672,396.0417326674097,0.8063992346767264,0.44004636963045524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +503,BasicTransformerBlock-FFN501Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN501FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,5426,341,5426,0,0,0,0,0,0,0,0,0,341,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN501FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,341,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,719,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24156284555842242,899.8917250276447,0.06273968520362949,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+504,SpatialTransformer-Proj_out504,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout504einsum,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout504einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +505,Time-Embed-MLP-Einsum505,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum505einsum,MXU,1,Memory,852,212,852,0,0,0,0,0,0,0,0,0,212,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum505einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 320]]",1,658048,30,1,1,320,1280,0,212,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,99,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9615023474178404,898.9667668588844,0.007567433399018655,0.9988519631765382,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+506,Conv2d-GroupNorm506,"GroupNorm(x=1x640x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm506XnormGroupNormX,VPU,1,Memory,10851,5447,10851,0,0,0,0,0,0,0,0,0,5447,0,0,10485760,"DT_BFLOAT16:[1,640,64,64]","[DT_BFLOAT16:(1,640,64,64)]",20971520,Conv2dGroupNorm506XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,5447,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2629,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9326808589070132,899.974656713667,0.5019637370415905,0.9999718407929633,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +507,Conv2d506Conv2d,"Conv2D(a=1x640x64x64,b=640x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d506Conv2dconv2d,MXU,1,Compute,147132,147132,11953,0,0,0,0,0,0,0,0,147132,18382,0,0,11550720,"DT_BFLOAT16:[1,640,64,64],DT_BFLOAT16:[640,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",15099494400,Conv2d506Conv2dconv2d,Conv2D,3686400,[],Conv2D,bf01;io01->bf01,"[[1, 640, 64, 64], [640, 320, 3, 3], [1, 320, 64, 64]]",1,11883520,4320,1,320,4096,5760,0,147132,11550720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,38180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.6254954734524,73.11425311327584,0.8077064025088118,0.08123805901475092,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+508,Conv2d-GroupNorm508,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm508XnormGroupNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm508XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +509,Conv2d508Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d508Conv2dconv2d,MXU,1,Compute,75235,75235,7333,0,0,0,0,0,0,0,0,75235,9395,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d508Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,2208,1,320,4096,2880,0,75235,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,19665,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,100.34886954210143,87.71750208721008,0.7897883858172826,0.0974638912080112,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +510,SkipConnection-Einsum505,"XlaEinsum(a=1x64x64x640,b=640x320,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum505einsum,MXU,1,Compute,16409,16409,8562,0,0,0,0,0,0,0,0,16409,2042,0,0,8273920,"DT_BFLOAT16:[1,64,64,640],DT_BFLOAT16:[640,320]","[DT_BFLOAT16:(1,64,64,320)]",1677721600,SkipConnectionEinsum505einsum,Einsum,409600,[],Einsum,"BHWC,CO->BHWO","[[1, 64, 64, 640], [640, 320], [1, 64, 64, 
320]]",1,8273920,480,1,4096,320,640,0,16409,8273920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5102,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.24398805533548,469.601345393534,0.8047037764771805,0.5217792726594822,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +511,SpatialTransformer-Input_GroupNorm511,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm511XnormGroupNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm511XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +512,SpatialTransformer-Proj_in512,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin512einsum,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin512einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+513,BasicTransformerBlock-Input_layernorm513,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm513XnormLayerNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm513XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +514,SelfAttention514-Q-514,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention514Q514MatMulQ,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention514Q514MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +514,SelfAttention514-K-514,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention514K514MatMulK,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention514K514MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +514,SelfAttention514-V-514,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention514V514MatMulV,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention514V514MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +515,SelfAttention514-FlashAttention-515,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention514FlashAttention515FlashAttention,MXU,1,Compute,557890,557890,10851,0,0,0,0,0,0,0,0,557890,209156,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention514FlashAttention515FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,2762752,16384,8,4096,4096,40,139438,557890,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,140740,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.660915607019305,17.504570793525605,0.06816509830335099,0.01944952310391734,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+516,SelfAttention514-Attention_output-516,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention514Attentionoutput516MatMulattnOutputattnAvgWo,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention514Attentionoutput516MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +517,SelfAttention514-Attention_layernorm-517,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention514Attentionlayernorm517YnormLayerNormy,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention514Attentionlayernorm517YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +518,GatedSelfAttention-Linear518,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear518XLinearcontext,MXU,1,Memory,527,127,527,0,0,0,0,0,0,0,0,0,127,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear518XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 
320]]",1,508928,18,1,8,320,768,0,127,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.461404174573055,899.3854558897651,0.05872443193287797,0.9993171732108501,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +519,GatedSelfAttention-Attn518-Q-519,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518Q519MatMulQ,MXU,1,Compute,10179,10179,5648,0,0,0,0,0,0,0,0,10179,1263,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn518Q519MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,10179,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,82.57188328912467,499.3696931373723,0.6498759250043183,0.5548552145970803,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +519,GatedSelfAttention-Attn518-K-519,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518K519MatMulK,MXU,1,Compute,10179,10179,5648,0,0,0,0,0,0,0,0,10179,1263,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn518K519MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,10179,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,82.57188328912467,499.3696931373723,0.6498759250043183,0.5548552145970803,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+519,GatedSelfAttention-Attn518-V-519,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518V519MatMulV,MXU,1,Compute,10179,10179,5648,0,0,0,0,0,0,0,0,10179,1263,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn518V519MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,10179,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,82.57188328912467,499.3696931373723,0.6498759250043183,0.5548552145970803,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +520,GatedSelfAttention-Attn518-FlashAttention-520,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn518FlashAttention520FlashAttention,MXU,1,Compute,593294,593294,10872,0,0,0,0,0,0,0,0,593294,214128,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn518FlashAttention520FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 128]",,2768128,17424,8,4104,4104,40,139984,593294,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,149594,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.175931339268558,16.492158164970697,0.06434806534900428,0.018324620183300774,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+521,GatedSelfAttention-Attn518-Attention_output-521,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518Attentionoutput521MatMulattnOutputattnAvgWo,MXU,1,Compute,10179,10179,5648,0,0,0,0,0,0,0,0,10179,1263,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn518Attentionoutput521MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,297,1,4104,320,320,0,10179,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3204,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,82.57188328912467,499.3696931373723,0.6498759250043183,0.5548552145970803,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +522,GatedSelfAttention-Attn518-Attention_layernorm-522,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518Attentionlayernorm522YnormLayerNormy,VPU,1,Memory,5436,2729,5436,0,0,0,0,0,0,0,0,0,2729,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn518Attentionlayernorm522YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2729,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1317,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9327152317880796,899.9906628337128,0.5019726645061294,0.9999896253707921,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+523,GatedSelfAttention-FFN518Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN518FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,32749,32749,14412,0,0,0,0,0,0,0,0,32749,4085,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN518FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,32749,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9871,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.45940944761672,396.0417326674097,0.8063992346767264,0.44004636963045524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +524,GatedSelfAttention-FFN518Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN518FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,32749,32749,14412,0,0,0,0,0,0,0,0,32749,4085,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN518FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,32749,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9871,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.45940944761672,396.0417326674097,0.8063992346767264,0.44004636963045524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+525,GatedSelfAttention-FFN518Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN518FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,5426,341,5426,0,0,0,0,0,0,0,0,0,341,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN518FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,341,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,719,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24156284555842242,899.8917250276447,0.06273968520362949,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +526,BasicTransformerBlock-Fuser_output_layernorm526,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm526XnormLayerNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm526XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +527,CrossAttention527-Q-527,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention527Q527MatMulQ,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention527Q527MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +527,CrossAttention527-K-527,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention527K527MatMulK,MXU,1,Compute,2520,2520,1662,0,0,0,0,0,0,0,0,2520,306,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention527K527MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,2520,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,824,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,99.86438095238096,593.3973524305555,0.7859752540603605,0.6593303915895061,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +527,CrossAttention527-V-527,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention527V527MatMulV,MXU,1,Compute,2520,2520,1662,0,0,0,0,0,0,0,0,2520,306,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention527V527MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,2520,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,824,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,99.86438095238096,593.3973524305555,0.7859752540603605,0.6593303915895061,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +528,CrossAttention527-FlashAttention-528,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention527FlashAttention528FlashAttention,MXU,1,Compute,69856,69856,6104,0,0,0,0,0,0,0,0,69856,26143,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention527FlashAttention528FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,354304,2048,8,512,4096,40,17429,69856,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,18177,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,8.646068712780577,78.63553685438616,0.06804824691589928,0.08737281872709574,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +529,CrossAttention527-Attention_output-529,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention527Attentionoutput529MatMulattnOutputattnAvgWo,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention527Attentionoutput529MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+530,CrossAttention527-Attention_layernorm-530,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention527Attentionlayernorm530YnormLayerNormy,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention527Attentionlayernorm530YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +531,BasicTransformerBlock-Attn_output_layernorm531,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm531XnormLayerNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm531XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +532,BasicTransformerBlock-FFN532Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN532FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,32749,32749,14412,0,0,0,0,0,0,0,0,32749,4085,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN532FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 
1280]]",1,13926400,960,1,4096,1280,320,0,32749,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9871,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.45940944761672,396.0417326674097,0.8063992346767264,0.44004636963045524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +533,BasicTransformerBlock-FFN532Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN532FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,32749,32749,14412,0,0,0,0,0,0,0,0,32749,4085,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN532FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,32749,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9871,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,102.45940944761672,396.0417326674097,0.8063992346767264,0.44004636963045524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +534,BasicTransformerBlock-FFN532Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN532FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,5426,341,5426,0,0,0,0,0,0,0,0,0,341,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN532FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,341,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,719,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24156284555842242,899.8917250276447,0.06273968520362949,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+535,SpatialTransformer-Proj_out535,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout535einsum,MXU,1,Compute,9873,9873,5638,0,0,0,0,0,0,0,0,9873,1225,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout535einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,9873,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,84.96513724298592,513.8810253500709,0.6687118539559432,0.5709789170556343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +536,Out536-GroupNorm,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Out536GroupNormXnormGroupNormX,VPU,1,Memory,5426,2724,5426,0,0,0,0,0,0,0,0,0,2724,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Out536GroupNormXnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2724,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1315,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9325027644673793,899.8917250276447,0.5019174816290359,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +537,Out537-Conv2d,"Conv2D(a=1x320x64x64,b=320x3x3x3,c=1x3x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Out537Conv2dconv2d,MXU,1,Compute,25124,25124,2756,0,0,0,0,0,0,0,0,25124,3131,0,0,2663296,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,3,3,3]","[DT_BFLOAT16:(1,3,64,64)]",70778880,Out537Conv2dconv2d,Conv2D,17280,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 3, 3, 3], [1, 3, 64, 
64]]",1,2829696,736,1,3,4096,2880,0,25124,2663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6603,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.8171819773921354,98.72582740340334,0.02217242323337369,0.10969536378155927,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.json b/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.json new file mode 100644 index 0000000..4b2fb9a --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.json @@ -0,0 +1,184 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 14442946, + "overlapped_compute_time_non_pp_ns": 3734525, + "compute_only_time_non_pp_ns": 10276028, + "memory_only_time_non_pp_ns": 432393, + "ici_bound_time_non_pp_ns": 0, + "total_execution_time_chip_ns": 14442946, + "overlapped_compute_time_chip_ns": 3734525, + "compute_only_time_chip_ns": 10276028, + "memory_only_time_chip_ns": 432393, + "ici_bound_time_chip_ns": 0, + "bounded_by_pp_chip": false, + "throughput_requests_per_sec": 1.7309487967344057, + "throughput_step_per_sec_per_request": 69.23795186937623, + "latency_sec": 0.57771784, + "latency_step_sec": 0.00036107365, + "mem_footprint_GB": 31.999999046325684, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "3", + "num_sa": 4, + "num_vu": 4, + "num_vu_ports": 2, + "hbm_bw_GBps": 900.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 32, + "freq_GHz": 0.94, + "sa_dim": 128, + "hbm_size_GB": 32, + "ici_bw_GBps": 164.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 450.0, + "min_power_W": 175.0, + "avg_power_W": 220.0, + "max_power_W": 262.0, + 
"HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 40.478, + "ICI_topology": "TORUS_2D", + "embodied_carbon_kgCO2": 311.8333333, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 2.9866666675, + "static_power_W_per_vu": 0.74127482825, + "static_power_vmem_W": 12.93490069, + "static_power_ici_W": 8.96, + "static_power_hbm_mc_W": 4.032, + "static_power_hbm_phy_W": 6.048, + "static_power_other_W": 37.11333333, + "dynamic_power_W_per_SA": 30.28855467, + "dynamic_power_W_per_VU": 2.8491776, + "dynamic_power_vmem_W": 29.830784, + "dynamic_power_ici_W_per_GBps": 0.0247047779, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "gligen", + "model_type": "gligen", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 2, + "num_tensor_parallel_axes": 0, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv", + "num_diffusion_steps": 1, + "total_num_diffusion_steps": 40, + "image_resolution": [ + 512, + 512 + ], + "image_num_channels": 3, + "use_flash_attention": true, + "fourier_embedder_config": { + "num_freqs": 64 + }, + "text_embedder_config": { + "d_model": 512, + "num_heads": 8, + "d_head": 64, + "d_ff": 2048, + "num_layers": 12, + "ffn_type": "default" + }, + "image_embedder_config": { + "model_type": "vit", + "patch_size": 2, + "d_model": 1024, + "num_heads": 16, + "d_head": 64, + "d_ff": 4096, + "num_layers": 24, + "ffn_type": "default" + }, + "spatial_condition_embedder_config": { + "model_type": "convnext", + "stem": { + "in_channels": 3, + "out_channels": 96, + "kernel_size": 4, + 
"stride": 4 + }, + "depths": [ + 3, + 3, + 9, + 3 + ], + "dims": [ + 96, + 192, + 384, + 768 + ] + }, + "grounding_input_config": { + "text": { + "input_seqlen": 512, + "feature_dim": 768 + }, + "bbox": { + "input_seqlen": 8, + "feature_dim": 4, + "grounding_token_feature_dim": 768 + }, + "image": { + "resolution": [ + 1024, + 1024 + ], + "image_num_channels": 3 + }, + "keypoint": { + "num_persons": 10, + "num_keypoints": 17, + "feature_dim": 256 + }, + "spatial_condition": { + "resolution": [ + 256, + 256 + ], + "num_channels": 1 + } + }, + "unet_config": { + "noisy_latent_resolution": [ + 64, + 64 + ], + "model_channels": 320, + "attention_resolutions": [ + 4, + 2, + 1 + ], + "num_res_blocks": 2, + "channel_mult": [ + 1, + 2, + 4, + 4 + ], + "num_heads": 8, + "context_dim": 768 + }, + "output_dir": "./llava_ops" + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv b/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv new file mode 100644 index 0000000..de046cf --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv @@ -0,0 +1,635 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM 
Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,Time-Embed-MLP-FFi2,"XlaEinsum(a=1x320,b=320x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPFFi2einsum,MXU,1,Memory,639,488,639,0,0,0,0,0,0,0,0,488,114,0,0,822400,"DT_BFLOAT16:[1,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,1280)]",819200,TimeEmbedMLPFFi2einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 320], [320, 1280], [1, 1280]]",1,822400,30,1,1,1280,320,0,488,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.282003129890454,1198.6223558118459,0.004585920078877826,0.9988519631765382,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+3,Time-Embed-MLP-FFo2,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPFFo2einsum,MXU,1,Memory,2548,1555,2548,0,0,0,0,0,0,0,0,1555,380,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPFFo2einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,1555,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,566,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.286028257456829,1199.578565181331,0.004600318572061116,0.9996488043177758,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,Conv2d5Conv2d,"Conv2D(a=1x3x64x64,b=3x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d5Conv2dconv2d,MXU,1,Memory,2067,1494,2067,0,0,0,0,0,0,0,0,1494,365,0,0,2663296,"DT_BFLOAT16:[1,3,64,64],DT_BFLOAT16:[3,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",70778880,Conv2d5Conv2dconv2d,Conv2D,17280,[],Conv2D,bf01;io01->bf01,"[[1, 3, 64, 64], [3, 320, 3, 3], [1, 320, 64, 64]]",1,2664856,96,1,320,4096,27,0,1494,2663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,517,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,34.24232220609579,1199.994043388053,0.122489991865899,0.9999950361567107,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,Time-Embed-MLP-Einsum6,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum6einsum,MXU,1,Memory,639,488,639,0,0,0,0,0,0,0,0,488,114,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum6einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 
320]]",1,658048,30,1,1,320,1280,0,488,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.282003129890454,1198.6223558118459,0.004585920078877826,0.9988519631765382,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Conv2d-GroupNorm7,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm7XnormGroupNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm7XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Conv2d7Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d7Conv2dconv2d,MXU,1,Compute,33677,33677,5500,0,0,0,0,0,0,0,0,33677,8411,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d7Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,2208,1,320,4096,2880,0,33677,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8802,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,224.18110876859578,195.96241558129435,0.8019299048785049,0.16330201298441197,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+9,Conv2d-GroupNorm9,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm9XnormGroupNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm9XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,Conv2d9Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d9Conv2dconv2d,MXU,1,Compute,33677,33677,5500,0,0,0,0,0,0,0,0,33677,8411,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d9Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,2208,1,320,4096,2880,0,33677,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8802,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,224.18110876859578,195.96241558129435,0.8019299048785049,0.16330201298441197,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+11,SpatialTransformer-Input_GroupNorm11,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm11XnormGroupNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm11XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +12,SpatialTransformer-Proj_in12,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin12einsum,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin12einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+13,BasicTransformerBlock-Input_layernorm13,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm13XnormLayerNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm13XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,SelfAttention14-Q-14,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention14Q14MatMulQ,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention14Q14MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,SelfAttention14-K-14,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention14K14MatMulK,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention14K14MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,SelfAttention14-V-14,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention14V14MatMulV,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention14V14MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,SelfAttention14-FlashAttention-15,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention14FlashAttention15FlashAttention,MXU,1,Compute,249722,249722,8139,0,0,0,0,0,0,0,0,249722,187244,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention14FlashAttention15FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,2762752,16384,8,4096,4096,40,124830,249722,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,62998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,19.348868774076774,39.10598585627218,0.0692138449164262,0.03258832154689348,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+16,SelfAttention14-Attention_output-16,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention14Attentionoutput16MatMulattnOutputattnAvgWo,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention14Attentionoutput16MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,SelfAttention14-Attention_layernorm-17,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention14Attentionlayernorm17YnormLayerNormy,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention14Attentionlayernorm17YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,GatedSelfAttention-Linear18,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear18XLinearcontext,MXU,1,Memory,500,305,500,0,0,0,0,0,0,0,0,305,68,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear18XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 
320]]",1,508928,18,1,8,320,768,0,305,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,111,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.86432,947.9522705078125,0.028131868131868135,0.7899602254231771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,GatedSelfAttention-Attn18-Q-19,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18Q19MatMulQ,MXU,1,Compute,4557,4557,4236,0,0,0,0,0,0,0,0,4557,1131,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn18Q19MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,4557,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1434,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,184.44134298880843,1115.4452724260068,0.6597747216575394,0.9295377270216724,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,GatedSelfAttention-Attn18-K-19,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18K19MatMulK,MXU,1,Compute,4557,4557,4236,0,0,0,0,0,0,0,0,4557,1131,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn18K19MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,4557,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1434,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,184.44134298880843,1115.4452724260068,0.6597747216575394,0.9295377270216724,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+19,GatedSelfAttention-Attn18-V-19,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18V19MatMulV,MXU,1,Compute,4557,4557,4236,0,0,0,0,0,0,0,0,4557,1131,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn18V19MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,4557,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1434,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,184.44134298880843,1115.4452724260068,0.6597747216575394,0.9295377270216724,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,GatedSelfAttention-Attn18-FlashAttention-20,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn18FlashAttention20FlashAttention,MXU,1,Compute,265570,265570,8154,0,0,0,0,0,0,0,0,265570,191695,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn18FlashAttention20FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 128]",,2768128,17424,8,4104,4104,40,125319,265570,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66961,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,18.265357562977748,36.84414085298838,0.06533796060474527,0.03070345071082365,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+21,GatedSelfAttention-Attn18-Attention_output-21,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18Attentionoutput21MatMulattnOutputattnAvgWo,MXU,1,Compute,4557,4557,4236,0,0,0,0,0,0,0,0,4557,1131,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn18Attentionoutput21MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,297,1,4104,320,320,0,4557,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1434,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,184.44134298880843,1115.4452724260068,0.6597747216575394,0.9295377270216724,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,GatedSelfAttention-Attn18-Attention_layernorm-22,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18Attentionlayernorm22YnormLayerNormy,VPU,1,Memory,4077,2443,4077,0,0,0,0,0,0,0,0,0,2443,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn18Attentionlayernorm22YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2443,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,895,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.576953642384106,1199.9875504449503,0.5991800693787449,0.999989625370792,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,GatedSelfAttention-FFN18Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN18FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,14660,14660,10809,0,0,0,0,0,0,0,0,14660,3657,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN18FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 
320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,14660,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.88425648021828,884.7183289989769,0.8187537791903414,0.737265274165814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,GatedSelfAttention-FFN18Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN18FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,14660,14660,10809,0,0,0,0,0,0,0,0,14660,3657,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN18FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,14660,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.88425648021828,884.7183289989769,0.8187537791903414,0.737265274165814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +25,GatedSelfAttention-FFN18Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN18FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,4070,305,4070,0,0,0,0,0,0,0,0,0,305,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN18FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,305,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,360,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32204422604422606,1199.708230958231,0.07488007488007489,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+26,BasicTransformerBlock-Fuser_output_layernorm26,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm26XnormLayerNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm26XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,CrossAttention27-Q-27,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention27Q27MatMulQ,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention27Q27MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,CrossAttention27-K-27,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention27K27MatMulK,MXU,1,Memory,1247,1128,1247,0,0,0,0,0,0,0,0,1128,274,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention27K27MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 
40]]",1,1605632,72,1,512,320,768,0,1128,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,201.81093825180434,1199.1670634522854,0.7219084043462554,0.9993058862102379,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,CrossAttention27-V-27,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention27V27MatMulV,MXU,1,Memory,1247,1128,1247,0,0,0,0,0,0,0,0,1128,274,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention27V27MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,1128,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,201.81093825180434,1199.1670634522854,0.7219084043462554,0.9993058862102379,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,CrossAttention27-FlashAttention-28,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention27FlashAttention28FlashAttention,MXU,1,Compute,31270,31270,4578,0,0,0,0,0,0,0,0,31270,23403,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention27FlashAttention28FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,354304,2048,8,512,4096,40,15603,31270,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,19.31499123760793,175.66882195394948,0.06909265981859523,0.14639068496162455,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+29,CrossAttention27-Attention_output-29,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention27Attentionoutput29MatMulattnOutputattnAvgWo,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention27Attentionoutput29MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +30,CrossAttention27-Attention_layernorm-30,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention27Attentionlayernorm30YnormLayerNormy,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention27Attentionlayernorm30YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+31,BasicTransformerBlock-Attn_output_layernorm31,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm31XnormLayerNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm31XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +32,BasicTransformerBlock-FFN32Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN32FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,14660,14660,10809,0,0,0,0,0,0,0,0,14660,3657,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN32FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,14660,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.88425648021828,884.7183289989769,0.8187537791903414,0.737265274165814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +33,BasicTransformerBlock-FFN32Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN32FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,14660,14660,10809,0,0,0,0,0,0,0,0,14660,3657,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN32FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 
1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,14660,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.88425648021828,884.7183289989769,0.8187537791903414,0.737265274165814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +34,BasicTransformerBlock-FFN32Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN32FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,4070,305,4070,0,0,0,0,0,0,0,0,0,305,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN32FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,305,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,360,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32204422604422606,1199.708230958231,0.07488007488007489,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +35,SpatialTransformer-Proj_out35,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout35einsum,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout35einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+36,Time-Embed-MLP-Einsum36,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum36einsum,MXU,1,Memory,639,488,639,0,0,0,0,0,0,0,0,488,114,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum36einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 320]]",1,658048,30,1,1,320,1280,0,488,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.282003129890454,1198.6223558118459,0.004585920078877826,0.9988519631765382,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +37,Conv2d-GroupNorm37,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm37XnormGroupNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm37XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +38,Conv2d37Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d37Conv2dconv2d,MXU,1,Compute,33677,33677,5500,0,0,0,0,0,0,0,0,33677,8411,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d37Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 
64]]",1,7252480,2208,1,320,4096,2880,0,33677,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8802,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,224.18110876859578,195.96241558129435,0.8019299048785049,0.16330201298441197,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +39,Conv2d-GroupNorm39,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm39XnormGroupNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm39XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +40,Conv2d39Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d39Conv2dconv2d,MXU,1,Compute,33677,33677,5500,0,0,0,0,0,0,0,0,33677,8411,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d39Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,2208,1,320,4096,2880,0,33677,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8802,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,224.18110876859578,195.96241558129435,0.8019299048785049,0.16330201298441197,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+41,SpatialTransformer-Input_GroupNorm41,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm41XnormGroupNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm41XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +42,SpatialTransformer-Proj_in42,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin42einsum,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin42einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+43,BasicTransformerBlock-Input_layernorm43,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm43XnormLayerNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm43XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +44,SelfAttention44-Q-44,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention44Q44MatMulQ,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention44Q44MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +44,SelfAttention44-K-44,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention44K44MatMulK,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention44K44MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +44,SelfAttention44-V-44,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention44V44MatMulV,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention44V44MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +45,SelfAttention44-FlashAttention-45,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention44FlashAttention45FlashAttention,MXU,1,Compute,249722,249722,8139,0,0,0,0,0,0,0,0,249722,187244,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention44FlashAttention45FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,2762752,16384,8,4096,4096,40,124830,249722,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,62998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,19.348868774076774,39.10598585627218,0.0692138449164262,0.03258832154689348,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+46,SelfAttention44-Attention_output-46,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention44Attentionoutput46MatMulattnOutputattnAvgWo,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention44Attentionoutput46MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +47,SelfAttention44-Attention_layernorm-47,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention44Attentionlayernorm47YnormLayerNormy,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention44Attentionlayernorm47YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +48,GatedSelfAttention-Linear48,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear48XLinearcontext,MXU,1,Memory,500,305,500,0,0,0,0,0,0,0,0,305,68,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear48XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 
320]]",1,508928,18,1,8,320,768,0,305,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,111,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.86432,947.9522705078125,0.028131868131868135,0.7899602254231771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +49,GatedSelfAttention-Attn48-Q-49,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48Q49MatMulQ,MXU,1,Compute,4557,4557,4236,0,0,0,0,0,0,0,0,4557,1131,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn48Q49MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,4557,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1434,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,184.44134298880843,1115.4452724260068,0.6597747216575394,0.9295377270216724,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +49,GatedSelfAttention-Attn48-K-49,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48K49MatMulK,MXU,1,Compute,4557,4557,4236,0,0,0,0,0,0,0,0,4557,1131,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn48K49MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,4557,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1434,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,184.44134298880843,1115.4452724260068,0.6597747216575394,0.9295377270216724,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+49,GatedSelfAttention-Attn48-V-49,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48V49MatMulV,MXU,1,Compute,4557,4557,4236,0,0,0,0,0,0,0,0,4557,1131,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn48V49MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,4557,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1434,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,184.44134298880843,1115.4452724260068,0.6597747216575394,0.9295377270216724,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +50,GatedSelfAttention-Attn48-FlashAttention-50,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn48FlashAttention50FlashAttention,MXU,1,Compute,265570,265570,8154,0,0,0,0,0,0,0,0,265570,191695,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn48FlashAttention50FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 128]",,2768128,17424,8,4104,4104,40,125319,265570,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66961,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,18.265357562977748,36.84414085298838,0.06533796060474527,0.03070345071082365,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+51,GatedSelfAttention-Attn48-Attention_output-51,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48Attentionoutput51MatMulattnOutputattnAvgWo,MXU,1,Compute,4557,4557,4236,0,0,0,0,0,0,0,0,4557,1131,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn48Attentionoutput51MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,297,1,4104,320,320,0,4557,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1434,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,184.44134298880843,1115.4452724260068,0.6597747216575394,0.9295377270216724,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +52,GatedSelfAttention-Attn48-Attention_layernorm-52,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48Attentionlayernorm52YnormLayerNormy,VPU,1,Memory,4077,2443,4077,0,0,0,0,0,0,0,0,0,2443,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn48Attentionlayernorm52YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2443,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,895,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.576953642384106,1199.9875504449503,0.5991800693787449,0.999989625370792,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +53,GatedSelfAttention-FFN48Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN48FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,14660,14660,10809,0,0,0,0,0,0,0,0,14660,3657,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN48FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 
320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,14660,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.88425648021828,884.7183289989769,0.8187537791903414,0.737265274165814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +54,GatedSelfAttention-FFN48Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN48FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,14660,14660,10809,0,0,0,0,0,0,0,0,14660,3657,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN48FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,14660,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.88425648021828,884.7183289989769,0.8187537791903414,0.737265274165814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +55,GatedSelfAttention-FFN48Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN48FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,4070,305,4070,0,0,0,0,0,0,0,0,0,305,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN48FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,305,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,360,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32204422604422606,1199.708230958231,0.07488007488007489,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+56,BasicTransformerBlock-Fuser_output_layernorm56,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm56XnormLayerNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm56XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +57,CrossAttention57-Q-57,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention57Q57MatMulQ,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention57Q57MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +57,CrossAttention57-K-57,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention57K57MatMulK,MXU,1,Memory,1247,1128,1247,0,0,0,0,0,0,0,0,1128,274,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention57K57MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 
40]]",1,1605632,72,1,512,320,768,0,1128,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,201.81093825180434,1199.1670634522854,0.7219084043462554,0.9993058862102379,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +57,CrossAttention57-V-57,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention57V57MatMulV,MXU,1,Memory,1247,1128,1247,0,0,0,0,0,0,0,0,1128,274,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention57V57MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,1128,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,201.81093825180434,1199.1670634522854,0.7219084043462554,0.9993058862102379,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +58,CrossAttention57-FlashAttention-58,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention57FlashAttention58FlashAttention,MXU,1,Compute,31270,31270,4578,0,0,0,0,0,0,0,0,31270,23403,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention57FlashAttention58FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,354304,2048,8,512,4096,40,15603,31270,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,19.31499123760793,175.66882195394948,0.06909265981859523,0.14639068496162455,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+59,CrossAttention57-Attention_output-59,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention57Attentionoutput59MatMulattnOutputattnAvgWo,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention57Attentionoutput59MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +60,CrossAttention57-Attention_layernorm-60,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention57Attentionlayernorm60YnormLayerNormy,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention57Attentionlayernorm60YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+61,BasicTransformerBlock-Attn_output_layernorm61,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm61XnormLayerNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm61XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +62,BasicTransformerBlock-FFN62Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN62FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,14660,14660,10809,0,0,0,0,0,0,0,0,14660,3657,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN62FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,14660,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.88425648021828,884.7183289989769,0.8187537791903414,0.737265274165814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +63,BasicTransformerBlock-FFN62Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN62FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,14660,14660,10809,0,0,0,0,0,0,0,0,14660,3657,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN62FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 
1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,14660,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.88425648021828,884.7183289989769,0.8187537791903414,0.737265274165814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +64,BasicTransformerBlock-FFN62Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN62FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,4070,305,4070,0,0,0,0,0,0,0,0,0,305,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN62FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,305,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,360,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32204422604422606,1199.708230958231,0.07488007488007489,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +65,SpatialTransformer-Proj_out65,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout65einsum,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout65einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +66,Downsample-Conv2d66Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=2x2 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",DownsampleConv2d66Conv2dconv2d,MXU,1,Compute,8442,8442,3974,0,0,0,0,0,0,0,0,8442,2102,0,0,5120000,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,32,32)]",1887436800,DownsampleConv2d66Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 32, 32]]",1,5286400,552,1,320,1024,2880,0,8442,5120000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2387,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,223.5769722814499,564.8390881344765,0.7997688168263863,0.4706992401120637,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +67,Time-Embed-MLP-Einsum67,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum67einsum,MXU,1,Memory,1275,793,1275,0,0,0,0,0,0,0,0,793,190,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum67einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1314048,50,1,1,640,1280,0,793,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,287,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2850196078431373,1199.5726940678614,0.004596710479063421,0.9996439117232179,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +68,Conv2d-GroupNorm68,"GroupNorm(x=1x320x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm68XnormGroupNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,320,32,32]","[DT_BFLOAT16:(1,320,32,32)]",2621440,Conv2dGroupNorm68XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+69,Conv2d68Conv2d,"Conv2D(a=1x320x32x32,b=320x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d68Conv2dconv2d,MXU,1,Compute,14050,14050,4387,0,0,0,0,0,0,0,0,14050,3504,0,0,5652480,"DT_BFLOAT16:[1,320,32,32],DT_BFLOAT16:[320,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",3774873600,Conv2d68Conv2dconv2d,Conv2D,3686400,[],Conv2D,bf01;io01->bf01,"[[1, 320, 32, 32], [320, 640, 3, 3], [1, 640, 32, 32]]",1,5736960,920,1,640,1024,2880,0,14050,5652480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3818,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,268.6742775800712,374.6820090080071,0.9610887333307264,0.3122350075066726,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +70,Conv2d-GroupNorm70,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm70XnormGroupNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm70XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +71,Conv2d70Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d70Conv2dconv2d,MXU,1,Compute,27460,27460,7757,0,0,0,0,0,0,0,0,27460,6857,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d70Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 
32]]",1,10163200,1800,1,640,1024,5760,0,27460,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7406,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.93616897305174,338.9607184313547,0.9834884707426589,0.2824672653594622,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +72,SkipConnection-Einsum67,"XlaEinsum(a=1x32x32x320,b=320x640,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum67einsum,MXU,1,Compute,1860,1860,1844,0,0,0,0,0,0,0,0,1860,457,0,0,2375680,"DT_BFLOAT16:[1,32,32,320],DT_BFLOAT16:[320,640]","[DT_BFLOAT16:(1,32,32,640)]",419430400,SkipConnectionEinsum67einsum,Einsum,409600,[],Einsum,"BHWC,CO->BHWO","[[1, 32, 32, 320], [320, 640], [1, 32, 32, 640]]",1,2375680,120,1,1024,640,320,0,1860,2375680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,593,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,225.50021505376344,1189.5292548723119,0.8066485485840326,0.9912743790602598,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +73,SpatialTransformer-Input_GroupNorm73,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm73XnormGroupNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm73XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+74,SpatialTransformer-Proj_in74,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin74einsum,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin74einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +75,BasicTransformerBlock-Input_layernorm75,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm75XnormLayerNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm75XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +76,SelfAttention76-Q-76,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention76Q76MatMulQ,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention76Q76MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +76,SelfAttention76-K-76,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention76K76MatMulK,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention76K76MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +76,SelfAttention76-V-76,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention76V76MatMulV,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention76V76MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +77,SelfAttention76-FlashAttention-77,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",SelfAttention76FlashAttention77FlashAttention,MXU,1,Compute,15666,15666,4070,0,0,0,0,0,0,0,0,15666,11701,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention76FlashAttention77FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 128]",,872448,1024,8,1024,1024,80,7801,15666,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,19.276770585982383,311.68214604876806,0.06895593873763159,0.2597351217073067,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +78,SelfAttention76-Attention_output-78,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention76Attentionoutput78MatMulattnOutputattnAvgWo,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention76Attentionoutput78MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+79,SelfAttention76-Attention_layernorm-79,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention76Attentionlayernorm79YnormLayerNormy,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention76Attentionlayernorm79YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +80,GatedSelfAttention-Linear80,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear80XLinearcontext,MXU,1,Memory,781,488,781,0,0,0,0,0,0,0,0,488,114,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear80XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 640]]",1,1005568,30,1,8,640,768,0,488,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,176,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.06955185659411,1199.1141852892927,0.03602031771045856,0.9992618210744105,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +81,GatedSelfAttention-Attn80-Q-81,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80Q81MatMulQ,MXU,1,Compute,3460,3460,2687,0,0,0,0,0,0,0,0,3460,857,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn80Q81MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 
80]]",1,3461120,225,1,1032,640,640,0,3460,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1052,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,244.3394219653179,931.6240431945448,0.8740392555421458,0.7763533693287873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +81,GatedSelfAttention-Attn80-K-81,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80K81MatMulK,MXU,1,Compute,3460,3460,2687,0,0,0,0,0,0,0,0,3460,857,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn80K81MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,3460,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1052,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,244.3394219653179,931.6240431945448,0.8740392555421458,0.7763533693287873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +81,GatedSelfAttention-Attn80-V-81,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80V81MatMulV,MXU,1,Compute,3460,3460,2687,0,0,0,0,0,0,0,0,3460,857,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn80V81MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,3460,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1052,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,244.3394219653179,931.6240431945448,0.8740392555421458,0.7763533693287873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +82,GatedSelfAttention-Attn80-FlashAttention-82,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",GatedSelfAttentionAttn80FlashAttention82FlashAttention,MXU,1,Compute,19810,19810,4101,0,0,0,0,0,0,0,0,19810,12860,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn80FlashAttention82FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 128]",,879104,1296,8,1032,1032,80,7924,19810,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5238,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.483438263503281,248.4078481906234,0.05538661237803086,0.20700654015885284,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +83,GatedSelfAttention-Attn80-Attention_output-83,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80Attentionoutput83MatMulattnOutputattnAvgWo,MXU,1,Compute,3460,3460,2687,0,0,0,0,0,0,0,0,3460,857,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn80Attentionoutput83MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,225,1,1032,640,640,0,3460,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1052,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,244.3394219653179,931.6240431945448,0.8740392555421458,0.7763533693287873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+84,GatedSelfAttention-Attn80-Attention_layernorm-84,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80Attentionlayernorm84YnormLayerNormy,VPU,1,Memory,2051,1229,2051,0,0,0,0,0,0,0,0,0,1229,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn80Attentionlayernorm84YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1229,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,450,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5762262311067774,1199.6488231731473,0.5990109354321934,0.9997073526442894,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +85,GatedSelfAttention-FFN80Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN80FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,12221,12221,7630,0,0,0,0,0,0,0,0,12221,3047,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN80FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,12221,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.5637181900008,749.1427409786434,0.9821561576737096,0.6242856174822028,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +86,GatedSelfAttention-FFN80Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN80FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,12221,12221,7630,0,0,0,0,0,0,0,0,12221,3047,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN80FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 
640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,12221,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.5637181900008,749.1427409786434,0.9821561576737096,0.6242856174822028,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +87,GatedSelfAttention-FFN80Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN80FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2035,153,2035,0,0,0,0,0,0,0,0,0,153,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN80FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,153,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32204422604422606,1199.708230958231,0.07488007488007489,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +88,BasicTransformerBlock-Fuser_output_layernorm88,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm88XnormLayerNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm88XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+89,CrossAttention89-Q-89,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention89Q89MatMulQ,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention89Q89MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +89,CrossAttention89-K-89,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention89K89MatMulK,MXU,1,Memory,1882,1860,1882,0,0,0,0,0,0,0,0,1860,457,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention89K89MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,1860,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,267.4370244420829,1199.9472801540915,0.9566628907755369,0.9999560667950762,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +89,CrossAttention89-V-89,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention89V89MatMulV,MXU,1,Memory,1882,1860,1882,0,0,0,0,0,0,0,0,1860,457,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention89V89MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 
80]]",1,2424832,120,1,512,640,768,0,1860,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,267.4370244420829,1199.9472801540915,0.9566628907755369,0.9999560667950762,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +90,CrossAttention89-FlashAttention-90,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention89FlashAttention90FlashAttention,MXU,1,Compute,7864,7864,3052,0,0,0,0,0,0,0,0,7864,5850,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention89FlashAttention90FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,446464,512,8,512,1024,80,3900,7864,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2178,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,19.200781281790437,465.68023588504576,0.06868411344504936,0.38806686323753814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +91,CrossAttention89-Attention_output-91,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention89Attentionoutput91MatMulattnOutputattnAvgWo,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention89Attentionoutput91MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+92,CrossAttention89-Attention_layernorm-92,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention89Attentionlayernorm92YnormLayerNormy,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention89Attentionlayernorm92YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +93,BasicTransformerBlock-Attn_output_layernorm93,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm93XnormLayerNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm93XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +94,BasicTransformerBlock-FFN94Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN94FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,12221,12221,7630,0,0,0,0,0,0,0,0,12221,3047,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN94FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 
2560]]",1,9830400,800,1,1024,2560,640,0,12221,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.5637181900008,749.1427409786434,0.9821561576737096,0.6242856174822028,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +95,BasicTransformerBlock-FFN94Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN94FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,12221,12221,7630,0,0,0,0,0,0,0,0,12221,3047,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN94FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,12221,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.5637181900008,749.1427409786434,0.9821561576737096,0.6242856174822028,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +96,BasicTransformerBlock-FFN94Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN94FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2035,153,2035,0,0,0,0,0,0,0,0,0,153,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN94FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,153,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32204422604422606,1199.708230958231,0.07488007488007489,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+97,SpatialTransformer-Proj_out97,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout97einsum,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout97einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +98,Time-Embed-MLP-Einsum98,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum98einsum,MXU,1,Memory,1275,793,1275,0,0,0,0,0,0,0,0,793,190,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum98einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1314048,50,1,1,640,1280,0,793,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,287,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2850196078431373,1199.5726940678614,0.004596710479063421,0.9996439117232179,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+99,Conv2d-GroupNorm99,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm99XnormGroupNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm99XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +100,Conv2d99Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d99Conv2dconv2d,MXU,1,Compute,27460,27460,7757,0,0,0,0,0,0,0,0,27460,6857,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d99Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,1800,1,640,1024,5760,0,27460,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7406,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.93616897305174,338.9607184313547,0.9834884707426589,0.2824672653594622,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +101,Conv2d-GroupNorm101,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm101XnormGroupNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm101XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+102,Conv2d101Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d101Conv2dconv2d,MXU,1,Compute,27460,27460,7757,0,0,0,0,0,0,0,0,27460,6857,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d101Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,1800,1,640,1024,5760,0,27460,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7406,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.93616897305174,338.9607184313547,0.9834884707426589,0.2824672653594622,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +103,SpatialTransformer-Input_GroupNorm103,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm103XnormGroupNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm103XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +104,SpatialTransformer-Proj_in104,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin104einsum,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin104einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 
640]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +105,BasicTransformerBlock-Input_layernorm105,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm105XnormLayerNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm105XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +106,SelfAttention106-Q-106,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention106Q106MatMulQ,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention106Q106MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+106,SelfAttention106-K-106,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention106K106MatMulK,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention106K106MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +106,SelfAttention106-V-106,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention106V106MatMulV,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention106V106MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +107,SelfAttention106-FlashAttention-107,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention106FlashAttention107FlashAttention,MXU,1,Compute,15666,15666,4070,0,0,0,0,0,0,0,0,15666,11701,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention106FlashAttention107FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 
128]",,872448,1024,8,1024,1024,80,7801,15666,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,19.276770585982383,311.68214604876806,0.06895593873763159,0.2597351217073067,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +108,SelfAttention106-Attention_output-108,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention106Attentionoutput108MatMulattnOutputattnAvgWo,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention106Attentionoutput108MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +109,SelfAttention106-Attention_layernorm-109,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention106Attentionlayernorm109YnormLayerNormy,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention106Attentionlayernorm109YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+110,GatedSelfAttention-Linear110,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear110XLinearcontext,MXU,1,Memory,781,488,781,0,0,0,0,0,0,0,0,488,114,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear110XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 640]]",1,1005568,30,1,8,640,768,0,488,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,176,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.06955185659411,1199.1141852892927,0.03602031771045856,0.9992618210744105,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +111,GatedSelfAttention-Attn110-Q-111,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110Q111MatMulQ,MXU,1,Compute,3460,3460,2687,0,0,0,0,0,0,0,0,3460,857,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn110Q111MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,3460,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1052,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,244.3394219653179,931.6240431945448,0.8740392555421458,0.7763533693287873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +111,GatedSelfAttention-Attn110-K-111,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110K111MatMulK,MXU,1,Compute,3460,3460,2687,0,0,0,0,0,0,0,0,3460,857,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn110K111MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 
80]]",1,3461120,225,1,1032,640,640,0,3460,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1052,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,244.3394219653179,931.6240431945448,0.8740392555421458,0.7763533693287873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +111,GatedSelfAttention-Attn110-V-111,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110V111MatMulV,MXU,1,Compute,3460,3460,2687,0,0,0,0,0,0,0,0,3460,857,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn110V111MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,3460,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1052,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,244.3394219653179,931.6240431945448,0.8740392555421458,0.7763533693287873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +112,GatedSelfAttention-Attn110-FlashAttention-112,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn110FlashAttention112FlashAttention,MXU,1,Compute,19810,19810,4101,0,0,0,0,0,0,0,0,19810,12860,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn110FlashAttention112FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 128]",,879104,1296,8,1032,1032,80,7924,19810,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5238,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.483438263503281,248.4078481906234,0.05538661237803086,0.20700654015885284,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+113,GatedSelfAttention-Attn110-Attention_output-113,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110Attentionoutput113MatMulattnOutputattnAvgWo,MXU,1,Compute,3460,3460,2687,0,0,0,0,0,0,0,0,3460,857,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn110Attentionoutput113MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,225,1,1032,640,640,0,3460,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1052,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,244.3394219653179,931.6240431945448,0.8740392555421458,0.7763533693287873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +114,GatedSelfAttention-Attn110-Attention_layernorm-114,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110Attentionlayernorm114YnormLayerNormy,VPU,1,Memory,2051,1229,2051,0,0,0,0,0,0,0,0,0,1229,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn110Attentionlayernorm114YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1229,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,450,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5762262311067774,1199.6488231731473,0.5990109354321934,0.9997073526442894,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+115,GatedSelfAttention-FFN110Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN110FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,12221,12221,7630,0,0,0,0,0,0,0,0,12221,3047,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN110FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,12221,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.5637181900008,749.1427409786434,0.9821561576737096,0.6242856174822028,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +116,GatedSelfAttention-FFN110Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN110FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,12221,12221,7630,0,0,0,0,0,0,0,0,12221,3047,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN110FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,12221,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.5637181900008,749.1427409786434,0.9821561576737096,0.6242856174822028,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+117,GatedSelfAttention-FFN110Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN110FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2035,153,2035,0,0,0,0,0,0,0,0,0,153,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN110FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,153,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32204422604422606,1199.708230958231,0.07488007488007489,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +118,BasicTransformerBlock-Fuser_output_layernorm118,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm118XnormLayerNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm118XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +119,CrossAttention119-Q-119,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention119Q119MatMulQ,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention119Q119MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +119,CrossAttention119-K-119,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention119K119MatMulK,MXU,1,Memory,1882,1860,1882,0,0,0,0,0,0,0,0,1860,457,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention119K119MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,1860,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,267.4370244420829,1199.9472801540915,0.9566628907755369,0.9999560667950762,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +119,CrossAttention119-V-119,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention119V119MatMulV,MXU,1,Memory,1882,1860,1882,0,0,0,0,0,0,0,0,1860,457,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention119V119MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,1860,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,267.4370244420829,1199.9472801540915,0.9566628907755369,0.9999560667950762,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +120,CrossAttention119-FlashAttention-120,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention119FlashAttention120FlashAttention,MXU,1,Compute,7864,7864,3052,0,0,0,0,0,0,0,0,7864,5850,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention119FlashAttention120FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,446464,512,8,512,1024,80,3900,7864,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2178,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,19.200781281790437,465.68023588504576,0.06868411344504936,0.38806686323753814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +121,CrossAttention119-Attention_output-121,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention119Attentionoutput121MatMulattnOutputattnAvgWo,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention119Attentionoutput121MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+122,CrossAttention119-Attention_layernorm-122,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention119Attentionlayernorm122YnormLayerNormy,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention119Attentionlayernorm122YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +123,BasicTransformerBlock-Attn_output_layernorm123,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm123XnormLayerNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm123XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +124,BasicTransformerBlock-FFN124Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN124FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,12221,12221,7630,0,0,0,0,0,0,0,0,12221,3047,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN124FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 
2560]]",1,9830400,800,1,1024,2560,640,0,12221,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.5637181900008,749.1427409786434,0.9821561576737096,0.6242856174822028,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +125,BasicTransformerBlock-FFN124Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN124FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,12221,12221,7630,0,0,0,0,0,0,0,0,12221,3047,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN124FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,12221,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.5637181900008,749.1427409786434,0.9821561576737096,0.6242856174822028,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +126,BasicTransformerBlock-FFN124Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN124FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2035,153,2035,0,0,0,0,0,0,0,0,0,153,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN124FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,153,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32204422604422606,1199.708230958231,0.07488007488007489,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+127,SpatialTransformer-Proj_out127,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout127einsum,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout127einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +128,Downsample-Conv2d128Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=2x2 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",DownsampleConv2d128Conv2dconv2d,MXU,1,Memory,6994,6888,6994,0,0,0,0,0,0,0,0,6888,1714,0,0,9011200,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,16,16)]",1887436800,DownsampleConv2d128Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 16, 16]]",1,9180160,450,1,640,256,5760,0,6888,9011200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2209,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,269.86514154989993,1199.9333692271948,0.9653486347795758,0.9999444743559956,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +129,Time-Embed-MLP-Einsum129,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum129einsum,MXU,1,Memory,2548,1555,2548,0,0,0,0,0,0,0,0,1555,380,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum129einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 
1280]]",1,2626048,100,1,1,1280,1280,0,1555,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,566,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.286028257456829,1199.578565181331,0.004600318572061116,0.9996488043177758,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +130,Conv2d-GroupNorm130,"GroupNorm(x=1x640x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm130XnormGroupNormX,VPU,1,Memory,509,305,509,0,0,0,0,0,0,0,0,0,305,0,0,655360,"DT_BFLOAT16:[1,640,16,16]","[DT_BFLOAT16:(1,640,16,16)]",1310720,Conv2dGroupNorm130XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,305,655360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,111,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +131,Conv2d130Conv2d,"Conv2D(a=1x640x16x16,b=640x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d130Conv2dconv2d,MXU,1,Compute,13745,13745,12208,0,0,0,0,0,0,0,0,13745,3428,0,0,15728640,"DT_BFLOAT16:[1,640,16,16],DT_BFLOAT16:[640,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",3774873600,Conv2d130Conv2dconv2d,Conv2D,14745600,[],Conv2D,bf01;io01->bf01,"[[1, 640, 16, 16], [640, 1280, 3, 3], [1, 1280, 16, 16]]",1,15815680,900,1,1280,256,5760,0,13745,15728640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4287,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.63612950163696,1065.7284467078937,0.9824151839430124,0.8881070389232447,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+132,Conv2d-GroupNorm132,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm132XnormGroupNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm132XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +133,Conv2d132Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d132Conv2dconv2d,MXU,1,Compute,27460,27460,23906,0,0,0,0,0,0,0,0,27460,6857,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d132Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,27460,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8532,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.93616897305174,1044.665820739257,0.9834884707426589,0.8705548506160475,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +134,SkipConnection-Einsum129,"XlaEinsum(a=1x16x16x640,b=640x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum129einsum,MXU,1,Memory,2035,1555,2035,0,0,0,0,0,0,0,0,1555,380,0,0,2621440,"DT_BFLOAT16:[1,16,16,640],DT_BFLOAT16:[640,1280]","[DT_BFLOAT16:(1,16,16,1280)]",419430400,SkipConnectionEinsum129einsum,Einsum,1638400,[],Einsum,"BHWC,CO->BHWO","[[1, 16, 16, 640], [640, 1280], [1, 16, 16, 
1280]]",1,2621440,100,1,256,1280,640,0,1555,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,530,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,206.10830466830467,1199.708230958231,0.7372807372807374,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +135,SpatialTransformer-Input_GroupNorm135,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm135XnormGroupNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm135XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +136,SpatialTransformer-Proj_in136,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin136einsum,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin136einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+137,BasicTransformerBlock-Input_layernorm137,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm137XnormLayerNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm137XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +138,SelfAttention138-Q-138,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention138Q138MatMulQ,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention138Q138MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +138,SelfAttention138-K-138,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention138K138MatMulK,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention138K138MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +138,SelfAttention138-V-138,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention138V138MatMulV,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention138V138MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +139,SelfAttention138-FlashAttention-139,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention138FlashAttention139FlashAttention,MXU,1,Memory,2035,2012,2035,0,0,0,0,0,0,0,0,2012,973,0,0,2621440,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention138FlashAttention139FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 128]",,335872,128,8,256,256,160,487,2012,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,644,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,9.274873710073711,1199.708230958231,0.033177633177633184,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+140,SelfAttention138-Attention_output-140,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention138Attentionoutput140MatMulattnOutputattnAvgWo,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention138Attentionoutput140MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +141,SelfAttention138-Attention_layernorm-141,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention138Attentionlayernorm141YnormLayerNormy,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention138Attentionlayernorm141YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +142,GatedSelfAttention-Linear142,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear142XLinearcontext,MXU,1,Memory,1552,945,1552,0,0,0,0,0,0,0,0,945,228,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear142XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 
1280]]",1,1998848,60,1,8,1280,768,0,945,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,344,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.13443298969072,1199.4666659954896,0.03625240738642801,0.9995555549962414,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +143,GatedSelfAttention-Attn142-Q-143,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142Q143MatMulQ,MXU,1,Compute,4602,4602,3593,0,0,0,0,0,0,0,0,4602,1142,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn142Q143MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,4602,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1401,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.97809647979142,936.6814233281725,0.6724262265331368,0.7805678527734771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +143,GatedSelfAttention-Attn142-K-143,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142K143MatMulK,MXU,1,Compute,4602,4602,3593,0,0,0,0,0,0,0,0,4602,1142,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn142K143MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,4602,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1401,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.97809647979142,936.6814233281725,0.6724262265331368,0.7805678527734771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+143,GatedSelfAttention-Attn142-V-143,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142V143MatMulV,MXU,1,Compute,4602,4602,3593,0,0,0,0,0,0,0,0,4602,1142,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn142V143MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,4602,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1401,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.97809647979142,936.6814233281725,0.6724262265331368,0.7805678527734771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +144,GatedSelfAttention-Attn142-FlashAttention-144,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn142FlashAttention144FlashAttention,MXU,1,Compute,4450,4450,2099,0,0,0,0,0,0,0,0,4450,1615,0,0,2703360,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn142FlashAttention144FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 128]",,345088,288,8,264,264,160,519,4450,2703360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1258,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.510662471910113,565.7753247893259,0.0161353253488085,0.4714794373244382,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+145,GatedSelfAttention-Attn142-Attention_output-145,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142Attentionoutput145MatMulattnOutputattnAvgWo,MXU,1,Compute,4602,4602,3593,0,0,0,0,0,0,0,0,4602,1142,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn142Attentionoutput145MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,3837952,300,1,264,1280,1280,0,4602,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1401,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.97809647979142,936.6814233281725,0.6724262265331368,0.7805678527734771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +146,GatedSelfAttention-Attn142-Attention_layernorm-146,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142Attentionlayernorm146YnormLayerNormy,VPU,1,Memory,1050,629,1050,0,0,0,0,0,0,0,0,0,629,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn142Attentionlayernorm146YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,629,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5746285714285713,1198.9048549107142,0.5986394557823129,0.9990873790922619,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+147,GatedSelfAttention-FFN142Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN142FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,12716,12221,12716,0,0,0,0,0,0,0,0,12221,3047,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN142FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,12221,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,263.87568417741426,1199.967683430324,0.9439234352729162,0.9999730695252701,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +148,GatedSelfAttention-FFN142Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN142FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,12716,12221,12716,0,0,0,0,0,0,0,0,12221,3047,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN142FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,12221,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,263.87568417741426,1199.967683430324,0.9439234352729162,0.9999730695252701,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+149,GatedSelfAttention-FFN142Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN142FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1018,77,1018,0,0,0,0,0,0,0,0,0,77,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN142FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,77,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,90,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32188605108055013,1199.1189833005894,0.07484329684722613,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +150,BasicTransformerBlock-Fuser_output_layernorm150,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm150XnormLayerNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm150XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +151,CrossAttention151-Q-151,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention151Q151MatMulQ,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention151Q151MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +151,CrossAttention151-K-151,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention151K151MatMulK,MXU,1,Compute,3688,3688,3154,0,0,0,0,0,0,0,0,3688,914,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention151K151MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,3688,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1142,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.94819956616055,1026.079090970716,0.9763772019737315,0.8550659091422633,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +151,CrossAttention151-V-151,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention151V151MatMulV,MXU,1,Compute,3688,3688,3154,0,0,0,0,0,0,0,0,3688,914,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention151V151MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,3688,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1142,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.94819956616055,1026.079090970716,0.9763772019737315,0.8550659091422633,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +152,CrossAttention151-FlashAttention-152,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention151FlashAttention152FlashAttention,MXU,1,Compute,3962,3962,3052,0,0,0,0,0,0,0,0,3962,1949,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention151FlashAttention152FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,256,8,512,256,160,975,3962,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,9.52769712266532,924.3082723372034,0.03408202095733646,0.7702568936143362,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +153,CrossAttention151-Attention_output-153,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention151Attentionoutput153MatMulattnOutputattnAvgWo,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention151Attentionoutput153MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+154,CrossAttention151-Attention_layernorm-154,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention151Attentionlayernorm154YnormLayerNormy,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention151Attentionlayernorm154YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +155,BasicTransformerBlock-Attn_output_layernorm155,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm155XnormLayerNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm155XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +156,BasicTransformerBlock-FFN156Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN156FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,12716,12221,12716,0,0,0,0,0,0,0,0,12221,3047,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN156FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 
5120]]",1,13631488,800,1,256,5120,1280,0,12221,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,263.87568417741426,1199.967683430324,0.9439234352729162,0.9999730695252701,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +157,BasicTransformerBlock-FFN156Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN156FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,12716,12221,12716,0,0,0,0,0,0,0,0,12221,3047,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN156FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,12221,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,263.87568417741426,1199.967683430324,0.9439234352729162,0.9999730695252701,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +158,BasicTransformerBlock-FFN156Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN156FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1018,77,1018,0,0,0,0,0,0,0,0,0,77,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN156FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,77,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,90,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32188605108055013,1199.1189833005894,0.07484329684722613,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+159,SpatialTransformer-Proj_out159,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout159einsum,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout159einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +160,Time-Embed-MLP-Einsum160,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum160einsum,MXU,1,Memory,2548,1555,2548,0,0,0,0,0,0,0,0,1555,380,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum160einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,1555,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,566,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.286028257456829,1199.578565181331,0.004600318572061116,0.9996488043177758,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+161,Conv2d-GroupNorm161,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm161XnormGroupNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm161XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +162,Conv2d161Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d161Conv2dconv2d,MXU,1,Compute,27460,27460,23906,0,0,0,0,0,0,0,0,27460,6857,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d161Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,27460,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8532,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.93616897305174,1044.665820739257,0.9834884707426589,0.8705548506160475,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+163,Conv2d-GroupNorm163,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm163XnormGroupNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm163XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +164,Conv2d163Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d163Conv2dconv2d,MXU,1,Compute,27460,27460,23906,0,0,0,0,0,0,0,0,27460,6857,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d163Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,27460,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8532,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.93616897305174,1044.665820739257,0.9834884707426589,0.8705548506160475,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+165,SpatialTransformer-Input_GroupNorm165,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm165XnormGroupNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm165XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +166,SpatialTransformer-Proj_in166,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin166einsum,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin166einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+167,BasicTransformerBlock-Input_layernorm167,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm167XnormLayerNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm167XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +168,SelfAttention168-Q-168,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention168Q168MatMulQ,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention168Q168MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +168,SelfAttention168-K-168,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention168K168MatMulK,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention168K168MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +168,SelfAttention168-V-168,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention168V168MatMulV,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention168V168MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +169,SelfAttention168-FlashAttention-169,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention168FlashAttention169FlashAttention,MXU,1,Memory,2035,2012,2035,0,0,0,0,0,0,0,0,2012,973,0,0,2621440,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention168FlashAttention169FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 128]",,335872,128,8,256,256,160,487,2012,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,644,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,9.274873710073711,1199.708230958231,0.033177633177633184,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+170,SelfAttention168-Attention_output-170,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention168Attentionoutput170MatMulattnOutputattnAvgWo,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention168Attentionoutput170MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +171,SelfAttention168-Attention_layernorm-171,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention168Attentionlayernorm171YnormLayerNormy,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention168Attentionlayernorm171YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +172,GatedSelfAttention-Linear172,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear172XLinearcontext,MXU,1,Memory,1552,945,1552,0,0,0,0,0,0,0,0,945,228,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear172XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 
1280]]",1,1998848,60,1,8,1280,768,0,945,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,344,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.13443298969072,1199.4666659954896,0.03625240738642801,0.9995555549962414,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +173,GatedSelfAttention-Attn172-Q-173,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172Q173MatMulQ,MXU,1,Compute,4602,4602,3593,0,0,0,0,0,0,0,0,4602,1142,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn172Q173MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,4602,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1401,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.97809647979142,936.6814233281725,0.6724262265331368,0.7805678527734771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +173,GatedSelfAttention-Attn172-K-173,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172K173MatMulK,MXU,1,Compute,4602,4602,3593,0,0,0,0,0,0,0,0,4602,1142,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn172K173MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,4602,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1401,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.97809647979142,936.6814233281725,0.6724262265331368,0.7805678527734771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+173,GatedSelfAttention-Attn172-V-173,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172V173MatMulV,MXU,1,Compute,4602,4602,3593,0,0,0,0,0,0,0,0,4602,1142,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn172V173MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,4602,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1401,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.97809647979142,936.6814233281725,0.6724262265331368,0.7805678527734771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +174,GatedSelfAttention-Attn172-FlashAttention-174,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn172FlashAttention174FlashAttention,MXU,1,Compute,4450,4450,2099,0,0,0,0,0,0,0,0,4450,1615,0,0,2703360,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn172FlashAttention174FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 128]",,345088,288,8,264,264,160,519,4450,2703360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1258,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.510662471910113,565.7753247893259,0.0161353253488085,0.4714794373244382,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+175,GatedSelfAttention-Attn172-Attention_output-175,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172Attentionoutput175MatMulattnOutputattnAvgWo,MXU,1,Compute,4602,4602,3593,0,0,0,0,0,0,0,0,4602,1142,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn172Attentionoutput175MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,3837952,300,1,264,1280,1280,0,4602,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1401,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.97809647979142,936.6814233281725,0.6724262265331368,0.7805678527734771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +176,GatedSelfAttention-Attn172-Attention_layernorm-176,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172Attentionlayernorm176YnormLayerNormy,VPU,1,Memory,1050,629,1050,0,0,0,0,0,0,0,0,0,629,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn172Attentionlayernorm176YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,629,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5746285714285713,1198.9048549107142,0.5986394557823129,0.9990873790922619,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+177,GatedSelfAttention-FFN172Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN172FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,12716,12221,12716,0,0,0,0,0,0,0,0,12221,3047,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN172FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,12221,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,263.87568417741426,1199.967683430324,0.9439234352729162,0.9999730695252701,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +178,GatedSelfAttention-FFN172Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN172FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,12716,12221,12716,0,0,0,0,0,0,0,0,12221,3047,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN172FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,12221,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,263.87568417741426,1199.967683430324,0.9439234352729162,0.9999730695252701,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+179,GatedSelfAttention-FFN172Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN172FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1018,77,1018,0,0,0,0,0,0,0,0,0,77,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN172FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,77,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,90,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32188605108055013,1199.1189833005894,0.07484329684722613,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +180,BasicTransformerBlock-Fuser_output_layernorm180,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm180XnormLayerNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm180XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +181,CrossAttention181-Q-181,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention181Q181MatMulQ,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention181Q181MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +181,CrossAttention181-K-181,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention181K181MatMulK,MXU,1,Compute,3688,3688,3154,0,0,0,0,0,0,0,0,3688,914,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention181K181MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,3688,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1142,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.94819956616055,1026.079090970716,0.9763772019737315,0.8550659091422633,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +181,CrossAttention181-V-181,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention181V181MatMulV,MXU,1,Compute,3688,3688,3154,0,0,0,0,0,0,0,0,3688,914,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention181V181MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,3688,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1142,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.94819956616055,1026.079090970716,0.9763772019737315,0.8550659091422633,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +182,CrossAttention181-FlashAttention-182,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention181FlashAttention182FlashAttention,MXU,1,Compute,3962,3962,3052,0,0,0,0,0,0,0,0,3962,1949,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention181FlashAttention182FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,256,8,512,256,160,975,3962,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,9.52769712266532,924.3082723372034,0.03408202095733646,0.7702568936143362,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +183,CrossAttention181-Attention_output-183,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention181Attentionoutput183MatMulattnOutputattnAvgWo,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention181Attentionoutput183MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+184,CrossAttention181-Attention_layernorm-184,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention181Attentionlayernorm184YnormLayerNormy,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention181Attentionlayernorm184YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +185,BasicTransformerBlock-Attn_output_layernorm185,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm185XnormLayerNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm185XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +186,BasicTransformerBlock-FFN186Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN186FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,12716,12221,12716,0,0,0,0,0,0,0,0,12221,3047,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN186FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 
5120]]",1,13631488,800,1,256,5120,1280,0,12221,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,263.87568417741426,1199.967683430324,0.9439234352729162,0.9999730695252701,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +187,BasicTransformerBlock-FFN186Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN186FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,12716,12221,12716,0,0,0,0,0,0,0,0,12221,3047,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN186FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,12221,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,263.87568417741426,1199.967683430324,0.9439234352729162,0.9999730695252701,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +188,BasicTransformerBlock-FFN186Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN186FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1018,77,1018,0,0,0,0,0,0,0,0,0,77,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN186FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,77,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,90,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32188605108055013,1199.1189833005894,0.07484329684722613,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+189,SpatialTransformer-Proj_out189,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout189einsum,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout189einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +190,Downsample-Conv2d190Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=2x2 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",DownsampleConv2d190Conv2dconv2d,MXU,1,Memory,23524,13745,23524,0,0,0,0,0,0,0,0,13745,3428,0,0,30310400,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,DownsampleConv2d190Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,24420352,900,1,1280,64,11520,0,13745,30310400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5077,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,80.234517939126,1199.998289645681,0.2870110674905778,0.9999985747047343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +191,Time-Embed-MLP-Einsum191,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum191einsum,MXU,1,Memory,2548,1555,2548,0,0,0,0,0,0,0,0,1555,380,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum191einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 
1280]]",1,2626048,100,1,1,1280,1280,0,1555,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,566,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.286028257456829,1199.578565181331,0.004600318572061116,0.9996488043177758,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +192,Conv2d-GroupNorm192,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm192XnormGroupNormX,VPU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,0,153,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm192XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,153,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.3047619047619048,0.5086263020833334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +193,Conv2d192Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d192Conv2dconv2d,MXU,1,Memory,23143,13745,23143,0,0,0,0,0,0,0,0,13745,3428,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d192Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,13745,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5050,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,81.55540768266863,1199.9739054465713,0.29173609089782454,0.9999782545388094,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+194,Conv2d-GroupNorm194,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm194XnormGroupNormX,VPU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,0,153,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm194XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,153,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.3047619047619048,0.5086263020833334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +195,Conv2d194Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d194Conv2dconv2d,MXU,1,Memory,23143,13745,23143,0,0,0,0,0,0,0,0,13745,3428,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d194Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,13745,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5050,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,81.55540768266863,1199.9739054465713,0.29173609089782454,0.9999782545388094,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +196,Time-Embed-MLP-Einsum196,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum196einsum,MXU,1,Memory,2548,1555,2548,0,0,0,0,0,0,0,0,1555,380,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum196einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 
1280]]",1,2626048,100,1,1,1280,1280,0,1555,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,566,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.286028257456829,1199.578565181331,0.004600318572061116,0.9996488043177758,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +197,Conv2d-GroupNorm197,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm197XnormGroupNormX,VPU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,0,153,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm197XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,153,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.3047619047619048,0.5086263020833334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +198,Conv2d197Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d197Conv2dconv2d,MXU,1,Memory,23143,13745,23143,0,0,0,0,0,0,0,0,13745,3428,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d197Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,13745,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5050,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,81.55540768266863,1199.9739054465713,0.29173609089782454,0.9999782545388094,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+199,Conv2d-GroupNorm199,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm199XnormGroupNormX,VPU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,0,153,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm199XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,153,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.3047619047619048,0.5086263020833334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +200,Conv2d199Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d199Conv2dconv2d,MXU,1,Memory,23143,13745,23143,0,0,0,0,0,0,0,0,13745,3428,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d199Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,13745,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5050,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,81.55540768266863,1199.9739054465713,0.29173609089782454,0.9999782545388094,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +201,Time-Embed-MLP-Einsum201,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum201einsum,MXU,1,Memory,2548,1555,2548,0,0,0,0,0,0,0,0,1555,380,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum201einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 
1280]]",1,2626048,100,1,1,1280,1280,0,1555,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,566,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.286028257456829,1199.578565181331,0.004600318572061116,0.9996488043177758,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +202,Conv2d-GroupNorm202,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm202XnormGroupNormX,VPU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,0,153,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm202XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,153,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.3047619047619048,0.5086263020833334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +203,Conv2d202Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d202Conv2dconv2d,MXU,1,Memory,23143,13745,23143,0,0,0,0,0,0,0,0,13745,3428,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d202Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,13745,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5050,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,81.55540768266863,1199.9739054465713,0.29173609089782454,0.9999782545388094,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+204,Conv2d-GroupNorm204,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm204XnormGroupNormX,VPU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,0,153,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm204XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,153,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.3047619047619048,0.5086263020833334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +205,Conv2d204Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d204Conv2dconv2d,MXU,1,Memory,23143,13745,23143,0,0,0,0,0,0,0,0,13745,3428,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d204Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,13745,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5050,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,81.55540768266863,1199.9739054465713,0.29173609089782454,0.9999782545388094,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +206,SpatialTransformer-Input_GroupNorm206,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm206XnormGroupNormX,VPU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,0,153,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,SpatialTransformerInputGroupNorm206XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,153,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.3047619047619048,0.5086263020833334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+207,SpatialTransformer-Proj_in207,"XlaEinsum(a=1x64x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin207einsum,MXU,1,Memory,2798,1555,2798,0,0,0,0,0,0,0,0,1555,380,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,64,1280)]",209715200,SpatialTransformerProjin207einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 64, 1280], [1280, 1280], [1, 64, 1280]]",1,2916352,100,1,64,1280,1280,0,1555,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,583,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,74.9518227305218,1199.7618276447463,0.26811406368232676,0.9998015230372885,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +208,BasicTransformerBlock-Input_layernorm208,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm208XnormLayerNormX,VPU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,0,153,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,BasicTransformerBlockInputlayernorm208XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,153,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.3047619047619048,0.5086263020833334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +209,SelfAttention209-Q-209,"XlaEinsum(a=1x64x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention209Q209MatMulQ,MXU,1,Memory,2798,1555,2798,0,0,0,0,0,0,0,0,1555,380,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,64,8,160)]",209715200,SelfAttention209Q209MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 64, 1280], [1280, 8, 160], [1, 64, 8, 
160]]",1,2916352,100,1,64,1280,1280,0,1555,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,583,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,74.9518227305218,1199.7618276447463,0.26811406368232676,0.9998015230372885,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +209,SelfAttention209-K-209,"XlaEinsum(a=1x64x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention209K209MatMulK,MXU,1,Memory,2798,1555,2798,0,0,0,0,0,0,0,0,1555,380,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,64,8,160)]",209715200,SelfAttention209K209MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 64, 1280], [1280, 8, 160], [1, 64, 8, 160]]",1,2916352,100,1,64,1280,1280,0,1555,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,583,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,74.9518227305218,1199.7618276447463,0.26811406368232676,0.9998015230372885,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +209,SelfAttention209-V-209,"XlaEinsum(a=1x64x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention209V209MatMulV,MXU,1,Memory,2798,1555,2798,0,0,0,0,0,0,0,0,1555,380,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,64,8,160)]",209715200,SelfAttention209V209MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 64, 1280], [1280, 8, 160], [1, 64, 8, 160]]",1,2916352,100,1,64,1280,1280,0,1555,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,583,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,74.9518227305218,1199.7618276447463,0.26811406368232676,0.9998015230372885,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +210,SelfAttention209-FlashAttention-210,"FlashAttention(q=1x64x8x160,k=1x64x8x160,v=1x64x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",SelfAttention209FlashAttention210FlashAttention,MXU,1,Memory,8647,550,8647,0,0,0,0,0,0,0,0,550,150,0,0,11141120,"DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[1,64,8,160]","[DT_BFLOAT16:(1,64,8,8)]",1179648,SelfAttention209FlashAttention210FlashAttention,FlashAttention,0,[],FlashAttention,,"[64, 64]",,11141120,32,8,64,64,160,30,550,11141120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,740,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.13642280559731698,1199.9510307042906,0.00048800511388692264,0.9999591922535754,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +211,SelfAttention209-Attention_output-211,"XlaEinsum(a=1x64x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention209Attentionoutput211MatMulattnOutputattnAvgWo,MXU,1,Memory,2798,1555,2798,0,0,0,0,0,0,0,0,1555,380,0,0,3604480,"DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,64,1280)]",209715200,SelfAttention209Attentionoutput211MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 64, 8, 160], [8, 160, 1280], [1, 64, 1280]]",1,2916352,100,1,64,1280,1280,0,1555,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,583,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,74.9518227305218,1199.7618276447463,0.26811406368232676,0.9998015230372885,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+212,SelfAttention209-Attention_layernorm-212,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention209Attentionlayernorm212YnormLayerNormy,VPU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,0,153,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,SelfAttention209Attentionlayernorm212YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,153,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.3047619047619048,0.5086263020833334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +213,GatedSelfAttention-Linear213,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear213XLinearcontext,MXU,1,Memory,1552,945,1552,0,0,0,0,0,0,0,0,945,228,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear213XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 1280]]",1,1998848,60,1,8,1280,768,0,945,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,344,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.13443298969072,1199.4666659954896,0.03625240738642801,0.9995555549962414,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +214,GatedSelfAttention-Attn213-Q-214,"XlaEinsum(a=1x72x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213Q214MatMulQ,MXU,1,Memory,2830,1555,2830,0,0,0,0,0,0,0,0,1555,380,0,0,3645440,"DT_BFLOAT16:[1,72,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,72,8,160)]",235929600,GatedSelfAttentionAttn213Q214MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 72, 1280], [1280, 8, 160], [1, 72, 8, 
160]]",1,2953216,100,1,72,1280,1280,0,1555,3645440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,586,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,83.36734982332156,1199.6751118043287,0.298217683376694,0.9997292598369406,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +214,GatedSelfAttention-Attn213-K-214,"XlaEinsum(a=1x72x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213K214MatMulK,MXU,1,Memory,2830,1555,2830,0,0,0,0,0,0,0,0,1555,380,0,0,3645440,"DT_BFLOAT16:[1,72,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,72,8,160)]",235929600,GatedSelfAttentionAttn213K214MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 72, 1280], [1280, 8, 160], [1, 72, 8, 160]]",1,2953216,100,1,72,1280,1280,0,1555,3645440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,586,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,83.36734982332156,1199.6751118043287,0.298217683376694,0.9997292598369406,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +214,GatedSelfAttention-Attn213-V-214,"XlaEinsum(a=1x72x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213V214MatMulV,MXU,1,Memory,2830,1555,2830,0,0,0,0,0,0,0,0,1555,380,0,0,3645440,"DT_BFLOAT16:[1,72,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,72,8,160)]",235929600,GatedSelfAttentionAttn213V214MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 72, 1280], [1280, 8, 160], [1, 72, 8, 160]]",1,2953216,100,1,72,1280,1280,0,1555,3645440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,586,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,83.36734982332156,1199.6751118043287,0.298217683376694,0.9997292598369406,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +215,GatedSelfAttention-Attn213-FlashAttention-215,"FlashAttention(q=1x72x8x160,k=1x72x8x160,v=1x72x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",GatedSelfAttentionAttn213FlashAttention215FlashAttention,MXU,1,Memory,10872,550,10872,0,0,0,0,0,0,0,0,550,159,0,0,14008320,"DT_BFLOAT16:[1,72,8,160],DT_BFLOAT16:[1,72,8,160],DT_BFLOAT16:[1,72,8,160]","[DT_BFLOAT16:(1,72,8,8)]",1492992,GatedSelfAttentionAttn213FlashAttention215FlashAttention,FlashAttention,0,[],FlashAttention,,"[72, 72]",,14008320,32,8,72,72,160,39,550,14008320,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,895,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.13732450331125828,1199.9875504449503,0.0004912306236809549,0.999989625370792,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +216,GatedSelfAttention-Attn213-Attention_output-216,"XlaEinsum(a=1x72x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213Attentionoutput216MatMulattnOutputattnAvgWo,MXU,1,Memory,2830,1555,2830,0,0,0,0,0,0,0,0,1555,380,0,0,3645440,"DT_BFLOAT16:[1,72,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,72,1280)]",235929600,GatedSelfAttentionAttn213Attentionoutput216MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 72, 8, 160], [8, 160, 1280], [1, 72, 1280]]",1,2953216,100,1,72,1280,1280,0,1555,3645440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,586,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,83.36734982332156,1199.6751118043287,0.298217683376694,0.9997292598369406,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+217,GatedSelfAttention-Attn213-Attention_layernorm-217,"LayerNorm(x=1x72x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213Attentionlayernorm217YnormLayerNormy,VPU,1,Memory,500,172,500,0,0,0,0,0,0,0,0,0,172,0,0,368640,"DT_BFLOAT16:[1,72,1280]","[DT_BFLOAT16:(1,72,1280)]",737280,GatedSelfAttentionAttn213Attentionlayernorm217YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,172,368640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,77,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.47456,686.6455078125,0.3428571428571429,0.57220458984375,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +218,GatedSelfAttention-FFN213Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x64x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN213FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,10809,6126,10809,0,0,0,0,0,0,0,0,6126,1523,0,0,13926400,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,64,5120)]",838860800,GatedSelfAttentionFFN213FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 64, 1280], [1280, 5120], [1, 64, 5120]]",1,11272192,400,1,64,5120,1280,0,6126,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2285,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,77.60762327689888,1199.9232771879915,0.2776142659573134,0.9999360643233263,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +219,GatedSelfAttention-FFN213Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x64x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN213FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,10809,6126,10809,0,0,0,0,0,0,0,0,6126,1523,0,0,13926400,"DT_BFLOAT16:[1,64,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,64,1280)]",838860800,GatedSelfAttentionFFN213FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 64, 5120], [5120, 1280], [1, 64, 
1280]]",1,2916352,400,1,64,1280,5120,0,6126,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2285,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,77.60762327689888,1199.9232771879915,0.2776142659573134,0.9999360643233263,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +220,GatedSelfAttention-FFN213Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x64x1280,b=1x64x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN213FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,20,500,0,0,0,0,0,0,0,0,0,20,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",81920,GatedSelfAttentionFFN213FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,20,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,39,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16384,610.3515625,0.0380952380952381,0.5086263020833334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +221,BasicTransformerBlock-Fuser_output_layernorm221,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm221XnormLayerNormX,VPU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,0,153,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,BasicTransformerBlockFuseroutputlayernorm221XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,153,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.3047619047619048,0.5086263020833334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+222,CrossAttention222-Q-222,"XlaEinsum(a=1x64x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention222Q222MatMulQ,MXU,1,Memory,2798,1555,2798,0,0,0,0,0,0,0,0,1555,380,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,64,8,160)]",209715200,CrossAttention222Q222MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 64, 1280], [1280, 8, 160], [1, 64, 8, 160]]",1,2916352,100,1,64,1280,1280,0,1555,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,583,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,74.9518227305218,1199.7618276447463,0.26811406368232676,0.9998015230372885,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +222,CrossAttention222-K-222,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention222K222MatMulK,MXU,1,Compute,3688,3688,3154,0,0,0,0,0,0,0,0,3688,914,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention222K222MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,3688,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1142,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.94819956616055,1026.079090970716,0.9763772019737315,0.8550659091422633,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +222,CrossAttention222-V-222,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention222V222MatMulV,MXU,1,Compute,3688,3688,3154,0,0,0,0,0,0,0,0,3688,914,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention222V222MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 
160]]",1,4063232,240,1,512,1280,768,0,3688,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1142,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.94819956616055,1026.079090970716,0.9763772019737315,0.8550659091422633,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +223,CrossAttention222-FlashAttention-223,"FlashAttention(q=1x64x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention222FlashAttention223FlashAttention,MXU,1,Memory,67393,2012,67393,0,0,0,0,0,0,0,0,2012,729,0,0,86835200,"DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,64,8,8)]",9437184,CrossAttention222FlashAttention223FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 64]",,86835200,128,8,512,64,160,243,2012,86835200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.14003211015980888,1199.9997333736442,0.0005009161449741332,0.9999997778113702,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +224,CrossAttention222-Attention_output-224,"XlaEinsum(a=1x64x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention222Attentionoutput224MatMulattnOutputattnAvgWo,MXU,1,Memory,2798,1555,2798,0,0,0,0,0,0,0,0,1555,380,0,0,3604480,"DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,64,1280)]",209715200,CrossAttention222Attentionoutput224MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 64, 8, 160], [8, 160, 1280], [1, 64, 1280]]",1,2916352,100,1,64,1280,1280,0,1555,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,583,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,74.9518227305218,1199.7618276447463,0.26811406368232676,0.9998015230372885,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+225,CrossAttention222-Attention_layernorm-225,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention222Attentionlayernorm225YnormLayerNormy,VPU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,0,153,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,CrossAttention222Attentionlayernorm225YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,153,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.3047619047619048,0.5086263020833334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +226,BasicTransformerBlock-Attn_output_layernorm226,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm226XnormLayerNormX,VPU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,0,153,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,BasicTransformerBlockAttnoutputlayernorm226XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,153,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.3047619047619048,0.5086263020833334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +227,BasicTransformerBlock-FFN227Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x64x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN227FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,10809,6126,10809,0,0,0,0,0,0,0,0,6126,1523,0,0,13926400,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,64,5120)]",838860800,BasicTransformerBlockFFN227FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 64, 1280], [1280, 5120], [1, 64, 
5120]]",1,11272192,400,1,64,5120,1280,0,6126,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2285,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,77.60762327689888,1199.9232771879915,0.2776142659573134,0.9999360643233263,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +228,BasicTransformerBlock-FFN227Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x64x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN227FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,10809,6126,10809,0,0,0,0,0,0,0,0,6126,1523,0,0,13926400,"DT_BFLOAT16:[1,64,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,64,1280)]",838860800,BasicTransformerBlockFFN227FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 64, 5120], [5120, 1280], [1, 64, 1280]]",1,2916352,400,1,64,1280,5120,0,6126,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2285,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,77.60762327689888,1199.9232771879915,0.2776142659573134,0.9999360643233263,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +229,BasicTransformerBlock-FFN227Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x64x1280,b=1x64x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN227FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,20,500,0,0,0,0,0,0,0,0,0,20,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",81920,BasicTransformerBlockFFN227FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,20,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,39,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16384,610.3515625,0.0380952380952381,0.5086263020833334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+230,SpatialTransformer-Proj_out230,"XlaEinsum(a=1x64x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout230einsum,MXU,1,Memory,2798,1555,2798,0,0,0,0,0,0,0,0,1555,380,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,64,1280)]",209715200,SpatialTransformerProjout230einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 64, 1280], [1280, 1280], [1, 64, 1280]]",1,2916352,100,1,64,1280,1280,0,1555,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,583,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,74.9518227305218,1199.7618276447463,0.26811406368232676,0.9998015230372885,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +231,Time-Embed-MLP-Einsum231,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum231einsum,MXU,1,Memory,2548,1555,2548,0,0,0,0,0,0,0,0,1555,380,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum231einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,1555,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,566,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.286028257456829,1199.578565181331,0.004600318572061116,0.9996488043177758,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+232,Conv2d-GroupNorm232,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm232XnormGroupNormX,VPU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,0,153,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm232XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,153,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.3047619047619048,0.5086263020833334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +233,Conv2d232Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d232Conv2dconv2d,MXU,1,Memory,23143,13745,23143,0,0,0,0,0,0,0,0,13745,3428,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d232Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,13745,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5050,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,81.55540768266863,1199.9739054465713,0.29173609089782454,0.9999782545388094,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +234,Conv2d-GroupNorm234,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm234XnormGroupNormX,VPU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,0,153,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm234XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,153,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.3047619047619048,0.5086263020833334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+235,Conv2d234Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d234Conv2dconv2d,MXU,1,Memory,23143,13745,23143,0,0,0,0,0,0,0,0,13745,3428,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d234Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,13745,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5050,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,81.55540768266863,1199.9739054465713,0.29173609089782454,0.9999782545388094,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +236,Time-Embed-MLP-Einsum236,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum236einsum,MXU,1,Memory,2548,1555,2548,0,0,0,0,0,0,0,0,1555,380,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum236einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,1555,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,566,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.286028257456829,1199.578565181331,0.004600318572061116,0.9996488043177758,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+237,Conv2d-GroupNorm237,"GroupNorm(x=1x2560x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm237XnormGroupNormX,VPU,1,Memory,509,305,509,0,0,0,0,0,0,0,0,0,305,0,0,655360,"DT_BFLOAT16:[1,2560,8,8]","[DT_BFLOAT16:(1,2560,8,8)]",1310720,Conv2dGroupNorm237XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,305,655360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,111,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +238,Conv2d237Conv2d,"Conv2D(a=1x2560x8x8,b=2560x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d237Conv2dconv2d,MXU,1,Memory,46158,27460,46158,0,0,0,0,0,0,0,0,27460,6857,0,0,59473920,"DT_BFLOAT16:[1,2560,8,8],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",3774873600,Conv2d237Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 2560, 8, 8], [2560, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,1800,1,1280,64,23040,0,27460,59473920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10084,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,81.7815676589107,1199.9957601472117,0.2925450995124725,0.9999964667893431,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +239,Conv2d-GroupNorm239,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm239XnormGroupNormX,VPU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,0,153,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm239XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,153,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.3047619047619048,0.5086263020833334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+240,Conv2d239Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d239Conv2dconv2d,MXU,1,Memory,23143,13745,23143,0,0,0,0,0,0,0,0,13745,3428,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d239Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,13745,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5050,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,81.55540768266863,1199.9739054465713,0.29173609089782454,0.9999782545388094,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +241,SkipConnection-Einsum236,"XlaEinsum(a=1x8x8x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum236einsum,MXU,1,Memory,5468,3079,5468,0,0,0,0,0,0,0,0,3079,761,0,0,7045120,"DT_BFLOAT16:[1,8,8,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,8,8,1280)]",419430400,SkipConnectionEinsum236einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 8, 8, 2560], [2560, 1280], [1, 8, 8, 1280]]",1,2916352,200,1,64,1280,2560,0,3079,7045120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,76.70636430138991,1199.941349099305,0.27439032559734833,0.9999511242494208,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +242,Time-Embed-MLP-Einsum242,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum242einsum,MXU,1,Memory,2548,1555,2548,0,0,0,0,0,0,0,0,1555,380,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum242einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 
1280]]",1,2626048,100,1,1,1280,1280,0,1555,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,566,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.286028257456829,1199.578565181331,0.004600318572061116,0.9996488043177758,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +243,Conv2d-GroupNorm243,"GroupNorm(x=1x2560x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm243XnormGroupNormX,VPU,1,Memory,509,305,509,0,0,0,0,0,0,0,0,0,305,0,0,655360,"DT_BFLOAT16:[1,2560,8,8]","[DT_BFLOAT16:(1,2560,8,8)]",1310720,Conv2dGroupNorm243XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,305,655360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,111,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +244,Conv2d243Conv2d,"Conv2D(a=1x2560x8x8,b=2560x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d243Conv2dconv2d,MXU,1,Memory,46158,27460,46158,0,0,0,0,0,0,0,0,27460,6857,0,0,59473920,"DT_BFLOAT16:[1,2560,8,8],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",3774873600,Conv2d243Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 2560, 8, 8], [2560, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,1800,1,1280,64,23040,0,27460,59473920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10084,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,81.7815676589107,1199.9957601472117,0.2925450995124725,0.9999964667893431,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+245,Conv2d-GroupNorm245,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm245XnormGroupNormX,VPU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,0,153,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm245XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,153,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.3047619047619048,0.5086263020833334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +246,Conv2d245Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d245Conv2dconv2d,MXU,1,Memory,23143,13745,23143,0,0,0,0,0,0,0,0,13745,3428,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d245Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,13745,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5050,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,81.55540768266863,1199.9739054465713,0.29173609089782454,0.9999782545388094,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +247,SkipConnection-Einsum242,"XlaEinsum(a=1x8x8x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum242einsum,MXU,1,Memory,5468,3079,5468,0,0,0,0,0,0,0,0,3079,761,0,0,7045120,"DT_BFLOAT16:[1,8,8,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,8,8,1280)]",419430400,SkipConnectionEinsum242einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 8, 8, 2560], [2560, 1280], [1, 8, 8, 
1280]]",1,2916352,200,1,64,1280,2560,0,3079,7045120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,76.70636430138991,1199.941349099305,0.27439032559734833,0.9999511242494208,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +248,Time-Embed-MLP-Einsum248,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum248einsum,MXU,1,Memory,2548,1555,2548,0,0,0,0,0,0,0,0,1555,380,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum248einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,1555,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,566,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.286028257456829,1199.578565181331,0.004600318572061116,0.9996488043177758,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +249,Conv2d-GroupNorm249,"GroupNorm(x=1x2560x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm249XnormGroupNormX,VPU,1,Memory,509,305,509,0,0,0,0,0,0,0,0,0,305,0,0,655360,"DT_BFLOAT16:[1,2560,8,8]","[DT_BFLOAT16:(1,2560,8,8)]",1310720,Conv2dGroupNorm249XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,305,655360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,111,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +250,Conv2d249Conv2d,"Conv2D(a=1x2560x8x8,b=2560x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d249Conv2dconv2d,MXU,1,Memory,46158,27460,46158,0,0,0,0,0,0,0,0,27460,6857,0,0,59473920,"DT_BFLOAT16:[1,2560,8,8],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",3774873600,Conv2d249Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 2560, 8, 8], [2560, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,1800,1,1280,64,23040,0,27460,59473920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10084,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,81.7815676589107,1199.9957601472117,0.2925450995124725,0.9999964667893431,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +251,Conv2d-GroupNorm251,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm251XnormGroupNormX,VPU,1,Memory,500,153,500,0,0,0,0,0,0,0,0,0,153,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm251XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,153,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.3047619047619048,0.5086263020833334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +252,Conv2d251Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d251Conv2dconv2d,MXU,1,Memory,23143,13745,23143,0,0,0,0,0,0,0,0,13745,3428,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d251Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 
8]]",1,23961600,900,1,1280,64,11520,0,13745,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5050,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,81.55540768266863,1199.9739054465713,0.29173609089782454,0.9999782545388094,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +253,SkipConnection-Einsum248,"XlaEinsum(a=1x8x8x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum248einsum,MXU,1,Memory,5468,3079,5468,0,0,0,0,0,0,0,0,3079,761,0,0,7045120,"DT_BFLOAT16:[1,8,8,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,8,8,1280)]",419430400,SkipConnectionEinsum248einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 8, 8, 2560], [2560, 1280], [1, 8, 8, 1280]]",1,2916352,200,1,64,1280,2560,0,3079,7045120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,76.70636430138991,1199.941349099305,0.27439032559734833,0.9999511242494208,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +254,Upsample254,"Upsample(a=1x1280x8x8,scale_factor=2,memory_placements=0_0_0,type=DT_BFLOAT16)",Upsample254Upsample,VPU,1,Memory,636,0,636,0,0,0,0,0,0,0,0,0,0,0,0,819200,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,16,16)]",0,Upsample254Upsample,Upsample,0,[],Upsample,,,,,0,,,,,0,0,819200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,44,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,1199.5903351022012,0.0,0.9996586125851676,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +254,Upsample-Conv2d254Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",UpsampleConv2d254Conv2dconv2d,MXU,1,Memory,23143,13745,23143,0,0,0,0,0,0,0,0,13745,3428,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,UpsampleConv2d254Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,13745,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5050,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,81.55540768266863,1199.9739054465713,0.29173609089782454,0.9999782545388094,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +255,Time-Embed-MLP-Einsum255,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum255einsum,MXU,1,Memory,2548,1555,2548,0,0,0,0,0,0,0,0,1555,380,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum255einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,1555,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,566,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.286028257456829,1199.578565181331,0.004600318572061116,0.9996488043177758,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +256,Conv2d-GroupNorm256,"GroupNorm(x=1x2560x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm256XnormGroupNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,2560,16,16]","[DT_BFLOAT16:(1,2560,16,16)]",5242880,Conv2dGroupNorm256XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+257,Conv2d256Conv2d,"Conv2D(a=1x2560x16x16,b=2560x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d256Conv2dconv2d,MXU,1,Compute,54888,54888,47303,0,0,0,0,0,0,0,0,54888,13714,0,0,60948480,"DT_BFLOAT16:[1,2560,16,16],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",15099494400,Conv2d256Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 2560, 16, 16], [2560, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,3600,1,1280,256,23040,0,54888,60948480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17021,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.0964582422387,1034.15492115763,0.9840618498248582,0.8617957676313583,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +258,Conv2d-GroupNorm258,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm258XnormGroupNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm258XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +259,Conv2d258Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d258Conv2dconv2d,MXU,1,Compute,27460,27460,23906,0,0,0,0,0,0,0,0,27460,6857,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d258Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 
16]]",1,24911872,1800,1,1280,256,11520,0,27460,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8532,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.93616897305174,1044.665820739257,0.9834884707426589,0.8705548506160475,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +260,SkipConnection-Einsum255,"XlaEinsum(a=1x16x16x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum255einsum,MXU,1,Memory,6613,6126,6613,0,0,0,0,0,0,0,0,6126,1523,0,0,8519680,"DT_BFLOAT16:[1,16,16,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,16,16,1280)]",1677721600,SkipConnectionEinsum255einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 16, 16, 2560], [2560, 1280], [1, 16, 16, 1280]]",1,3801088,400,1,256,1280,2560,0,6126,8519680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1992,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,253.7005292605474,1199.8442934371692,0.9075253593626497,0.9998702445309743,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +261,SpatialTransformer-Input_GroupNorm261,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm261XnormGroupNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm261XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+262,SpatialTransformer-Proj_in262,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin262einsum,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin262einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +263,BasicTransformerBlock-Input_layernorm263,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm263XnormLayerNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm263XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +264,SelfAttention264-Q-264,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention264Q264MatMulQ,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention264Q264MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +264,SelfAttention264-K-264,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention264K264MatMulK,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention264K264MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +264,SelfAttention264-V-264,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention264V264MatMulV,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention264V264MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +265,SelfAttention264-FlashAttention-265,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",SelfAttention264FlashAttention265FlashAttention,MXU,1,Memory,2035,2012,2035,0,0,0,0,0,0,0,0,2012,973,0,0,2621440,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention264FlashAttention265FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 128]",,335872,128,8,256,256,160,487,2012,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,644,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,9.274873710073711,1199.708230958231,0.033177633177633184,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +266,SelfAttention264-Attention_output-266,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention264Attentionoutput266MatMulattnOutputattnAvgWo,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention264Attentionoutput266MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+267,SelfAttention264-Attention_layernorm-267,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention264Attentionlayernorm267YnormLayerNormy,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention264Attentionlayernorm267YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +268,GatedSelfAttention-Linear268,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear268XLinearcontext,MXU,1,Memory,1552,945,1552,0,0,0,0,0,0,0,0,945,228,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear268XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 1280]]",1,1998848,60,1,8,1280,768,0,945,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,344,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.13443298969072,1199.4666659954896,0.03625240738642801,0.9995555549962414,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +269,GatedSelfAttention-Attn268-Q-269,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268Q269MatMulQ,MXU,1,Compute,4602,4602,3593,0,0,0,0,0,0,0,0,4602,1142,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn268Q269MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 
160]]",1,3837952,300,1,264,1280,1280,0,4602,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1401,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.97809647979142,936.6814233281725,0.6724262265331368,0.7805678527734771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +269,GatedSelfAttention-Attn268-K-269,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268K269MatMulK,MXU,1,Compute,4602,4602,3593,0,0,0,0,0,0,0,0,4602,1142,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn268K269MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,4602,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1401,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.97809647979142,936.6814233281725,0.6724262265331368,0.7805678527734771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +269,GatedSelfAttention-Attn268-V-269,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268V269MatMulV,MXU,1,Compute,4602,4602,3593,0,0,0,0,0,0,0,0,4602,1142,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn268V269MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,4602,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1401,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.97809647979142,936.6814233281725,0.6724262265331368,0.7805678527734771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +270,GatedSelfAttention-Attn268-FlashAttention-270,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",GatedSelfAttentionAttn268FlashAttention270FlashAttention,MXU,1,Compute,4450,4450,2099,0,0,0,0,0,0,0,0,4450,1615,0,0,2703360,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn268FlashAttention270FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 128]",,345088,288,8,264,264,160,519,4450,2703360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1258,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.510662471910113,565.7753247893259,0.0161353253488085,0.4714794373244382,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +271,GatedSelfAttention-Attn268-Attention_output-271,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268Attentionoutput271MatMulattnOutputattnAvgWo,MXU,1,Compute,4602,4602,3593,0,0,0,0,0,0,0,0,4602,1142,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn268Attentionoutput271MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,3837952,300,1,264,1280,1280,0,4602,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1401,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.97809647979142,936.6814233281725,0.6724262265331368,0.7805678527734771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+272,GatedSelfAttention-Attn268-Attention_layernorm-272,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268Attentionlayernorm272YnormLayerNormy,VPU,1,Memory,1050,629,1050,0,0,0,0,0,0,0,0,0,629,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn268Attentionlayernorm272YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,629,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5746285714285713,1198.9048549107142,0.5986394557823129,0.9990873790922619,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +273,GatedSelfAttention-FFN268Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN268FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,12716,12221,12716,0,0,0,0,0,0,0,0,12221,3047,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN268FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,12221,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,263.87568417741426,1199.967683430324,0.9439234352729162,0.9999730695252701,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +274,GatedSelfAttention-FFN268Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN268FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,12716,12221,12716,0,0,0,0,0,0,0,0,12221,3047,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN268FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 
256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,12221,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,263.87568417741426,1199.967683430324,0.9439234352729162,0.9999730695252701,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +275,GatedSelfAttention-FFN268Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN268FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1018,77,1018,0,0,0,0,0,0,0,0,0,77,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN268FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,77,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,90,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32188605108055013,1199.1189833005894,0.07484329684722613,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +276,BasicTransformerBlock-Fuser_output_layernorm276,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm276XnormLayerNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm276XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+277,CrossAttention277-Q-277,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention277Q277MatMulQ,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention277Q277MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +277,CrossAttention277-K-277,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention277K277MatMulK,MXU,1,Compute,3688,3688,3154,0,0,0,0,0,0,0,0,3688,914,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention277K277MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,3688,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1142,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.94819956616055,1026.079090970716,0.9763772019737315,0.8550659091422633,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +277,CrossAttention277-V-277,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention277V277MatMulV,MXU,1,Compute,3688,3688,3154,0,0,0,0,0,0,0,0,3688,914,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention277V277MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 
160]]",1,4063232,240,1,512,1280,768,0,3688,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1142,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.94819956616055,1026.079090970716,0.9763772019737315,0.8550659091422633,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +278,CrossAttention277-FlashAttention-278,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention277FlashAttention278FlashAttention,MXU,1,Compute,3962,3962,3052,0,0,0,0,0,0,0,0,3962,1949,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention277FlashAttention278FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,256,8,512,256,160,975,3962,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,9.52769712266532,924.3082723372034,0.03408202095733646,0.7702568936143362,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +279,CrossAttention277-Attention_output-279,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention277Attentionoutput279MatMulattnOutputattnAvgWo,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention277Attentionoutput279MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+280,CrossAttention277-Attention_layernorm-280,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention277Attentionlayernorm280YnormLayerNormy,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention277Attentionlayernorm280YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +281,BasicTransformerBlock-Attn_output_layernorm281,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm281XnormLayerNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm281XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +282,BasicTransformerBlock-FFN282Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN282FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,12716,12221,12716,0,0,0,0,0,0,0,0,12221,3047,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN282FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 
5120]]",1,13631488,800,1,256,5120,1280,0,12221,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,263.87568417741426,1199.967683430324,0.9439234352729162,0.9999730695252701,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +283,BasicTransformerBlock-FFN282Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN282FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,12716,12221,12716,0,0,0,0,0,0,0,0,12221,3047,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN282FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,12221,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,263.87568417741426,1199.967683430324,0.9439234352729162,0.9999730695252701,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +284,BasicTransformerBlock-FFN282Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN282FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1018,77,1018,0,0,0,0,0,0,0,0,0,77,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN282FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,77,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,90,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32188605108055013,1199.1189833005894,0.07484329684722613,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+285,SpatialTransformer-Proj_out285,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout285einsum,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout285einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +286,Time-Embed-MLP-Einsum286,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum286einsum,MXU,1,Memory,2548,1555,2548,0,0,0,0,0,0,0,0,1555,380,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum286einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,1555,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,566,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.286028257456829,1199.578565181331,0.004600318572061116,0.9996488043177758,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+287,Conv2d-GroupNorm287,"GroupNorm(x=1x2560x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm287XnormGroupNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,2560,16,16]","[DT_BFLOAT16:(1,2560,16,16)]",5242880,Conv2dGroupNorm287XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +288,Conv2d287Conv2d,"Conv2D(a=1x2560x16x16,b=2560x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d287Conv2dconv2d,MXU,1,Compute,54888,54888,47303,0,0,0,0,0,0,0,0,54888,13714,0,0,60948480,"DT_BFLOAT16:[1,2560,16,16],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",15099494400,Conv2d287Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 2560, 16, 16], [2560, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,3600,1,1280,256,23040,0,54888,60948480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17021,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.0964582422387,1034.15492115763,0.9840618498248582,0.8617957676313583,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+289,Conv2d-GroupNorm289,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm289XnormGroupNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm289XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +290,Conv2d289Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d289Conv2dconv2d,MXU,1,Compute,27460,27460,23906,0,0,0,0,0,0,0,0,27460,6857,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d289Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,27460,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8532,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.93616897305174,1044.665820739257,0.9834884707426589,0.8705548506160475,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +291,SkipConnection-Einsum286,"XlaEinsum(a=1x16x16x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum286einsum,MXU,1,Memory,6613,6126,6613,0,0,0,0,0,0,0,0,6126,1523,0,0,8519680,"DT_BFLOAT16:[1,16,16,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,16,16,1280)]",1677721600,SkipConnectionEinsum286einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 16, 16, 2560], [2560, 1280], [1, 16, 16, 
1280]]",1,3801088,400,1,256,1280,2560,0,6126,8519680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1992,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,253.7005292605474,1199.8442934371692,0.9075253593626497,0.9998702445309743,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +292,SpatialTransformer-Input_GroupNorm292,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm292XnormGroupNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm292XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +293,SpatialTransformer-Proj_in293,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin293einsum,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin293einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+294,BasicTransformerBlock-Input_layernorm294,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm294XnormLayerNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm294XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +295,SelfAttention295-Q-295,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention295Q295MatMulQ,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention295Q295MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +295,SelfAttention295-K-295,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention295K295MatMulK,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention295K295MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +295,SelfAttention295-V-295,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention295V295MatMulV,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention295V295MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +296,SelfAttention295-FlashAttention-296,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention295FlashAttention296FlashAttention,MXU,1,Memory,2035,2012,2035,0,0,0,0,0,0,0,0,2012,973,0,0,2621440,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention295FlashAttention296FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 128]",,335872,128,8,256,256,160,487,2012,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,644,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,9.274873710073711,1199.708230958231,0.033177633177633184,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+297,SelfAttention295-Attention_output-297,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention295Attentionoutput297MatMulattnOutputattnAvgWo,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention295Attentionoutput297MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +298,SelfAttention295-Attention_layernorm-298,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention295Attentionlayernorm298YnormLayerNormy,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention295Attentionlayernorm298YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +299,GatedSelfAttention-Linear299,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear299XLinearcontext,MXU,1,Memory,1552,945,1552,0,0,0,0,0,0,0,0,945,228,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear299XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 
1280]]",1,1998848,60,1,8,1280,768,0,945,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,344,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.13443298969072,1199.4666659954896,0.03625240738642801,0.9995555549962414,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +300,GatedSelfAttention-Attn299-Q-300,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299Q300MatMulQ,MXU,1,Compute,4602,4602,3593,0,0,0,0,0,0,0,0,4602,1142,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn299Q300MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,4602,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1401,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.97809647979142,936.6814233281725,0.6724262265331368,0.7805678527734771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +300,GatedSelfAttention-Attn299-K-300,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299K300MatMulK,MXU,1,Compute,4602,4602,3593,0,0,0,0,0,0,0,0,4602,1142,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn299K300MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,4602,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1401,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.97809647979142,936.6814233281725,0.6724262265331368,0.7805678527734771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+300,GatedSelfAttention-Attn299-V-300,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299V300MatMulV,MXU,1,Compute,4602,4602,3593,0,0,0,0,0,0,0,0,4602,1142,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn299V300MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,4602,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1401,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.97809647979142,936.6814233281725,0.6724262265331368,0.7805678527734771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +301,GatedSelfAttention-Attn299-FlashAttention-301,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn299FlashAttention301FlashAttention,MXU,1,Compute,4450,4450,2099,0,0,0,0,0,0,0,0,4450,1615,0,0,2703360,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn299FlashAttention301FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 128]",,345088,288,8,264,264,160,519,4450,2703360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1258,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.510662471910113,565.7753247893259,0.0161353253488085,0.4714794373244382,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+302,GatedSelfAttention-Attn299-Attention_output-302,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299Attentionoutput302MatMulattnOutputattnAvgWo,MXU,1,Compute,4602,4602,3593,0,0,0,0,0,0,0,0,4602,1142,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn299Attentionoutput302MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,3837952,300,1,264,1280,1280,0,4602,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1401,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.97809647979142,936.6814233281725,0.6724262265331368,0.7805678527734771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +303,GatedSelfAttention-Attn299-Attention_layernorm-303,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299Attentionlayernorm303YnormLayerNormy,VPU,1,Memory,1050,629,1050,0,0,0,0,0,0,0,0,0,629,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn299Attentionlayernorm303YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,629,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5746285714285713,1198.9048549107142,0.5986394557823129,0.9990873790922619,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+304,GatedSelfAttention-FFN299Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN299FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,12716,12221,12716,0,0,0,0,0,0,0,0,12221,3047,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN299FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,12221,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,263.87568417741426,1199.967683430324,0.9439234352729162,0.9999730695252701,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +305,GatedSelfAttention-FFN299Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN299FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,12716,12221,12716,0,0,0,0,0,0,0,0,12221,3047,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN299FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,12221,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,263.87568417741426,1199.967683430324,0.9439234352729162,0.9999730695252701,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+306,GatedSelfAttention-FFN299Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN299FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1018,77,1018,0,0,0,0,0,0,0,0,0,77,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN299FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,77,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,90,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32188605108055013,1199.1189833005894,0.07484329684722613,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +307,BasicTransformerBlock-Fuser_output_layernorm307,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm307XnormLayerNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm307XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +308,CrossAttention308-Q-308,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention308Q308MatMulQ,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention308Q308MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +308,CrossAttention308-K-308,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention308K308MatMulK,MXU,1,Compute,3688,3688,3154,0,0,0,0,0,0,0,0,3688,914,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention308K308MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,3688,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1142,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.94819956616055,1026.079090970716,0.9763772019737315,0.8550659091422633,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +308,CrossAttention308-V-308,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention308V308MatMulV,MXU,1,Compute,3688,3688,3154,0,0,0,0,0,0,0,0,3688,914,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention308V308MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,3688,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1142,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.94819956616055,1026.079090970716,0.9763772019737315,0.8550659091422633,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +309,CrossAttention308-FlashAttention-309,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention308FlashAttention309FlashAttention,MXU,1,Compute,3962,3962,3052,0,0,0,0,0,0,0,0,3962,1949,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention308FlashAttention309FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,256,8,512,256,160,975,3962,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,9.52769712266532,924.3082723372034,0.03408202095733646,0.7702568936143362,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +310,CrossAttention308-Attention_output-310,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention308Attentionoutput310MatMulattnOutputattnAvgWo,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention308Attentionoutput310MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+311,CrossAttention308-Attention_layernorm-311,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention308Attentionlayernorm311YnormLayerNormy,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention308Attentionlayernorm311YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +312,BasicTransformerBlock-Attn_output_layernorm312,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm312XnormLayerNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm312XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +313,BasicTransformerBlock-FFN313Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN313FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,12716,12221,12716,0,0,0,0,0,0,0,0,12221,3047,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN313FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 
5120]]",1,13631488,800,1,256,5120,1280,0,12221,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,263.87568417741426,1199.967683430324,0.9439234352729162,0.9999730695252701,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +314,BasicTransformerBlock-FFN313Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN313FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,12716,12221,12716,0,0,0,0,0,0,0,0,12221,3047,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN313FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,12221,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,263.87568417741426,1199.967683430324,0.9439234352729162,0.9999730695252701,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +315,BasicTransformerBlock-FFN313Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN313FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1018,77,1018,0,0,0,0,0,0,0,0,0,77,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN313FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,77,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,90,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32188605108055013,1199.1189833005894,0.07484329684722613,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+316,SpatialTransformer-Proj_out316,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout316einsum,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout316einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +317,Time-Embed-MLP-Einsum317,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum317einsum,MXU,1,Memory,2548,1555,2548,0,0,0,0,0,0,0,0,1555,380,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum317einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,1555,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,566,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.286028257456829,1199.578565181331,0.004600318572061116,0.9996488043177758,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+318,Conv2d-GroupNorm318,"GroupNorm(x=1x1920x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm318XnormGroupNormX,VPU,1,Memory,1526,915,1526,0,0,0,0,0,0,0,0,0,915,0,0,1966080,"DT_BFLOAT16:[1,1920,16,16]","[DT_BFLOAT16:(1,1920,16,16)]",3932160,Conv2dGroupNorm318XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,915,1966080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,335,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.576775884665793,1199.9047755570118,0.5991387380640331,0.9999206462975099,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +319,Conv2d318Conv2d,"Conv2D(a=1x1920x16x16,b=1920x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d318Conv2dconv2d,MXU,1,Compute,41174,41174,35604,0,0,0,0,0,0,0,0,41174,10285,0,0,45875200,"DT_BFLOAT16:[1,1920,16,16],DT_BFLOAT16:[1920,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",11324620800,Conv2d318Conv2dconv2d,Conv2D,44236800,[],Conv2D,bf01;io01->bf01,"[[1, 1920, 16, 16], [1920, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,2700,1,1280,256,17280,0,41174,45875200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,12777,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.04300772332056,1037.6599158449508,0.9838706491934259,0.864716596537459,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+320,Conv2d-GroupNorm320,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm320XnormGroupNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm320XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +321,Conv2d320Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d320Conv2dconv2d,MXU,1,Compute,27460,27460,23906,0,0,0,0,0,0,0,0,27460,6857,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d320Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,27460,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8532,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.93616897305174,1044.665820739257,0.9834884707426589,0.8705548506160475,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +322,SkipConnection-Einsum317,"XlaEinsum(a=1x16x16x1920,b=1920x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum317einsum,MXU,1,Memory,5087,4602,5087,0,0,0,0,0,0,0,0,4602,1142,0,0,6553600,"DT_BFLOAT16:[1,16,16,1920],DT_BFLOAT16:[1920,1280]","[DT_BFLOAT16:(1,16,16,1280)]",1258291200,SkipConnectionEinsum317einsum,Einsum,4915200,[],Einsum,"BHWC,CO->BHWO","[[1, 16, 16, 1920], [1920, 1280], [1, 16, 16, 
1280]]",1,3801088,300,1,256,1280,1920,0,4602,6553600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1505,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,247.354275604482,1199.826149990171,0.8848238453113626,0.9998551249918092,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +323,SpatialTransformer-Input_GroupNorm323,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm323XnormGroupNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm323XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +324,SpatialTransformer-Proj_in324,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin324einsum,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin324einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+325,BasicTransformerBlock-Input_layernorm325,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm325XnormLayerNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm325XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +326,SelfAttention326-Q-326,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention326Q326MatMulQ,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention326Q326MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +326,SelfAttention326-K-326,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention326K326MatMulK,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention326K326MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +326,SelfAttention326-V-326,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention326V326MatMulV,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention326V326MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +327,SelfAttention326-FlashAttention-327,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention326FlashAttention327FlashAttention,MXU,1,Memory,2035,2012,2035,0,0,0,0,0,0,0,0,2012,973,0,0,2621440,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention326FlashAttention327FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 128]",,335872,128,8,256,256,160,487,2012,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,644,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,9.274873710073711,1199.708230958231,0.033177633177633184,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+328,SelfAttention326-Attention_output-328,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention326Attentionoutput328MatMulattnOutputattnAvgWo,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention326Attentionoutput328MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +329,SelfAttention326-Attention_layernorm-329,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention326Attentionlayernorm329YnormLayerNormy,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention326Attentionlayernorm329YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +330,GatedSelfAttention-Linear330,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear330XLinearcontext,MXU,1,Memory,1552,945,1552,0,0,0,0,0,0,0,0,945,228,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear330XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 
1280]]",1,1998848,60,1,8,1280,768,0,945,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,344,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.13443298969072,1199.4666659954896,0.03625240738642801,0.9995555549962414,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +331,GatedSelfAttention-Attn330-Q-331,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330Q331MatMulQ,MXU,1,Compute,4602,4602,3593,0,0,0,0,0,0,0,0,4602,1142,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn330Q331MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,4602,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1401,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.97809647979142,936.6814233281725,0.6724262265331368,0.7805678527734771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +331,GatedSelfAttention-Attn330-K-331,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330K331MatMulK,MXU,1,Compute,4602,4602,3593,0,0,0,0,0,0,0,0,4602,1142,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn330K331MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,4602,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1401,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.97809647979142,936.6814233281725,0.6724262265331368,0.7805678527734771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+331,GatedSelfAttention-Attn330-V-331,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330V331MatMulV,MXU,1,Compute,4602,4602,3593,0,0,0,0,0,0,0,0,4602,1142,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn330V331MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,4602,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1401,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.97809647979142,936.6814233281725,0.6724262265331368,0.7805678527734771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +332,GatedSelfAttention-Attn330-FlashAttention-332,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn330FlashAttention332FlashAttention,MXU,1,Compute,4450,4450,2099,0,0,0,0,0,0,0,0,4450,1615,0,0,2703360,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn330FlashAttention332FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 128]",,345088,288,8,264,264,160,519,4450,2703360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1258,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.510662471910113,565.7753247893259,0.0161353253488085,0.4714794373244382,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+333,GatedSelfAttention-Attn330-Attention_output-333,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330Attentionoutput333MatMulattnOutputattnAvgWo,MXU,1,Compute,4602,4602,3593,0,0,0,0,0,0,0,0,4602,1142,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn330Attentionoutput333MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,3837952,300,1,264,1280,1280,0,4602,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1401,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.97809647979142,936.6814233281725,0.6724262265331368,0.7805678527734771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +334,GatedSelfAttention-Attn330-Attention_layernorm-334,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330Attentionlayernorm334YnormLayerNormy,VPU,1,Memory,1050,629,1050,0,0,0,0,0,0,0,0,0,629,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn330Attentionlayernorm334YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,629,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5746285714285713,1198.9048549107142,0.5986394557823129,0.9990873790922619,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+335,GatedSelfAttention-FFN330Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN330FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,12716,12221,12716,0,0,0,0,0,0,0,0,12221,3047,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN330FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,12221,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,263.87568417741426,1199.967683430324,0.9439234352729162,0.9999730695252701,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +336,GatedSelfAttention-FFN330Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN330FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,12716,12221,12716,0,0,0,0,0,0,0,0,12221,3047,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN330FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,12221,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,263.87568417741426,1199.967683430324,0.9439234352729162,0.9999730695252701,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+337,GatedSelfAttention-FFN330Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN330FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1018,77,1018,0,0,0,0,0,0,0,0,0,77,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN330FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,77,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,90,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32188605108055013,1199.1189833005894,0.07484329684722613,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +338,BasicTransformerBlock-Fuser_output_layernorm338,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm338XnormLayerNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm338XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +339,CrossAttention339-Q-339,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention339Q339MatMulQ,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention339Q339MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +339,CrossAttention339-K-339,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention339K339MatMulK,MXU,1,Compute,3688,3688,3154,0,0,0,0,0,0,0,0,3688,914,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention339K339MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,3688,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1142,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.94819956616055,1026.079090970716,0.9763772019737315,0.8550659091422633,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +339,CrossAttention339-V-339,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention339V339MatMulV,MXU,1,Compute,3688,3688,3154,0,0,0,0,0,0,0,0,3688,914,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention339V339MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,3688,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1142,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.94819956616055,1026.079090970716,0.9763772019737315,0.8550659091422633,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +340,CrossAttention339-FlashAttention-340,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention339FlashAttention340FlashAttention,MXU,1,Compute,3962,3962,3052,0,0,0,0,0,0,0,0,3962,1949,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention339FlashAttention340FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,256,8,512,256,160,975,3962,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1203,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,9.52769712266532,924.3082723372034,0.03408202095733646,0.7702568936143362,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +341,CrossAttention339-Attention_output-341,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention339Attentionoutput341MatMulattnOutputattnAvgWo,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention339Attentionoutput341MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+342,CrossAttention339-Attention_layernorm-342,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention339Attentionlayernorm342YnormLayerNormy,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention339Attentionlayernorm342YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +343,BasicTransformerBlock-Attn_output_layernorm343,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm343XnormLayerNormX,VPU,1,Memory,1018,610,1018,0,0,0,0,0,0,0,0,0,610,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm343XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,610,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.575088408644401,1199.1189833005894,0.598746374777809,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +344,BasicTransformerBlock-FFN344Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN344FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,12716,12221,12716,0,0,0,0,0,0,0,0,12221,3047,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN344FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 
5120]]",1,13631488,800,1,256,5120,1280,0,12221,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,263.87568417741426,1199.967683430324,0.9439234352729162,0.9999730695252701,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +345,BasicTransformerBlock-FFN344Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN344FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,12716,12221,12716,0,0,0,0,0,0,0,0,12221,3047,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN344FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,12221,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,263.87568417741426,1199.967683430324,0.9439234352729162,0.9999730695252701,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +346,BasicTransformerBlock-FFN344Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN344FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1018,77,1018,0,0,0,0,0,0,0,0,0,77,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN344FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,77,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,90,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32188605108055013,1199.1189833005894,0.07484329684722613,0.9992658194171579,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+347,SpatialTransformer-Proj_out347,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout347einsum,MXU,1,Memory,3561,3079,3561,0,0,0,0,0,0,0,0,3079,761,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout347einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,3079,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,235.5688851446223,1199.7924564729008,0.8426657120844148,0.9998270470607507,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +348,Upsample348,"Upsample(a=1x1280x16x16,scale_factor=2,memory_placements=0_0_0,type=DT_BFLOAT16)",Upsample348Upsample,VPU,1,Memory,2544,0,2544,0,0,0,0,0,0,0,0,0,0,0,0,3276800,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,32,32)]",0,Upsample348Upsample,Upsample,0,[],Upsample,,,,,0,,,,,0,0,3276800,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,177,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,1199.5903351022012,0.0,0.9996586125851676,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +348,Upsample-Conv2d348Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",UpsampleConv2d348Conv2dconv2d,MXU,1,Compute,27460,27460,23906,0,0,0,0,0,0,0,0,27460,6857,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,UpsampleConv2d348Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 
16]]",1,24911872,1800,1,1280,256,11520,0,27460,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8532,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.93616897305174,1044.665820739257,0.9834884707426589,0.8705548506160475,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +349,Time-Embed-MLP-Einsum349,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum349einsum,MXU,1,Memory,1275,793,1275,0,0,0,0,0,0,0,0,793,190,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum349einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1314048,50,1,1,640,1280,0,793,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,287,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2850196078431373,1199.5726940678614,0.004596710479063421,0.9996439117232179,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +350,Conv2d-GroupNorm350,"GroupNorm(x=1x1920x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm350XnormGroupNormX,VPU,1,Memory,6104,3658,6104,0,0,0,0,0,0,0,0,0,3658,0,0,7864320,"DT_BFLOAT16:[1,1920,32,32]","[DT_BFLOAT16:(1,1920,32,32)]",15728640,Conv2dGroupNorm350XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,3658,7864320,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1340,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.576775884665793,1199.9047755570118,0.5991387380640331,0.9999206462975099,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +351,Conv2d350Conv2d,"Conv2D(a=1x1920x32x32,b=1920x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d350Conv2dconv2d,MXU,1,Compute,82317,82317,21236,0,0,0,0,0,0,0,0,82317,20571,0,0,27361280,"DT_BFLOAT16:[1,1920,32,32],DT_BFLOAT16:[1920,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",22649241600,Conv2d350Conv2dconv2d,Conv2D,22118400,[],Conv2D,bf01;io01->bf01,"[[1, 1920, 32, 32], [1920, 640, 3, 3], [1, 640, 32, 32]]",1,15474688,5400,1,640,1024,17280,0,82317,27361280,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,22060,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.14658697474397,309.56154542044777,0.9842411679213313,0.2579679545170398,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +352,Conv2d-GroupNorm352,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm352XnormGroupNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm352XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +353,Conv2d352Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d352Conv2dconv2d,MXU,1,Compute,27460,27460,7757,0,0,0,0,0,0,0,0,27460,6857,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d352Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 
32]]",1,10163200,1800,1,640,1024,5760,0,27460,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7406,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.93616897305174,338.9607184313547,0.9834884707426589,0.2824672653594622,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +354,SkipConnection-Einsum349,"XlaEinsum(a=1x32x32x1920,b=1920x640,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum349einsum,MXU,1,Compute,9174,9174,5977,0,0,0,0,0,0,0,0,9174,2285,0,0,7700480,"DT_BFLOAT16:[1,32,32,1920],DT_BFLOAT16:[1920,640]","[DT_BFLOAT16:(1,32,32,640)]",2516582400,SkipConnectionEinsum349einsum,Einsum,2457600,[],Einsum,"BHWC,CO->BHWO","[[1, 32, 32, 1920], [1920, 640], [1, 32, 32, 640]]",1,4718592,600,1,1024,640,1920,0,9174,7700480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2710,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.3168083714846,781.7343426395247,0.981272923718967,0.6514452855329372,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +355,SpatialTransformer-Input_GroupNorm355,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm355XnormGroupNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm355XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+356,SpatialTransformer-Proj_in356,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin356einsum,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin356einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +357,BasicTransformerBlock-Input_layernorm357,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm357XnormLayerNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm357XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +358,SelfAttention358-Q-358,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention358Q358MatMulQ,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention358Q358MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +358,SelfAttention358-K-358,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention358K358MatMulK,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention358K358MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +358,SelfAttention358-V-358,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention358V358MatMulV,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention358V358MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +359,SelfAttention358-FlashAttention-359,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",SelfAttention358FlashAttention359FlashAttention,MXU,1,Compute,15666,15666,4070,0,0,0,0,0,0,0,0,15666,11701,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention358FlashAttention359FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 128]",,872448,1024,8,1024,1024,80,7801,15666,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,19.276770585982383,311.68214604876806,0.06895593873763159,0.2597351217073067,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +360,SelfAttention358-Attention_output-360,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention358Attentionoutput360MatMulattnOutputattnAvgWo,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention358Attentionoutput360MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+361,SelfAttention358-Attention_layernorm-361,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention358Attentionlayernorm361YnormLayerNormy,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention358Attentionlayernorm361YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +362,GatedSelfAttention-Linear362,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear362XLinearcontext,MXU,1,Memory,781,488,781,0,0,0,0,0,0,0,0,488,114,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear362XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 640]]",1,1005568,30,1,8,640,768,0,488,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,176,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.06955185659411,1199.1141852892927,0.03602031771045856,0.9992618210744105,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +363,GatedSelfAttention-Attn362-Q-363,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362Q363MatMulQ,MXU,1,Compute,3460,3460,2687,0,0,0,0,0,0,0,0,3460,857,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn362Q363MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 
80]]",1,3461120,225,1,1032,640,640,0,3460,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1052,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,244.3394219653179,931.6240431945448,0.8740392555421458,0.7763533693287873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +363,GatedSelfAttention-Attn362-K-363,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362K363MatMulK,MXU,1,Compute,3460,3460,2687,0,0,0,0,0,0,0,0,3460,857,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn362K363MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,3460,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1052,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,244.3394219653179,931.6240431945448,0.8740392555421458,0.7763533693287873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +363,GatedSelfAttention-Attn362-V-363,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362V363MatMulV,MXU,1,Compute,3460,3460,2687,0,0,0,0,0,0,0,0,3460,857,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn362V363MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,3460,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1052,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,244.3394219653179,931.6240431945448,0.8740392555421458,0.7763533693287873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +364,GatedSelfAttention-Attn362-FlashAttention-364,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",GatedSelfAttentionAttn362FlashAttention364FlashAttention,MXU,1,Compute,19810,19810,4101,0,0,0,0,0,0,0,0,19810,12860,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn362FlashAttention364FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 128]",,879104,1296,8,1032,1032,80,7924,19810,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5238,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.483438263503281,248.4078481906234,0.05538661237803086,0.20700654015885284,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +365,GatedSelfAttention-Attn362-Attention_output-365,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362Attentionoutput365MatMulattnOutputattnAvgWo,MXU,1,Compute,3460,3460,2687,0,0,0,0,0,0,0,0,3460,857,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn362Attentionoutput365MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,225,1,1032,640,640,0,3460,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1052,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,244.3394219653179,931.6240431945448,0.8740392555421458,0.7763533693287873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+366,GatedSelfAttention-Attn362-Attention_layernorm-366,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362Attentionlayernorm366YnormLayerNormy,VPU,1,Memory,2051,1229,2051,0,0,0,0,0,0,0,0,0,1229,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn362Attentionlayernorm366YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1229,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,450,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5762262311067774,1199.6488231731473,0.5990109354321934,0.9997073526442894,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +367,GatedSelfAttention-FFN362Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN362FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,12221,12221,7630,0,0,0,0,0,0,0,0,12221,3047,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN362FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,12221,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.5637181900008,749.1427409786434,0.9821561576737096,0.6242856174822028,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +368,GatedSelfAttention-FFN362Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN362FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,12221,12221,7630,0,0,0,0,0,0,0,0,12221,3047,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN362FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 
2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,12221,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.5637181900008,749.1427409786434,0.9821561576737096,0.6242856174822028,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +369,GatedSelfAttention-FFN362Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN362FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2035,153,2035,0,0,0,0,0,0,0,0,0,153,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN362FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,153,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32204422604422606,1199.708230958231,0.07488007488007489,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +370,BasicTransformerBlock-Fuser_output_layernorm370,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm370XnormLayerNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm370XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+371,CrossAttention371-Q-371,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention371Q371MatMulQ,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention371Q371MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +371,CrossAttention371-K-371,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention371K371MatMulK,MXU,1,Memory,1882,1860,1882,0,0,0,0,0,0,0,0,1860,457,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention371K371MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,1860,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,267.4370244420829,1199.9472801540915,0.9566628907755369,0.9999560667950762,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +371,CrossAttention371-V-371,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention371V371MatMulV,MXU,1,Memory,1882,1860,1882,0,0,0,0,0,0,0,0,1860,457,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention371V371MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 
80]]",1,2424832,120,1,512,640,768,0,1860,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,267.4370244420829,1199.9472801540915,0.9566628907755369,0.9999560667950762,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +372,CrossAttention371-FlashAttention-372,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention371FlashAttention372FlashAttention,MXU,1,Compute,7864,7864,3052,0,0,0,0,0,0,0,0,7864,5850,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention371FlashAttention372FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,446464,512,8,512,1024,80,3900,7864,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2178,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,19.200781281790437,465.68023588504576,0.06868411344504936,0.38806686323753814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +373,CrossAttention371-Attention_output-373,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention371Attentionoutput373MatMulattnOutputattnAvgWo,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention371Attentionoutput373MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+374,CrossAttention371-Attention_layernorm-374,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention371Attentionlayernorm374YnormLayerNormy,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention371Attentionlayernorm374YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +375,BasicTransformerBlock-Attn_output_layernorm375,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm375XnormLayerNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm375XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +376,BasicTransformerBlock-FFN376Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN376FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,12221,12221,7630,0,0,0,0,0,0,0,0,12221,3047,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN376FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 
2560]]",1,9830400,800,1,1024,2560,640,0,12221,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.5637181900008,749.1427409786434,0.9821561576737096,0.6242856174822028,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +377,BasicTransformerBlock-FFN376Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN376FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,12221,12221,7630,0,0,0,0,0,0,0,0,12221,3047,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN376FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,12221,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.5637181900008,749.1427409786434,0.9821561576737096,0.6242856174822028,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +378,BasicTransformerBlock-FFN376Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN376FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2035,153,2035,0,0,0,0,0,0,0,0,0,153,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN376FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,153,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32204422604422606,1199.708230958231,0.07488007488007489,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+379,SpatialTransformer-Proj_out379,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout379einsum,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout379einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +380,Time-Embed-MLP-Einsum380,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum380einsum,MXU,1,Memory,1275,793,1275,0,0,0,0,0,0,0,0,793,190,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum380einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1314048,50,1,1,640,1280,0,793,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,287,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2850196078431373,1199.5726940678614,0.004596710479063421,0.9996439117232179,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+381,Conv2d-GroupNorm381,"GroupNorm(x=1x1280x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm381XnormGroupNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,1280,32,32]","[DT_BFLOAT16:(1,1280,32,32)]",10485760,Conv2dGroupNorm381XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +382,Conv2d381Conv2d,"Conv2D(a=1x1280x32x32,b=1280x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d381Conv2dconv2d,MXU,1,Compute,54888,54888,14496,0,0,0,0,0,0,0,0,54888,13714,0,0,18677760,"DT_BFLOAT16:[1,1280,32,32],DT_BFLOAT16:[1280,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",15099494400,Conv2d381Conv2dconv2d,Conv2D,14745600,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 32, 32], [1280, 640, 3, 3], [1, 640, 32, 32]]",1,15474688,3600,1,640,1024,11520,0,54888,18677760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,14733,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.0964582422387,316.91844358056403,0.9840618498248582,0.26409870298380334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+383,Conv2d-GroupNorm383,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm383XnormGroupNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm383XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +384,Conv2d383Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d383Conv2dconv2d,MXU,1,Compute,27460,27460,7757,0,0,0,0,0,0,0,0,27460,6857,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d383Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,1800,1,640,1024,5760,0,27460,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7406,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.93616897305174,338.9607184313547,0.9834884707426589,0.2824672653594622,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +385,SkipConnection-Einsum380,"XlaEinsum(a=1x32x32x1280,b=1280x640,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum380einsum,MXU,1,Compute,6126,6126,4324,0,0,0,0,0,0,0,0,6126,1523,0,0,5570560,"DT_BFLOAT16:[1,32,32,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,32,32,640)]",1677721600,SkipConnectionEinsum380einsum,Einsum,1638400,[],Einsum,"BHWC,CO->BHWO","[[1, 32, 32, 1280], [1280, 640], [1, 32, 32, 
640]]",1,4718592,400,1,1024,640,1280,0,6126,5570560,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1833,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,273.8690173032974,846.8802287381652,0.979671106997258,0.705733523948471,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +386,SpatialTransformer-Input_GroupNorm386,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm386XnormGroupNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm386XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +387,SpatialTransformer-Proj_in387,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin387einsum,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin387einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+388,BasicTransformerBlock-Input_layernorm388,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm388XnormLayerNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm388XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +389,SelfAttention389-Q-389,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention389Q389MatMulQ,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention389Q389MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +389,SelfAttention389-K-389,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention389K389MatMulK,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention389K389MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +389,SelfAttention389-V-389,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention389V389MatMulV,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention389V389MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +390,SelfAttention389-FlashAttention-390,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention389FlashAttention390FlashAttention,MXU,1,Compute,15666,15666,4070,0,0,0,0,0,0,0,0,15666,11701,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention389FlashAttention390FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 128]",,872448,1024,8,1024,1024,80,7801,15666,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,19.276770585982383,311.68214604876806,0.06895593873763159,0.2597351217073067,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+391,SelfAttention389-Attention_output-391,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention389Attentionoutput391MatMulattnOutputattnAvgWo,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention389Attentionoutput391MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +392,SelfAttention389-Attention_layernorm-392,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention389Attentionlayernorm392YnormLayerNormy,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention389Attentionlayernorm392YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +393,GatedSelfAttention-Linear393,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear393XLinearcontext,MXU,1,Memory,781,488,781,0,0,0,0,0,0,0,0,488,114,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear393XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 
640]]",1,1005568,30,1,8,640,768,0,488,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,176,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.06955185659411,1199.1141852892927,0.03602031771045856,0.9992618210744105,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +394,GatedSelfAttention-Attn393-Q-394,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393Q394MatMulQ,MXU,1,Compute,3460,3460,2687,0,0,0,0,0,0,0,0,3460,857,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn393Q394MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,3460,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1052,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,244.3394219653179,931.6240431945448,0.8740392555421458,0.7763533693287873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +394,GatedSelfAttention-Attn393-K-394,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393K394MatMulK,MXU,1,Compute,3460,3460,2687,0,0,0,0,0,0,0,0,3460,857,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn393K394MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,3460,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1052,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,244.3394219653179,931.6240431945448,0.8740392555421458,0.7763533693287873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+394,GatedSelfAttention-Attn393-V-394,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393V394MatMulV,MXU,1,Compute,3460,3460,2687,0,0,0,0,0,0,0,0,3460,857,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn393V394MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,3460,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1052,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,244.3394219653179,931.6240431945448,0.8740392555421458,0.7763533693287873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +395,GatedSelfAttention-Attn393-FlashAttention-395,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn393FlashAttention395FlashAttention,MXU,1,Compute,19810,19810,4101,0,0,0,0,0,0,0,0,19810,12860,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn393FlashAttention395FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 128]",,879104,1296,8,1032,1032,80,7924,19810,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5238,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.483438263503281,248.4078481906234,0.05538661237803086,0.20700654015885284,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+396,GatedSelfAttention-Attn393-Attention_output-396,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393Attentionoutput396MatMulattnOutputattnAvgWo,MXU,1,Compute,3460,3460,2687,0,0,0,0,0,0,0,0,3460,857,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn393Attentionoutput396MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,225,1,1032,640,640,0,3460,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1052,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,244.3394219653179,931.6240431945448,0.8740392555421458,0.7763533693287873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +397,GatedSelfAttention-Attn393-Attention_layernorm-397,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393Attentionlayernorm397YnormLayerNormy,VPU,1,Memory,2051,1229,2051,0,0,0,0,0,0,0,0,0,1229,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn393Attentionlayernorm397YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1229,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,450,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5762262311067774,1199.6488231731473,0.5990109354321934,0.9997073526442894,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+398,GatedSelfAttention-FFN393Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN393FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,12221,12221,7630,0,0,0,0,0,0,0,0,12221,3047,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN393FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,12221,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.5637181900008,749.1427409786434,0.9821561576737096,0.6242856174822028,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +399,GatedSelfAttention-FFN393Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN393FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,12221,12221,7630,0,0,0,0,0,0,0,0,12221,3047,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN393FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,12221,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.5637181900008,749.1427409786434,0.9821561576737096,0.6242856174822028,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+400,GatedSelfAttention-FFN393Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN393FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2035,153,2035,0,0,0,0,0,0,0,0,0,153,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN393FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,153,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32204422604422606,1199.708230958231,0.07488007488007489,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +401,BasicTransformerBlock-Fuser_output_layernorm401,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm401XnormLayerNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm401XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +402,CrossAttention402-Q-402,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention402Q402MatMulQ,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention402Q402MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +402,CrossAttention402-K-402,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention402K402MatMulK,MXU,1,Memory,1882,1860,1882,0,0,0,0,0,0,0,0,1860,457,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention402K402MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,1860,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,267.4370244420829,1199.9472801540915,0.9566628907755369,0.9999560667950762,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +402,CrossAttention402-V-402,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention402V402MatMulV,MXU,1,Memory,1882,1860,1882,0,0,0,0,0,0,0,0,1860,457,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention402V402MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,1860,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,267.4370244420829,1199.9472801540915,0.9566628907755369,0.9999560667950762,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +403,CrossAttention402-FlashAttention-403,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention402FlashAttention403FlashAttention,MXU,1,Compute,7864,7864,3052,0,0,0,0,0,0,0,0,7864,5850,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention402FlashAttention403FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,446464,512,8,512,1024,80,3900,7864,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2178,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,19.200781281790437,465.68023588504576,0.06868411344504936,0.38806686323753814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +404,CrossAttention402-Attention_output-404,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention402Attentionoutput404MatMulattnOutputattnAvgWo,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention402Attentionoutput404MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+405,CrossAttention402-Attention_layernorm-405,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention402Attentionlayernorm405YnormLayerNormy,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention402Attentionlayernorm405YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +406,BasicTransformerBlock-Attn_output_layernorm406,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm406XnormLayerNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm406XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +407,BasicTransformerBlock-FFN407Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN407FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,12221,12221,7630,0,0,0,0,0,0,0,0,12221,3047,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN407FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 
2560]]",1,9830400,800,1,1024,2560,640,0,12221,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.5637181900008,749.1427409786434,0.9821561576737096,0.6242856174822028,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +408,BasicTransformerBlock-FFN407Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN407FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,12221,12221,7630,0,0,0,0,0,0,0,0,12221,3047,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN407FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,12221,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.5637181900008,749.1427409786434,0.9821561576737096,0.6242856174822028,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +409,BasicTransformerBlock-FFN407Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN407FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2035,153,2035,0,0,0,0,0,0,0,0,0,153,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN407FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,153,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32204422604422606,1199.708230958231,0.07488007488007489,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+410,SpatialTransformer-Proj_out410,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout410einsum,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout410einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +411,Time-Embed-MLP-Einsum411,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum411einsum,MXU,1,Memory,1275,793,1275,0,0,0,0,0,0,0,0,793,190,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum411einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1314048,50,1,1,640,1280,0,793,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,287,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2850196078431373,1199.5726940678614,0.004596710479063421,0.9996439117232179,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+412,Conv2d-GroupNorm412,"GroupNorm(x=1x960x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm412XnormGroupNormX,VPU,1,Memory,3052,1829,3052,0,0,0,0,0,0,0,0,0,1829,0,0,3932160,"DT_BFLOAT16:[1,960,32,32]","[DT_BFLOAT16:(1,960,32,32)]",7864320,Conv2dGroupNorm412XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1829,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,670,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.576775884665793,1199.9047755570118,0.5991387380640331,0.9999206462975099,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +413,Conv2d412Conv2d,"Conv2D(a=1x960x32x32,b=960x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d412Conv2dconv2d,MXU,1,Compute,41479,41479,11127,0,0,0,0,0,0,0,0,41479,10361,0,0,14336000,"DT_BFLOAT16:[1,960,32,32],DT_BFLOAT16:[960,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",11324620800,Conv2d412Conv2dconv2d,Conv2D,11059200,[],Conv2D,bf01;io01->bf01,"[[1, 960, 32, 32], [960, 640, 3, 3], [1, 640, 32, 32]]",1,14589440,2720,1,640,1024,8640,0,41479,14336000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11145,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,273.02058390993034,321.8843373680055,0.9766361317748768,0.26823694780667123,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +414,Conv2d-GroupNorm414,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm414XnormGroupNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm414XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+415,Conv2d414Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d414Conv2dconv2d,MXU,1,Compute,27460,27460,7757,0,0,0,0,0,0,0,0,27460,6857,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d414Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,1800,1,640,1024,5760,0,27460,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7406,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.93616897305174,338.9607184313547,0.9834884707426589,0.2824672653594622,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +416,SkipConnection-Einsum411,"XlaEinsum(a=1x32x32x960,b=960x640,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum411einsum,MXU,1,Compute,4907,4907,3497,0,0,0,0,0,0,0,0,4907,1219,0,0,4505600,"DT_BFLOAT16:[1,32,32,960],DT_BFLOAT16:[960,640]","[DT_BFLOAT16:(1,32,32,640)]",1258291200,SkipConnectionEinsum411einsum,Einsum,1228800,[],Einsum,"BHWC,CO->BHWO","[[1, 32, 32, 960], [960, 640], [1, 32, 32, 640]]",1,4505600,320,1,1024,640,960,0,4907,4505600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1470,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,256.4277970246587,855.13898353118,0.9172812107395358,0.7126158196093166,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+417,SpatialTransformer-Input_GroupNorm417,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm417XnormGroupNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm417XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +418,SpatialTransformer-Proj_in418,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin418einsum,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin418einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+419,BasicTransformerBlock-Input_layernorm419,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm419XnormLayerNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm419XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +420,SelfAttention420-Q-420,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention420Q420MatMulQ,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention420Q420MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +420,SelfAttention420-K-420,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention420K420MatMulK,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention420K420MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +420,SelfAttention420-V-420,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention420V420MatMulV,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention420V420MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +421,SelfAttention420-FlashAttention-421,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention420FlashAttention421FlashAttention,MXU,1,Compute,15666,15666,4070,0,0,0,0,0,0,0,0,15666,11701,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention420FlashAttention421FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 128]",,872448,1024,8,1024,1024,80,7801,15666,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,19.276770585982383,311.68214604876806,0.06895593873763159,0.2597351217073067,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+422,SelfAttention420-Attention_output-422,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention420Attentionoutput422MatMulattnOutputattnAvgWo,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention420Attentionoutput422MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +423,SelfAttention420-Attention_layernorm-423,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention420Attentionlayernorm423YnormLayerNormy,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention420Attentionlayernorm423YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +424,GatedSelfAttention-Linear424,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear424XLinearcontext,MXU,1,Memory,781,488,781,0,0,0,0,0,0,0,0,488,114,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear424XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 
640]]",1,1005568,30,1,8,640,768,0,488,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,176,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.06955185659411,1199.1141852892927,0.03602031771045856,0.9992618210744105,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +425,GatedSelfAttention-Attn424-Q-425,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424Q425MatMulQ,MXU,1,Compute,3460,3460,2687,0,0,0,0,0,0,0,0,3460,857,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn424Q425MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,3460,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1052,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,244.3394219653179,931.6240431945448,0.8740392555421458,0.7763533693287873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +425,GatedSelfAttention-Attn424-K-425,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424K425MatMulK,MXU,1,Compute,3460,3460,2687,0,0,0,0,0,0,0,0,3460,857,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn424K425MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,3460,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1052,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,244.3394219653179,931.6240431945448,0.8740392555421458,0.7763533693287873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+425,GatedSelfAttention-Attn424-V-425,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424V425MatMulV,MXU,1,Compute,3460,3460,2687,0,0,0,0,0,0,0,0,3460,857,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn424V425MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,3460,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1052,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,244.3394219653179,931.6240431945448,0.8740392555421458,0.7763533693287873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +426,GatedSelfAttention-Attn424-FlashAttention-426,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn424FlashAttention426FlashAttention,MXU,1,Compute,19810,19810,4101,0,0,0,0,0,0,0,0,19810,12860,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn424FlashAttention426FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 128]",,879104,1296,8,1032,1032,80,7924,19810,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5238,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.483438263503281,248.4078481906234,0.05538661237803086,0.20700654015885284,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+427,GatedSelfAttention-Attn424-Attention_output-427,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424Attentionoutput427MatMulattnOutputattnAvgWo,MXU,1,Compute,3460,3460,2687,0,0,0,0,0,0,0,0,3460,857,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn424Attentionoutput427MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,225,1,1032,640,640,0,3460,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1052,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,244.3394219653179,931.6240431945448,0.8740392555421458,0.7763533693287873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +428,GatedSelfAttention-Attn424-Attention_layernorm-428,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424Attentionlayernorm428YnormLayerNormy,VPU,1,Memory,2051,1229,2051,0,0,0,0,0,0,0,0,0,1229,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn424Attentionlayernorm428YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1229,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,450,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5762262311067774,1199.6488231731473,0.5990109354321934,0.9997073526442894,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+429,GatedSelfAttention-FFN424Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN424FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,12221,12221,7630,0,0,0,0,0,0,0,0,12221,3047,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN424FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,12221,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.5637181900008,749.1427409786434,0.9821561576737096,0.6242856174822028,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +430,GatedSelfAttention-FFN424Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN424FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,12221,12221,7630,0,0,0,0,0,0,0,0,12221,3047,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN424FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,12221,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.5637181900008,749.1427409786434,0.9821561576737096,0.6242856174822028,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+431,GatedSelfAttention-FFN424Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN424FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2035,153,2035,0,0,0,0,0,0,0,0,0,153,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN424FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,153,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32204422604422606,1199.708230958231,0.07488007488007489,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +432,BasicTransformerBlock-Fuser_output_layernorm432,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm432XnormLayerNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm432XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +433,CrossAttention433-Q-433,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention433Q433MatMulQ,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention433Q433MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +433,CrossAttention433-K-433,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention433K433MatMulK,MXU,1,Memory,1882,1860,1882,0,0,0,0,0,0,0,0,1860,457,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention433K433MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,1860,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,267.4370244420829,1199.9472801540915,0.9566628907755369,0.9999560667950762,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +433,CrossAttention433-V-433,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention433V433MatMulV,MXU,1,Memory,1882,1860,1882,0,0,0,0,0,0,0,0,1860,457,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention433V433MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,1860,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,267.4370244420829,1199.9472801540915,0.9566628907755369,0.9999560667950762,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +434,CrossAttention433-FlashAttention-434,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention433FlashAttention434FlashAttention,MXU,1,Compute,7864,7864,3052,0,0,0,0,0,0,0,0,7864,5850,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention433FlashAttention434FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,446464,512,8,512,1024,80,3900,7864,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2178,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,19.200781281790437,465.68023588504576,0.06868411344504936,0.38806686323753814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +435,CrossAttention433-Attention_output-435,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention433Attentionoutput435MatMulattnOutputattnAvgWo,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention433Attentionoutput435MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+436,CrossAttention433-Attention_layernorm-436,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention433Attentionlayernorm436YnormLayerNormy,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention433Attentionlayernorm436YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +437,BasicTransformerBlock-Attn_output_layernorm437,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm437XnormLayerNormX,VPU,1,Memory,2035,1220,2035,0,0,0,0,0,0,0,0,0,1220,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm437XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1220,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +438,BasicTransformerBlock-FFN438Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN438FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,12221,12221,7630,0,0,0,0,0,0,0,0,12221,3047,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN438FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 
2560]]",1,9830400,800,1,1024,2560,640,0,12221,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.5637181900008,749.1427409786434,0.9821561576737096,0.6242856174822028,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +439,BasicTransformerBlock-FFN438Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN438FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,12221,12221,7630,0,0,0,0,0,0,0,0,12221,3047,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN438FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,12221,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.5637181900008,749.1427409786434,0.9821561576737096,0.6242856174822028,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +440,BasicTransformerBlock-FFN438Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN438FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,2035,153,2035,0,0,0,0,0,0,0,0,0,153,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN438FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,153,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32204422604422606,1199.708230958231,0.07488007488007489,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+441,SpatialTransformer-Proj_out441,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout441einsum,MXU,1,Compute,3079,3079,2671,0,0,0,0,0,0,0,0,3079,761,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout441einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,3079,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,956,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,272.4458590451445,1040.7098743504384,0.9745802535669377,0.8672582286253654,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +442,Upsample442,"Upsample(a=1x640x32x32,scale_factor=2,memory_placements=0_0_0,type=DT_BFLOAT16)",Upsample442Upsample,VPU,1,Memory,5087,0,5087,0,0,0,0,0,0,0,0,0,0,0,0,6553600,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,64,64)]",0,Upsample442Upsample,Upsample,0,[],Upsample,,,,,0,,,,,0,0,6553600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,354,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,1199.826149990171,0.0,0.9998551249918092,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +442,Upsample-Conv2d442Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",UpsampleConv2d442Conv2dconv2d,MXU,1,Compute,27460,27460,7757,0,0,0,0,0,0,0,0,27460,6857,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,UpsampleConv2d442Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 
32]]",1,10163200,1800,1,640,1024,5760,0,27460,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7406,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,274.93616897305174,338.9607184313547,0.9834884707426589,0.2824672653594622,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +443,Time-Embed-MLP-Einsum443,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum443einsum,MXU,1,Memory,639,488,639,0,0,0,0,0,0,0,0,488,114,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum443einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 320]]",1,658048,30,1,1,320,1280,0,488,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.282003129890454,1198.6223558118459,0.004585920078877826,0.9988519631765382,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +444,Conv2d-GroupNorm444,"GroupNorm(x=1x960x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm444XnormGroupNormX,VPU,1,Memory,12208,7315,12208,0,0,0,0,0,0,0,0,0,7315,0,0,15728640,"DT_BFLOAT16:[1,960,64,64]","[DT_BFLOAT16:(1,960,64,64)]",31457280,Conv2dGroupNorm444XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,7315,15728640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2680,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.576775884665793,1199.9047755570118,0.5991387380640331,0.9999206462975099,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +445,Conv2d444Conv2d,"Conv2D(a=1x960x64x64,b=960x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d444Conv2dconv2d,MXU,1,Compute,99505,99505,12430,0,0,0,0,0,0,0,0,99505,24868,0,0,16015360,"DT_BFLOAT16:[1,960,64,64],DT_BFLOAT16:[960,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",22649241600,Conv2d444Conv2dconv2d,Conv2D,5529600,[],Conv2D,bf01;io01->bf01,"[[1, 960, 64, 64], [960, 320, 3, 3], [1, 320, 64, 64]]",1,16514560,6528,1,320,4096,8640,0,99505,16015360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,25743,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,227.6191306969499,149.89665151091654,0.8142282319459346,0.12491387625909711,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +446,Conv2d-GroupNorm446,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm446XnormGroupNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm446XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +447,Conv2d446Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d446Conv2dconv2d,MXU,1,Compute,33677,33677,5500,0,0,0,0,0,0,0,0,33677,8411,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d446Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 
64]]",1,7252480,2208,1,320,4096,2880,0,33677,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8802,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,224.18110876859578,195.96241558129435,0.8019299048785049,0.16330201298441197,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +448,SkipConnection-Einsum443,"XlaEinsum(a=1x64x64x960,b=960x320,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum443einsum,MXU,1,Compute,11734,11734,8615,0,0,0,0,0,0,0,0,11734,2925,0,0,11100160,"DT_BFLOAT16:[1,64,64,960],DT_BFLOAT16:[960,320]","[DT_BFLOAT16:(1,64,64,320)]",2516582400,SkipConnectionEinsum443einsum,Einsum,614400,[],Einsum,"BHWC,CO->BHWO","[[1, 64, 64, 960], [960, 320], [1, 64, 64, 320]]",1,11100160,768,1,4096,320,960,0,11734,11100160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3534,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,214.46926879154594,881.0149641932632,0.7671891769386231,0.7341791368277193,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +449,SpatialTransformer-Input_GroupNorm449,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm449XnormGroupNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm449XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+450,SpatialTransformer-Proj_in450,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin450einsum,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin450einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +451,BasicTransformerBlock-Input_layernorm451,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm451XnormLayerNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm451XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +452,SelfAttention452-Q-452,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention452Q452MatMulQ,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention452Q452MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +452,SelfAttention452-K-452,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention452K452MatMulK,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention452K452MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +452,SelfAttention452-V-452,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention452V452MatMulV,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention452V452MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +453,SelfAttention452-FlashAttention-453,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",SelfAttention452FlashAttention453FlashAttention,MXU,1,Compute,249722,249722,8139,0,0,0,0,0,0,0,0,249722,187244,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention452FlashAttention453FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,2762752,16384,8,4096,4096,40,124830,249722,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,62998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,19.348868774076774,39.10598585627218,0.0692138449164262,0.03258832154689348,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +454,SelfAttention452-Attention_output-454,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention452Attentionoutput454MatMulattnOutputattnAvgWo,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention452Attentionoutput454MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+455,SelfAttention452-Attention_layernorm-455,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention452Attentionlayernorm455YnormLayerNormy,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention452Attentionlayernorm455YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +456,GatedSelfAttention-Linear456,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear456XLinearcontext,MXU,1,Memory,500,305,500,0,0,0,0,0,0,0,0,305,68,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear456XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 320]]",1,508928,18,1,8,320,768,0,305,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,111,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.86432,947.9522705078125,0.028131868131868135,0.7899602254231771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +457,GatedSelfAttention-Attn456-Q-457,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456Q457MatMulQ,MXU,1,Compute,4557,4557,4236,0,0,0,0,0,0,0,0,4557,1131,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn456Q457MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 
40]]",1,5457920,297,1,4104,320,320,0,4557,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1434,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,184.44134298880843,1115.4452724260068,0.6597747216575394,0.9295377270216724,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +457,GatedSelfAttention-Attn456-K-457,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456K457MatMulK,MXU,1,Compute,4557,4557,4236,0,0,0,0,0,0,0,0,4557,1131,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn456K457MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,4557,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1434,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,184.44134298880843,1115.4452724260068,0.6597747216575394,0.9295377270216724,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +457,GatedSelfAttention-Attn456-V-457,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456V457MatMulV,MXU,1,Compute,4557,4557,4236,0,0,0,0,0,0,0,0,4557,1131,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn456V457MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,4557,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1434,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,184.44134298880843,1115.4452724260068,0.6597747216575394,0.9295377270216724,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +458,GatedSelfAttention-Attn456-FlashAttention-458,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",GatedSelfAttentionAttn456FlashAttention458FlashAttention,MXU,1,Compute,265570,265570,8154,0,0,0,0,0,0,0,0,265570,191695,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn456FlashAttention458FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 128]",,2768128,17424,8,4104,4104,40,125319,265570,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66961,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,18.265357562977748,36.84414085298838,0.06533796060474527,0.03070345071082365,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +459,GatedSelfAttention-Attn456-Attention_output-459,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456Attentionoutput459MatMulattnOutputattnAvgWo,MXU,1,Compute,4557,4557,4236,0,0,0,0,0,0,0,0,4557,1131,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn456Attentionoutput459MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,297,1,4104,320,320,0,4557,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1434,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,184.44134298880843,1115.4452724260068,0.6597747216575394,0.9295377270216724,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+460,GatedSelfAttention-Attn456-Attention_layernorm-460,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456Attentionlayernorm460YnormLayerNormy,VPU,1,Memory,4077,2443,4077,0,0,0,0,0,0,0,0,0,2443,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn456Attentionlayernorm460YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2443,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,895,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.576953642384106,1199.9875504449503,0.5991800693787449,0.999989625370792,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +461,GatedSelfAttention-FFN456Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN456FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,14660,14660,10809,0,0,0,0,0,0,0,0,14660,3657,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN456FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,14660,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.88425648021828,884.7183289989769,0.8187537791903414,0.737265274165814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +462,GatedSelfAttention-FFN456Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN456FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,14660,14660,10809,0,0,0,0,0,0,0,0,14660,3657,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN456FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 
4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,14660,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.88425648021828,884.7183289989769,0.8187537791903414,0.737265274165814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +463,GatedSelfAttention-FFN456Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN456FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,4070,305,4070,0,0,0,0,0,0,0,0,0,305,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN456FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,305,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,360,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32204422604422606,1199.708230958231,0.07488007488007489,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +464,BasicTransformerBlock-Fuser_output_layernorm464,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm464XnormLayerNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm464XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+465,CrossAttention465-Q-465,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention465Q465MatMulQ,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention465Q465MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +465,CrossAttention465-K-465,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention465K465MatMulK,MXU,1,Memory,1247,1128,1247,0,0,0,0,0,0,0,0,1128,274,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention465K465MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,1128,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,201.81093825180434,1199.1670634522854,0.7219084043462554,0.9993058862102379,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +465,CrossAttention465-V-465,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention465V465MatMulV,MXU,1,Memory,1247,1128,1247,0,0,0,0,0,0,0,0,1128,274,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention465V465MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 
40]]",1,1605632,72,1,512,320,768,0,1128,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,201.81093825180434,1199.1670634522854,0.7219084043462554,0.9993058862102379,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +466,CrossAttention465-FlashAttention-466,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention465FlashAttention466FlashAttention,MXU,1,Compute,31270,31270,4578,0,0,0,0,0,0,0,0,31270,23403,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention465FlashAttention466FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,354304,2048,8,512,4096,40,15603,31270,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,19.31499123760793,175.66882195394948,0.06909265981859523,0.14639068496162455,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +467,CrossAttention465-Attention_output-467,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention465Attentionoutput467MatMulattnOutputattnAvgWo,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention465Attentionoutput467MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+468,CrossAttention465-Attention_layernorm-468,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention465Attentionlayernorm468YnormLayerNormy,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention465Attentionlayernorm468YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +469,BasicTransformerBlock-Attn_output_layernorm469,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm469XnormLayerNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm469XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +470,BasicTransformerBlock-FFN470Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN470FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,14660,14660,10809,0,0,0,0,0,0,0,0,14660,3657,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN470FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 
1280]]",1,13926400,960,1,4096,1280,320,0,14660,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.88425648021828,884.7183289989769,0.8187537791903414,0.737265274165814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +471,BasicTransformerBlock-FFN470Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN470FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,14660,14660,10809,0,0,0,0,0,0,0,0,14660,3657,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN470FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,14660,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.88425648021828,884.7183289989769,0.8187537791903414,0.737265274165814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +472,BasicTransformerBlock-FFN470Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN470FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,4070,305,4070,0,0,0,0,0,0,0,0,0,305,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN470FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,305,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,360,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32204422604422606,1199.708230958231,0.07488007488007489,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+473,SpatialTransformer-Proj_out473,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout473einsum,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout473einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +474,Time-Embed-MLP-Einsum474,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum474einsum,MXU,1,Memory,639,488,639,0,0,0,0,0,0,0,0,488,114,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum474einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 320]]",1,658048,30,1,1,320,1280,0,488,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.282003129890454,1198.6223558118459,0.004585920078877826,0.9988519631765382,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+475,Conv2d-GroupNorm475,"GroupNorm(x=1x640x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm475XnormGroupNormX,VPU,1,Memory,8139,4877,8139,0,0,0,0,0,0,0,0,0,4877,0,0,10485760,"DT_BFLOAT16:[1,640,64,64]","[DT_BFLOAT16:(1,640,64,64)]",20971520,Conv2dGroupNorm475XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,4877,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1786,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5766703526231725,1199.855633370193,0.5991142002937064,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +476,Conv2d475Conv2d,"Conv2D(a=1x640x64x64,b=640x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d475Conv2dconv2d,MXU,1,Compute,65860,65860,8965,0,0,0,0,0,0,0,0,65860,16457,0,0,11550720,"DT_BFLOAT16:[1,640,64,64],DT_BFLOAT16:[640,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",15099494400,Conv2d475Conv2dconv2d,Conv2D,3686400,[],Conv2D,bf01;io01->bf01,"[[1, 640, 64, 64], [640, 320, 3, 3], [1, 320, 64, 64]]",1,11883520,4320,1,320,4096,5760,0,65860,11550720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17090,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,229.26654114788946,163.33808516645158,0.8201212695594718,0.13611507097204298,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+477,Conv2d-GroupNorm477,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm477XnormGroupNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm477XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +478,Conv2d477Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d477Conv2dconv2d,MXU,1,Compute,33677,33677,5500,0,0,0,0,0,0,0,0,33677,8411,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d477Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,2208,1,320,4096,2880,0,33677,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8802,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,224.18110876859578,195.96241558129435,0.8019299048785049,0.16330201298441197,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +479,SkipConnection-Einsum474,"XlaEinsum(a=1x64x64x640,b=640x320,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum474einsum,MXU,1,Compute,7345,7345,6422,0,0,0,0,0,0,0,0,7345,1828,0,0,8273920,"DT_BFLOAT16:[1,64,64,640],DT_BFLOAT16:[640,320]","[DT_BFLOAT16:(1,64,64,320)]",1677721600,SkipConnectionEinsum474einsum,Einsum,409600,[],Einsum,"BHWC,CO->BHWO","[[1, 64, 64, 640], [640, 320], [1, 64, 64, 
320]]",1,8273920,480,1,4096,320,640,0,7345,8273920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2284,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.41682777399592,1049.106668013955,0.8170817156521719,0.8742555566782958,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +480,SpatialTransformer-Input_GroupNorm480,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm480XnormGroupNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm480XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +481,SpatialTransformer-Proj_in481,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin481einsum,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin481einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+482,BasicTransformerBlock-Input_layernorm482,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm482XnormLayerNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm482XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +483,SelfAttention483-Q-483,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention483Q483MatMulQ,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention483Q483MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +483,SelfAttention483-K-483,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention483K483MatMulK,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention483K483MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +483,SelfAttention483-V-483,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention483V483MatMulV,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention483V483MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +484,SelfAttention483-FlashAttention-484,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention483FlashAttention484FlashAttention,MXU,1,Compute,249722,249722,8139,0,0,0,0,0,0,0,0,249722,187244,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention483FlashAttention484FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,2762752,16384,8,4096,4096,40,124830,249722,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,62998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,19.348868774076774,39.10598585627218,0.0692138449164262,0.03258832154689348,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+485,SelfAttention483-Attention_output-485,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention483Attentionoutput485MatMulattnOutputattnAvgWo,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention483Attentionoutput485MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +486,SelfAttention483-Attention_layernorm-486,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention483Attentionlayernorm486YnormLayerNormy,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention483Attentionlayernorm486YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +487,GatedSelfAttention-Linear487,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear487XLinearcontext,MXU,1,Memory,500,305,500,0,0,0,0,0,0,0,0,305,68,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear487XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 
320]]",1,508928,18,1,8,320,768,0,305,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,111,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.86432,947.9522705078125,0.028131868131868135,0.7899602254231771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +488,GatedSelfAttention-Attn487-Q-488,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487Q488MatMulQ,MXU,1,Compute,4557,4557,4236,0,0,0,0,0,0,0,0,4557,1131,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn487Q488MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,4557,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1434,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,184.44134298880843,1115.4452724260068,0.6597747216575394,0.9295377270216724,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +488,GatedSelfAttention-Attn487-K-488,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487K488MatMulK,MXU,1,Compute,4557,4557,4236,0,0,0,0,0,0,0,0,4557,1131,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn487K488MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,4557,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1434,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,184.44134298880843,1115.4452724260068,0.6597747216575394,0.9295377270216724,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+488,GatedSelfAttention-Attn487-V-488,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487V488MatMulV,MXU,1,Compute,4557,4557,4236,0,0,0,0,0,0,0,0,4557,1131,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn487V488MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,4557,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1434,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,184.44134298880843,1115.4452724260068,0.6597747216575394,0.9295377270216724,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +489,GatedSelfAttention-Attn487-FlashAttention-489,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn487FlashAttention489FlashAttention,MXU,1,Compute,265570,265570,8154,0,0,0,0,0,0,0,0,265570,191695,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn487FlashAttention489FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 128]",,2768128,17424,8,4104,4104,40,125319,265570,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66961,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,18.265357562977748,36.84414085298838,0.06533796060474527,0.03070345071082365,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+490,GatedSelfAttention-Attn487-Attention_output-490,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487Attentionoutput490MatMulattnOutputattnAvgWo,MXU,1,Compute,4557,4557,4236,0,0,0,0,0,0,0,0,4557,1131,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn487Attentionoutput490MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,297,1,4104,320,320,0,4557,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1434,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,184.44134298880843,1115.4452724260068,0.6597747216575394,0.9295377270216724,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +491,GatedSelfAttention-Attn487-Attention_layernorm-491,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487Attentionlayernorm491YnormLayerNormy,VPU,1,Memory,4077,2443,4077,0,0,0,0,0,0,0,0,0,2443,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn487Attentionlayernorm491YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2443,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,895,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.576953642384106,1199.9875504449503,0.5991800693787449,0.999989625370792,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+492,GatedSelfAttention-FFN487Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN487FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,14660,14660,10809,0,0,0,0,0,0,0,0,14660,3657,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN487FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,14660,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.88425648021828,884.7183289989769,0.8187537791903414,0.737265274165814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +493,GatedSelfAttention-FFN487Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN487FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,14660,14660,10809,0,0,0,0,0,0,0,0,14660,3657,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN487FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,14660,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.88425648021828,884.7183289989769,0.8187537791903414,0.737265274165814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+494,GatedSelfAttention-FFN487Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN487FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,4070,305,4070,0,0,0,0,0,0,0,0,0,305,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN487FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,305,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,360,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32204422604422606,1199.708230958231,0.07488007488007489,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +495,BasicTransformerBlock-Fuser_output_layernorm495,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm495XnormLayerNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm495XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +496,CrossAttention496-Q-496,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention496Q496MatMulQ,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention496Q496MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +496,CrossAttention496-K-496,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention496K496MatMulK,MXU,1,Memory,1247,1128,1247,0,0,0,0,0,0,0,0,1128,274,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention496K496MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,1128,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,201.81093825180434,1199.1670634522854,0.7219084043462554,0.9993058862102379,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +496,CrossAttention496-V-496,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention496V496MatMulV,MXU,1,Memory,1247,1128,1247,0,0,0,0,0,0,0,0,1128,274,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention496V496MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,1128,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,201.81093825180434,1199.1670634522854,0.7219084043462554,0.9993058862102379,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +497,CrossAttention496-FlashAttention-497,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention496FlashAttention497FlashAttention,MXU,1,Compute,31270,31270,4578,0,0,0,0,0,0,0,0,31270,23403,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention496FlashAttention497FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,354304,2048,8,512,4096,40,15603,31270,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,19.31499123760793,175.66882195394948,0.06909265981859523,0.14639068496162455,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +498,CrossAttention496-Attention_output-498,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention496Attentionoutput498MatMulattnOutputattnAvgWo,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention496Attentionoutput498MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+499,CrossAttention496-Attention_layernorm-499,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention496Attentionlayernorm499YnormLayerNormy,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention496Attentionlayernorm499YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +500,BasicTransformerBlock-Attn_output_layernorm500,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm500XnormLayerNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm500XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +501,BasicTransformerBlock-FFN501Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN501FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,14660,14660,10809,0,0,0,0,0,0,0,0,14660,3657,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN501FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 
1280]]",1,13926400,960,1,4096,1280,320,0,14660,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.88425648021828,884.7183289989769,0.8187537791903414,0.737265274165814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +502,BasicTransformerBlock-FFN501Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN501FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,14660,14660,10809,0,0,0,0,0,0,0,0,14660,3657,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN501FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,14660,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.88425648021828,884.7183289989769,0.8187537791903414,0.737265274165814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +503,BasicTransformerBlock-FFN501Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN501FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,4070,305,4070,0,0,0,0,0,0,0,0,0,305,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN501FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,305,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,360,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32204422604422606,1199.708230958231,0.07488007488007489,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+504,SpatialTransformer-Proj_out504,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout504einsum,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout504einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +505,Time-Embed-MLP-Einsum505,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum505einsum,MXU,1,Memory,639,488,639,0,0,0,0,0,0,0,0,488,114,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum505einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 320]]",1,658048,30,1,1,320,1280,0,488,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.282003129890454,1198.6223558118459,0.004585920078877826,0.9988519631765382,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+506,Conv2d-GroupNorm506,"GroupNorm(x=1x640x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm506XnormGroupNormX,VPU,1,Memory,8139,4877,8139,0,0,0,0,0,0,0,0,0,4877,0,0,10485760,"DT_BFLOAT16:[1,640,64,64]","[DT_BFLOAT16:(1,640,64,64)]",20971520,Conv2dGroupNorm506XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,4877,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1786,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5766703526231725,1199.855633370193,0.5991142002937064,0.9998796944751608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +507,Conv2d506Conv2d,"Conv2D(a=1x640x64x64,b=640x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d506Conv2dconv2d,MXU,1,Compute,65860,65860,8965,0,0,0,0,0,0,0,0,65860,16457,0,0,11550720,"DT_BFLOAT16:[1,640,64,64],DT_BFLOAT16:[640,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",15099494400,Conv2d506Conv2dconv2d,Conv2D,3686400,[],Conv2D,bf01;io01->bf01,"[[1, 640, 64, 64], [640, 320, 3, 3], [1, 320, 64, 64]]",1,11883520,4320,1,320,4096,5760,0,65860,11550720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17090,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,229.26654114788946,163.33808516645158,0.8201212695594718,0.13611507097204298,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+508,Conv2d-GroupNorm508,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm508XnormGroupNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm508XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +509,Conv2d508Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d508Conv2dconv2d,MXU,1,Compute,33677,33677,5500,0,0,0,0,0,0,0,0,33677,8411,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d508Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,2208,1,320,4096,2880,0,33677,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8802,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,224.18110876859578,195.96241558129435,0.8019299048785049,0.16330201298441197,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +510,SkipConnection-Einsum505,"XlaEinsum(a=1x64x64x640,b=640x320,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum505einsum,MXU,1,Compute,7345,7345,6422,0,0,0,0,0,0,0,0,7345,1828,0,0,8273920,"DT_BFLOAT16:[1,64,64,640],DT_BFLOAT16:[640,320]","[DT_BFLOAT16:(1,64,64,320)]",1677721600,SkipConnectionEinsum505einsum,Einsum,409600,[],Einsum,"BHWC,CO->BHWO","[[1, 64, 64, 640], [640, 320], [1, 64, 64, 
320]]",1,8273920,480,1,4096,320,640,0,7345,8273920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2284,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.41682777399592,1049.106668013955,0.8170817156521719,0.8742555566782958,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +511,SpatialTransformer-Input_GroupNorm511,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm511XnormGroupNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm511XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +512,SpatialTransformer-Proj_in512,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin512einsum,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin512einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+513,BasicTransformerBlock-Input_layernorm513,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm513XnormLayerNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm513XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +514,SelfAttention514-Q-514,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention514Q514MatMulQ,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention514Q514MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +514,SelfAttention514-K-514,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention514K514MatMulK,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention514K514MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +514,SelfAttention514-V-514,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention514V514MatMulV,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention514V514MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +515,SelfAttention514-FlashAttention-515,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention514FlashAttention515FlashAttention,MXU,1,Compute,249722,249722,8139,0,0,0,0,0,0,0,0,249722,187244,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention514FlashAttention515FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,2762752,16384,8,4096,4096,40,124830,249722,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,62998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,19.348868774076774,39.10598585627218,0.0692138449164262,0.03258832154689348,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+516,SelfAttention514-Attention_output-516,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention514Attentionoutput516MatMulattnOutputattnAvgWo,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention514Attentionoutput516MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +517,SelfAttention514-Attention_layernorm-517,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention514Attentionlayernorm517YnormLayerNormy,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention514Attentionlayernorm517YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +518,GatedSelfAttention-Linear518,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear518XLinearcontext,MXU,1,Memory,500,305,500,0,0,0,0,0,0,0,0,305,68,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear518XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 
320]]",1,508928,18,1,8,320,768,0,305,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,111,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.86432,947.9522705078125,0.028131868131868135,0.7899602254231771,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +519,GatedSelfAttention-Attn518-Q-519,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518Q519MatMulQ,MXU,1,Compute,4557,4557,4236,0,0,0,0,0,0,0,0,4557,1131,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn518Q519MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,4557,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1434,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,184.44134298880843,1115.4452724260068,0.6597747216575394,0.9295377270216724,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +519,GatedSelfAttention-Attn518-K-519,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518K519MatMulK,MXU,1,Compute,4557,4557,4236,0,0,0,0,0,0,0,0,4557,1131,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn518K519MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,4557,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1434,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,184.44134298880843,1115.4452724260068,0.6597747216575394,0.9295377270216724,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+519,GatedSelfAttention-Attn518-V-519,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518V519MatMulV,MXU,1,Compute,4557,4557,4236,0,0,0,0,0,0,0,0,4557,1131,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn518V519MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,4557,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1434,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,184.44134298880843,1115.4452724260068,0.6597747216575394,0.9295377270216724,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +520,GatedSelfAttention-Attn518-FlashAttention-520,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn518FlashAttention520FlashAttention,MXU,1,Compute,265570,265570,8154,0,0,0,0,0,0,0,0,265570,191695,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn518FlashAttention520FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 128]",,2768128,17424,8,4104,4104,40,125319,265570,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66961,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,18.265357562977748,36.84414085298838,0.06533796060474527,0.03070345071082365,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+521,GatedSelfAttention-Attn518-Attention_output-521,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518Attentionoutput521MatMulattnOutputattnAvgWo,MXU,1,Compute,4557,4557,4236,0,0,0,0,0,0,0,0,4557,1131,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn518Attentionoutput521MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,297,1,4104,320,320,0,4557,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1434,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,184.44134298880843,1115.4452724260068,0.6597747216575394,0.9295377270216724,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +522,GatedSelfAttention-Attn518-Attention_layernorm-522,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518Attentionlayernorm522YnormLayerNormy,VPU,1,Memory,4077,2443,4077,0,0,0,0,0,0,0,0,0,2443,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn518Attentionlayernorm522YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2443,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,895,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.576953642384106,1199.9875504449503,0.5991800693787449,0.999989625370792,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+523,GatedSelfAttention-FFN518Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN518FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,14660,14660,10809,0,0,0,0,0,0,0,0,14660,3657,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN518FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,14660,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.88425648021828,884.7183289989769,0.8187537791903414,0.737265274165814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +524,GatedSelfAttention-FFN518Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN518FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,14660,14660,10809,0,0,0,0,0,0,0,0,14660,3657,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN518FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,14660,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.88425648021828,884.7183289989769,0.8187537791903414,0.737265274165814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+525,GatedSelfAttention-FFN518Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN518FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,4070,305,4070,0,0,0,0,0,0,0,0,0,305,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN518FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,305,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,360,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32204422604422606,1199.708230958231,0.07488007488007489,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +526,BasicTransformerBlock-Fuser_output_layernorm526,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm526XnormLayerNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm526XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +527,CrossAttention527-Q-527,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention527Q527MatMulQ,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention527Q527MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +527,CrossAttention527-K-527,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention527K527MatMulK,MXU,1,Memory,1247,1128,1247,0,0,0,0,0,0,0,0,1128,274,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention527K527MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,1128,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,201.81093825180434,1199.1670634522854,0.7219084043462554,0.9993058862102379,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +527,CrossAttention527-V-527,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention527V527MatMulV,MXU,1,Memory,1247,1128,1247,0,0,0,0,0,0,0,0,1128,274,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention527V527MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,1128,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,201.81093825180434,1199.1670634522854,0.7219084043462554,0.9993058862102379,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +528,CrossAttention527-FlashAttention-528,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention527FlashAttention528FlashAttention,MXU,1,Compute,31270,31270,4578,0,0,0,0,0,0,0,0,31270,23403,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention527FlashAttention528FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,354304,2048,8,512,4096,40,15603,31270,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,19.31499123760793,175.66882195394948,0.06909265981859523,0.14639068496162455,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +529,CrossAttention527-Attention_output-529,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention527Attentionoutput529MatMulattnOutputattnAvgWo,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention527Attentionoutput529MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+530,CrossAttention527-Attention_layernorm-530,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention527Attentionlayernorm530YnormLayerNormy,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention527Attentionlayernorm530YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +531,BasicTransformerBlock-Attn_output_layernorm531,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm531XnormLayerNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm531XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +532,BasicTransformerBlock-FFN532Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN532FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,14660,14660,10809,0,0,0,0,0,0,0,0,14660,3657,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN532FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 
1280]]",1,13926400,960,1,4096,1280,320,0,14660,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.88425648021828,884.7183289989769,0.8187537791903414,0.737265274165814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +533,BasicTransformerBlock-FFN532Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN532FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,14660,14660,10809,0,0,0,0,0,0,0,0,14660,3657,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN532FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,14660,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,228.88425648021828,884.7183289989769,0.8187537791903414,0.737265274165814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +534,BasicTransformerBlock-FFN532Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN532FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,4070,305,4070,0,0,0,0,0,0,0,0,0,305,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN532FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,305,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,360,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32204422604422606,1199.708230958231,0.07488007488007489,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+535,SpatialTransformer-Proj_out535,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout535einsum,MXU,1,Compute,4420,4420,4228,0,0,0,0,0,0,0,0,4420,1097,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout535einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,4420,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,189.7875113122172,1147.8613944075225,0.6788987784462899,0.9565511620062688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +536,Out536-GroupNorm,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Out536GroupNormXnormGroupNormX,VPU,1,Memory,4070,2439,4070,0,0,0,0,0,0,0,0,0,2439,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Out536GroupNormXnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2439,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,893,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.5763538083538084,1199.708230958231,0.5990405990405991,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +537,Out537-Conv2d,"Conv2D(a=1x320x64x64,b=320x3x3x3,c=1x3x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Out537Conv2dconv2d,MXU,1,Compute,11246,11246,2067,0,0,0,0,0,0,0,0,11246,2803,0,0,2663296,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,3,3,3]","[DT_BFLOAT16:(1,3,64,64)]",70778880,Out537Conv2dconv2d,Conv2D,17280,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 3, 3, 3], [1, 3, 64, 
64]]",1,2829696,736,1,3,4096,2880,0,11246,2663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2955,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,6.293693757780544,220.55732595439315,0.022513499305247486,0.18379777162866096,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.json b/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.json new file mode 100644 index 0000000..1fd1ca2 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.json @@ -0,0 +1,184 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 6989769, + "overlapped_compute_time_non_pp_ns": 2612482, + "compute_only_time_non_pp_ns": 3796358, + "memory_only_time_non_pp_ns": 580929, + "ici_bound_time_non_pp_ns": 0, + "total_execution_time_chip_ns": 6989769, + "overlapped_compute_time_chip_ns": 2612482, + "compute_only_time_chip_ns": 3796358, + "memory_only_time_chip_ns": 580929, + "ici_bound_time_chip_ns": 0, + "bounded_by_pp_chip": false, + "throughput_requests_per_sec": 3.5766561098084932, + "throughput_step_per_sec_per_request": 143.0662443923397, + "latency_sec": 0.27959076, + "latency_step_sec": 0.000174744225, + "mem_footprint_GB": 31.999999046325684, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "4", + "num_sa": 8, + "num_vu": 4, + "num_vu_ports": 4, + "hbm_bw_GBps": 1200.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 128, + "freq_GHz": 1.05, + "sa_dim": 128, + "hbm_size_GB": 32, + "ici_bw_GBps": 112.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 300.0, + "min_power_W": 121.0, + "avg_power_W": 170.0, + "max_power_W": 192.0, + 
"HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 366.0, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 1.222, + "static_power_W_per_vu": 0.427282, + "static_power_vmem_W": 21.777552, + "static_power_ici_W": 5.499, + "static_power_hbm_mc_W": 4.006409544, + "static_power_hbm_phy_W": 6.009614316, + "static_power_other_W": 41.22229614, + "dynamic_power_W_per_SA": 16.91648, + "dynamic_power_W_per_VU": 1.591296, + "dynamic_power_vmem_W": 30.110208, + "dynamic_power_ici_W_per_GBps": 0.01767315271, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "gligen", + "model_type": "gligen", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 3, + "num_tensor_parallel_axes": 0, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv", + "num_diffusion_steps": 1, + "total_num_diffusion_steps": 40, + "image_resolution": [ + 512, + 512 + ], + "image_num_channels": 3, + "use_flash_attention": true, + "fourier_embedder_config": { + "num_freqs": 64 + }, + "text_embedder_config": { + "d_model": 512, + "num_heads": 8, + "d_head": 64, + "d_ff": 2048, + "num_layers": 12, + "ffn_type": "default" + }, + "image_embedder_config": { + "model_type": "vit", + "patch_size": 2, + "d_model": 1024, + "num_heads": 16, + "d_head": 64, + "d_ff": 4096, + "num_layers": 24, + "ffn_type": "default" + }, + "spatial_condition_embedder_config": { + "model_type": "convnext", + "stem": { + "in_channels": 3, + "out_channels": 96, + "kernel_size": 4, + "stride": 
4 + }, + "depths": [ + 3, + 3, + 9, + 3 + ], + "dims": [ + 96, + 192, + 384, + 768 + ] + }, + "grounding_input_config": { + "text": { + "input_seqlen": 512, + "feature_dim": 768 + }, + "bbox": { + "input_seqlen": 8, + "feature_dim": 4, + "grounding_token_feature_dim": 768 + }, + "image": { + "resolution": [ + 1024, + 1024 + ], + "image_num_channels": 3 + }, + "keypoint": { + "num_persons": 10, + "num_keypoints": 17, + "feature_dim": 256 + }, + "spatial_condition": { + "resolution": [ + 256, + 256 + ], + "num_channels": 1 + } + }, + "unet_config": { + "noisy_latent_resolution": [ + 64, + 64 + ], + "model_channels": 320, + "attention_resolutions": [ + 4, + 2, + 1 + ], + "num_res_blocks": 2, + "channel_mult": [ + 1, + 2, + 4, + 4 + ], + "num_heads": 8, + "context_dim": 768 + }, + "output_dir": "./llava_ops" + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv b/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv new file mode 100644 index 0000000..23ba996 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv @@ -0,0 +1,635 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS 
SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,Time-Embed-MLP-FFi2,"XlaEinsum(a=1x320,b=320x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPFFi2einsum,MXU,1,Memory,500,302,500,0,0,0,0,0,0,0,0,302,47,0,0,822400,"DT_BFLOAT16:[1,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,1280)]",819200,TimeEmbedMLPFFi2einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 320], [320, 1280], [1, 1280]]",1,822400,30,1,1,1280,320,0,302,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,83,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.6384,1531.839370727539,0.0035922766052986083,0.5540106223246073,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+3,Time-Embed-MLP-FFo2,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPFFo2einsum,MXU,1,Memory,1106,960,1106,0,0,0,0,0,0,0,0,960,157,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPFFo2einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,960,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9627486437613015,2763.5860615569904,0.006495979394753359,0.9994886298578627,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,Conv2d5Conv2d,"Conv2D(a=1x3x64x64,b=3x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d5Conv2dconv2d,MXU,1,Compute,923,923,898,0,0,0,0,0,0,0,0,923,150,0,0,2663296,"DT_BFLOAT16:[1,3,64,64],DT_BFLOAT16:[3,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",70778880,Conv2d5Conv2dconv2d,Conv2D,17280,[],Conv2D,bf01;io01->bf01,"[[1, 3, 64, 64], [3, 320, 3, 3], [1, 320, 64, 64]]",1,2664856,96,1,320,4096,27,0,923,2663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,213,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,76.68351029252437,2687.310604207048,0.1681325561743227,0.9719025693334713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,Time-Embed-MLP-Einsum6,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum6einsum,MXU,1,Memory,500,302,500,0,0,0,0,0,0,0,0,302,47,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum6einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 
320]]",1,658048,30,1,1,320,1280,0,302,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,83,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.6384,1531.839370727539,0.0035922766052986083,0.5540106223246073,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Conv2d-GroupNorm7,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm7XnormGroupNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm7XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Conv2d7Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d7Conv2dconv2d,MXU,1,Compute,20800,20800,2387,0,0,0,0,0,0,0,0,20800,3463,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d7Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,2208,1,320,4096,2880,0,20800,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3624,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,362.96861538461536,317.28010911207934,0.7958274325584609,0.11474868322317516,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+9,Conv2d-GroupNorm9,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm9XnormGroupNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm9XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,Conv2d9Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d9Conv2dconv2d,MXU,1,Compute,20800,20800,2387,0,0,0,0,0,0,0,0,20800,3463,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d9Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,2208,1,320,4096,2880,0,20800,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3624,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,362.96861538461536,317.28010911207934,0.7958274325584609,0.11474868322317516,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+11,SpatialTransformer-Input_GroupNorm11,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm11XnormGroupNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm11XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +12,SpatialTransformer-Proj_in12,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin12einsum,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin12einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+13,BasicTransformerBlock-Input_layernorm13,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm13XnormLayerNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm13XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,SelfAttention14-Q-14,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention14Q14MatMulQ,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention14Q14MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,SelfAttention14-K-14,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention14K14MatMulK,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention14K14MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,SelfAttention14-V-14,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention14V14MatMulV,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention14V14MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,SelfAttention14-FlashAttention-15,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention14FlashAttention15FlashAttention,MXU,1,Compute,154240,154240,3532,0,0,0,0,0,0,0,0,154240,77101,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention14FlashAttention15FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,2762752,16384,8,4096,4096,40,51401,154240,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,25940,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.32675186721992,63.31447743775934,0.0686855211502738,0.022898545185446414,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+16,SelfAttention14-Attention_output-16,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention14Attentionoutput16MatMulattnOutputattnAvgWo,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention14Attentionoutput16MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,SelfAttention14-Attention_layernorm-17,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention14Attentionlayernorm17YnormLayerNormy,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention14Attentionlayernorm17YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,GatedSelfAttention-Linear18,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear18XLinearcontext,MXU,1,Memory,500,47,500,0,0,0,0,0,0,0,0,0,47,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear18XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 
320]]",1,508928,18,1,8,320,768,0,47,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.86432,947.9522705078125,0.01724292770543332,0.3428398808346519,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,GatedSelfAttention-Attn18-Q-19,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18Q19MatMulQ,MXU,1,Compute,2815,2815,1839,0,0,0,0,0,0,0,0,2815,465,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn18Q19MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,2815,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,590,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,298.578756660746,1805.7137145454042,0.654649342279995,0.6530610179187718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,GatedSelfAttention-Attn18-K-19,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18K19MatMulK,MXU,1,Compute,2815,2815,1839,0,0,0,0,0,0,0,0,2815,465,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn18K19MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,2815,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,590,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,298.578756660746,1805.7137145454042,0.654649342279995,0.6530610179187718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+19,GatedSelfAttention-Attn18-V-19,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18V19MatMulV,MXU,1,Compute,2815,2815,1839,0,0,0,0,0,0,0,0,2815,465,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn18V19MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,2815,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,590,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,298.578756660746,1805.7137145454042,0.654649342279995,0.6530610179187718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,GatedSelfAttention-Attn18-FlashAttention-20,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn18FlashAttention20FlashAttention,MXU,1,Compute,164030,164030,3539,0,0,0,0,0,0,0,0,164030,78931,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn18FlashAttention20FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 128]",,2768128,17424,8,4104,4104,40,51601,164030,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,27572,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,29.57221854538804,59.65188371839374,0.06483861623985296,0.021573918162167718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+21,GatedSelfAttention-Attn18-Attention_output-21,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18Attentionoutput21MatMulattnOutputattnAvgWo,MXU,1,Compute,2815,2815,1839,0,0,0,0,0,0,0,0,2815,465,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn18Attentionoutput21MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,297,1,4104,320,320,0,2815,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,590,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,298.578756660746,1805.7137145454042,0.654649342279995,0.6530610179187718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,GatedSelfAttention-Attn18-Attention_layernorm-22,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18Attentionlayernorm22YnormLayerNormy,VPU,1,Memory,1770,1006,1770,0,0,0,0,0,0,0,0,0,1006,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn18Attentionlayernorm22YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1006,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.935728813559322,2764.039120431674,0.5682951146560319,0.9996524847854155,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,GatedSelfAttention-FFN18Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN18FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,9055,9055,4691,0,0,0,0,0,0,0,0,9055,1505,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN18FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 
1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,9055,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,370.562473771397,1432.3545779265598,0.8124773592105521,0.5180305887618661,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,GatedSelfAttention-FFN18Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN18FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,9055,9055,4691,0,0,0,0,0,0,0,0,9055,1505,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN18FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,9055,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,370.562473771397,1432.3545779265598,0.8124773592105521,0.5180305887618661,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +25,GatedSelfAttention-FFN18Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN18FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1766,126,1766,0,0,0,0,0,0,0,0,0,126,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN18FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,126,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,148,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7421970554926387,2764.8994903737257,0.0710590011769147,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+26,BasicTransformerBlock-Fuser_output_layernorm26,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm26XnormLayerNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm26XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,CrossAttention27-Q-27,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention27Q27MatMulQ,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention27Q27MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,CrossAttention27-K-27,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention27K27MatMulK,MXU,1,Compute,697,697,541,0,0,0,0,0,0,0,0,697,112,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention27K27MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 
40]]",1,1605632,72,1,512,320,768,0,697,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,361.05916786226686,2145.4251479555237,0.7916408702637966,0.7759222958247826,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,CrossAttention27-V-27,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention27V27MatMulV,MXU,1,Compute,697,697,541,0,0,0,0,0,0,0,0,697,112,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention27V27MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,697,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,361.05916786226686,2145.4251479555237,0.7916408702637966,0.7759222958247826,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,CrossAttention27-FlashAttention-28,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention27FlashAttention28FlashAttention,MXU,1,Compute,19314,19314,1987,0,0,0,0,0,0,0,0,19314,9637,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention27FlashAttention28FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,354304,2048,8,512,4096,40,6425,19314,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.271604846225536,284.4135892357875,0.06856460845900791,0.10286205758979658,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+29,CrossAttention27-Attention_output-29,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention27Attentionoutput29MatMulattnOutputattnAvgWo,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention27Attentionoutput29MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +30,CrossAttention27-Attention_layernorm-30,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention27Attentionlayernorm30YnormLayerNormy,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention27Attentionlayernorm30YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+31,BasicTransformerBlock-Attn_output_layernorm31,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm31XnormLayerNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm31XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +32,BasicTransformerBlock-FFN32Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN32FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,9055,9055,4691,0,0,0,0,0,0,0,0,9055,1505,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN32FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,9055,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,370.562473771397,1432.3545779265598,0.8124773592105521,0.5180305887618661,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +33,BasicTransformerBlock-FFN32Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN32FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,9055,9055,4691,0,0,0,0,0,0,0,0,9055,1505,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN32FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], 
[1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,9055,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,370.562473771397,1432.3545779265598,0.8124773592105521,0.5180305887618661,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +34,BasicTransformerBlock-FFN32Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN32FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1766,126,1766,0,0,0,0,0,0,0,0,0,126,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN32FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,126,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,148,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7421970554926387,2764.8994903737257,0.0710590011769147,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +35,SpatialTransformer-Proj_out35,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout35einsum,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout35einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+36,Time-Embed-MLP-Einsum36,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum36einsum,MXU,1,Memory,500,302,500,0,0,0,0,0,0,0,0,302,47,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum36einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 320]]",1,658048,30,1,1,320,1280,0,302,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,83,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.6384,1531.839370727539,0.0035922766052986083,0.5540106223246073,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +37,Conv2d-GroupNorm37,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm37XnormGroupNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm37XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +38,Conv2d37Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d37Conv2dconv2d,MXU,1,Compute,20800,20800,2387,0,0,0,0,0,0,0,0,20800,3463,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d37Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 
64]]",1,7252480,2208,1,320,4096,2880,0,20800,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3624,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,362.96861538461536,317.28010911207934,0.7958274325584609,0.11474868322317516,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +39,Conv2d-GroupNorm39,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm39XnormGroupNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm39XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +40,Conv2d39Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d39Conv2dconv2d,MXU,1,Compute,20800,20800,2387,0,0,0,0,0,0,0,0,20800,3463,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d39Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,2208,1,320,4096,2880,0,20800,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3624,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,362.96861538461536,317.28010911207934,0.7958274325584609,0.11474868322317516,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+41,SpatialTransformer-Input_GroupNorm41,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm41XnormGroupNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm41XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +42,SpatialTransformer-Proj_in42,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin42einsum,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin42einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+43,BasicTransformerBlock-Input_layernorm43,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm43XnormLayerNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm43XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +44,SelfAttention44-Q-44,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention44Q44MatMulQ,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention44Q44MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +44,SelfAttention44-K-44,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention44K44MatMulK,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention44K44MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +44,SelfAttention44-V-44,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention44V44MatMulV,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention44V44MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +45,SelfAttention44-FlashAttention-45,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention44FlashAttention45FlashAttention,MXU,1,Compute,154240,154240,3532,0,0,0,0,0,0,0,0,154240,77101,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention44FlashAttention45FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,2762752,16384,8,4096,4096,40,51401,154240,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,25940,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.32675186721992,63.31447743775934,0.0686855211502738,0.022898545185446414,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+46,SelfAttention44-Attention_output-46,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention44Attentionoutput46MatMulattnOutputattnAvgWo,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention44Attentionoutput46MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +47,SelfAttention44-Attention_layernorm-47,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention44Attentionlayernorm47YnormLayerNormy,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention44Attentionlayernorm47YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +48,GatedSelfAttention-Linear48,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear48XLinearcontext,MXU,1,Memory,500,47,500,0,0,0,0,0,0,0,0,0,47,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear48XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 
320]]",1,508928,18,1,8,320,768,0,47,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.86432,947.9522705078125,0.01724292770543332,0.3428398808346519,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +49,GatedSelfAttention-Attn48-Q-49,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48Q49MatMulQ,MXU,1,Compute,2815,2815,1839,0,0,0,0,0,0,0,0,2815,465,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn48Q49MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,2815,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,590,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,298.578756660746,1805.7137145454042,0.654649342279995,0.6530610179187718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +49,GatedSelfAttention-Attn48-K-49,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48K49MatMulK,MXU,1,Compute,2815,2815,1839,0,0,0,0,0,0,0,0,2815,465,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn48K49MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,2815,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,590,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,298.578756660746,1805.7137145454042,0.654649342279995,0.6530610179187718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+49,GatedSelfAttention-Attn48-V-49,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48V49MatMulV,MXU,1,Compute,2815,2815,1839,0,0,0,0,0,0,0,0,2815,465,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn48V49MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,2815,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,590,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,298.578756660746,1805.7137145454042,0.654649342279995,0.6530610179187718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +50,GatedSelfAttention-Attn48-FlashAttention-50,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn48FlashAttention50FlashAttention,MXU,1,Compute,164030,164030,3539,0,0,0,0,0,0,0,0,164030,78931,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn48FlashAttention50FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 128]",,2768128,17424,8,4104,4104,40,51601,164030,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,27572,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,29.57221854538804,59.65188371839374,0.06483861623985296,0.021573918162167718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+51,GatedSelfAttention-Attn48-Attention_output-51,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48Attentionoutput51MatMulattnOutputattnAvgWo,MXU,1,Compute,2815,2815,1839,0,0,0,0,0,0,0,0,2815,465,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn48Attentionoutput51MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,297,1,4104,320,320,0,2815,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,590,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,298.578756660746,1805.7137145454042,0.654649342279995,0.6530610179187718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +52,GatedSelfAttention-Attn48-Attention_layernorm-52,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48Attentionlayernorm52YnormLayerNormy,VPU,1,Memory,1770,1006,1770,0,0,0,0,0,0,0,0,0,1006,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn48Attentionlayernorm52YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1006,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.935728813559322,2764.039120431674,0.5682951146560319,0.9996524847854155,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +53,GatedSelfAttention-FFN48Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN48FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,9055,9055,4691,0,0,0,0,0,0,0,0,9055,1505,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN48FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 
1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,9055,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,370.562473771397,1432.3545779265598,0.8124773592105521,0.5180305887618661,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +54,GatedSelfAttention-FFN48Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN48FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,9055,9055,4691,0,0,0,0,0,0,0,0,9055,1505,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN48FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,9055,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,370.562473771397,1432.3545779265598,0.8124773592105521,0.5180305887618661,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +55,GatedSelfAttention-FFN48Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN48FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1766,126,1766,0,0,0,0,0,0,0,0,0,126,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN48FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,126,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,148,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7421970554926387,2764.8994903737257,0.0710590011769147,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+56,BasicTransformerBlock-Fuser_output_layernorm56,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm56XnormLayerNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm56XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +57,CrossAttention57-Q-57,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention57Q57MatMulQ,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention57Q57MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +57,CrossAttention57-K-57,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention57K57MatMulK,MXU,1,Compute,697,697,541,0,0,0,0,0,0,0,0,697,112,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention57K57MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 
40]]",1,1605632,72,1,512,320,768,0,697,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,361.05916786226686,2145.4251479555237,0.7916408702637966,0.7759222958247826,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +57,CrossAttention57-V-57,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention57V57MatMulV,MXU,1,Compute,697,697,541,0,0,0,0,0,0,0,0,697,112,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention57V57MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,697,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,361.05916786226686,2145.4251479555237,0.7916408702637966,0.7759222958247826,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +58,CrossAttention57-FlashAttention-58,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention57FlashAttention58FlashAttention,MXU,1,Compute,19314,19314,1987,0,0,0,0,0,0,0,0,19314,9637,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention57FlashAttention58FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,354304,2048,8,512,4096,40,6425,19314,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.271604846225536,284.4135892357875,0.06856460845900791,0.10286205758979658,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+59,CrossAttention57-Attention_output-59,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention57Attentionoutput59MatMulattnOutputattnAvgWo,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention57Attentionoutput59MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +60,CrossAttention57-Attention_layernorm-60,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention57Attentionlayernorm60YnormLayerNormy,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention57Attentionlayernorm60YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+61,BasicTransformerBlock-Attn_output_layernorm61,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm61XnormLayerNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm61XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +62,BasicTransformerBlock-FFN62Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN62FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,9055,9055,4691,0,0,0,0,0,0,0,0,9055,1505,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN62FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,9055,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,370.562473771397,1432.3545779265598,0.8124773592105521,0.5180305887618661,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +63,BasicTransformerBlock-FFN62Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN62FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,9055,9055,4691,0,0,0,0,0,0,0,0,9055,1505,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN62FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], 
[1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,9055,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,370.562473771397,1432.3545779265598,0.8124773592105521,0.5180305887618661,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +64,BasicTransformerBlock-FFN62Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN62FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1766,126,1766,0,0,0,0,0,0,0,0,0,126,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN62FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,126,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,148,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7421970554926387,2764.8994903737257,0.0710590011769147,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +65,SpatialTransformer-Proj_out65,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout65einsum,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout65einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +66,Downsample-Conv2d66Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=2x2 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",DownsampleConv2d66Conv2dconv2d,MXU,1,Compute,5215,5215,1725,0,0,0,0,0,0,0,0,5215,865,0,0,5120000,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,32,32)]",1887436800,DownsampleConv2d66Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 32, 32]]",1,5286400,552,1,320,1024,2880,0,5215,5120000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,983,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,361.9246021093001,914.3569668324544,0.7935383795405555,0.33068968059039944,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +67,Time-Embed-MLP-Einsum67,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum67einsum,MXU,1,Memory,554,490,554,0,0,0,0,0,0,0,0,490,78,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum67einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1314048,50,1,1,640,1280,0,490,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,118,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9574007220216605,2760.7494312933636,0.006484253800177993,0.9984627237950682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +68,Conv2d-GroupNorm68,"GroupNorm(x=1x320x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm68XnormGroupNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,320,32,32]","[DT_BFLOAT16:(1,320,32,32)]",2621440,Conv2dGroupNorm68XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+69,Conv2d68Conv2d,"Conv2D(a=1x320x32x32,b=320x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d68Conv2dconv2d,MXU,1,Compute,8678,8678,1904,0,0,0,0,0,0,0,0,8678,1443,0,0,5652480,"DT_BFLOAT16:[1,320,32,32],DT_BFLOAT16:[320,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",3774873600,Conv2d68Conv2dconv2d,Conv2D,3686400,[],Conv2D,bf01;io01->bf01,"[[1, 320, 32, 32], [320, 640, 3, 3], [1, 640, 32, 32]]",1,5736960,920,1,640,1024,2880,0,8678,5652480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1572,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,434.9935008066375,606.6239025769187,0.9537457131375886,0.2193938164835149,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +70,Conv2d-GroupNorm70,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm70XnormGroupNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm70XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +71,Conv2d70Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d70Conv2dconv2d,MXU,1,Compute,16960,16960,3367,0,0,0,0,0,0,0,0,16960,2823,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d70Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 
32]]",1,10163200,1800,1,640,1024,5760,0,16960,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3049,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.15018867924533,548.8125783092571,0.9760147757792447,0.19848556177549986,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +72,SkipConnection-Einsum67,"XlaEinsum(a=1x32x32x320,b=320x640,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum67einsum,MXU,1,Compute,1149,1149,801,0,0,0,0,0,0,0,0,1149,188,0,0,2375680,"DT_BFLOAT16:[1,32,32,320],DT_BFLOAT16:[320,640]","[DT_BFLOAT16:(1,32,32,640)]",419430400,SkipConnectionEinsum67einsum,Einsum,409600,[],Einsum,"BHWC,CO->BHWO","[[1, 32, 32, 320], [320, 640], [1, 32, 32, 640]]",1,2375680,120,1,1024,640,320,0,1149,2375680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,365.03951261966927,1925.6087154590948,0.8003679816853296,0.6964226819020234,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +73,SpatialTransformer-Input_GroupNorm73,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm73XnormGroupNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm73XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+74,SpatialTransformer-Proj_in74,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin74einsum,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin74einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +75,BasicTransformerBlock-Input_layernorm75,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm75XnormLayerNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm75XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +76,SelfAttention76-Q-76,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention76Q76MatMulQ,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention76Q76MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +76,SelfAttention76-K-76,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention76K76MatMulK,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention76K76MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +76,SelfAttention76-V-76,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention76V76MatMulV,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention76V76MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +77,SelfAttention76-FlashAttention-77,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",SelfAttention76FlashAttention77FlashAttention,MXU,1,Compute,9676,9676,1766,0,0,0,0,0,0,0,0,9676,4818,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention76FlashAttention77FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 128]",,872448,1024,8,1024,1024,80,3212,9676,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1729,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.210199255890863,504.6313042579578,0.06842997353127733,0.18250680081662127,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +78,SelfAttention76-Attention_output-78,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention76Attentionoutput78MatMulattnOutputattnAvgWo,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention76Attentionoutput78MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+79,SelfAttention76-Attention_layernorm-79,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention76Attentionlayernorm79YnormLayerNormy,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention76Attentionlayernorm79YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +80,GatedSelfAttention-Linear80,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear80XLinearcontext,MXU,1,Memory,500,302,500,0,0,0,0,0,0,0,0,302,47,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear80XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 640]]",1,1005568,30,1,8,640,768,0,302,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,83,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.72864,1873.016357421875,0.03448585541086664,0.677401937584765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +81,GatedSelfAttention-Attn80-Q-81,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80Q81MatMulQ,MXU,1,Compute,2137,2137,1166,0,0,0,0,0,0,0,0,2137,352,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn80Q81MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 
80]]",1,3461120,225,1,1032,640,640,0,2137,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,395.6080486663547,1508.385207979937,0.8673910754955928,0.5455281041518759,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +81,GatedSelfAttention-Attn80-K-81,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80K81MatMulK,MXU,1,Compute,2137,2137,1166,0,0,0,0,0,0,0,0,2137,352,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn80K81MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,2137,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,395.6080486663547,1508.385207979937,0.8673910754955928,0.5455281041518759,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +81,GatedSelfAttention-Attn80-V-81,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80V81MatMulV,MXU,1,Compute,2137,2137,1166,0,0,0,0,0,0,0,0,2137,352,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn80V81MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,2137,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,395.6080486663547,1508.385207979937,0.8673910754955928,0.5455281041518759,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +82,GatedSelfAttention-Attn80-FlashAttention-82,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",GatedSelfAttentionAttn80FlashAttention82FlashAttention,MXU,1,Compute,12236,12236,1780,0,0,0,0,0,0,0,0,12236,5294,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn80FlashAttention82FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 128]",,879104,1296,8,1032,1032,80,3262,12236,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2157,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,25.067580254985288,402.1706009035837,0.054961964173235456,0.14545048857272466,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +83,GatedSelfAttention-Attn80-Attention_output-83,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80Attentionoutput83MatMulattnOutputattnAvgWo,MXU,1,Compute,2137,2137,1166,0,0,0,0,0,0,0,0,2137,352,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn80Attentionoutput83MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,225,1,1032,640,640,0,2137,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,395.6080486663547,1508.385207979937,0.8673910754955928,0.5455281041518759,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+84,GatedSelfAttention-Attn80-Attention_layernorm-84,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80Attentionlayernorm84YnormLayerNormy,VPU,1,Memory,890,506,890,0,0,0,0,0,0,0,0,0,506,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn80Attentionlayernorm84YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,506,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.936898876404494,2764.5839734023875,0.5684071381361533,0.9998495383010443,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +85,GatedSelfAttention-FFN80Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN80FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,7549,7549,3312,0,0,0,0,0,0,0,0,7549,1255,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN80FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,7549,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,1212.7796314081336,0.9745638478807193,0.43861831154001213,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +86,GatedSelfAttention-FFN80Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN80FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,7549,7549,3312,0,0,0,0,0,0,0,0,7549,1255,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN80FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 
1024, 640]]",1,4718592,800,1,1024,640,2560,0,7549,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,1212.7796314081336,0.9745638478807193,0.43861831154001213,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +87,GatedSelfAttention-FFN80Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN80FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,883,63,883,0,0,0,0,0,0,0,0,0,63,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN80FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,63,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,74,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7421970554926387,2764.8994903737257,0.0710590011769147,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +88,BasicTransformerBlock-Fuser_output_layernorm88,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm88XnormLayerNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm88XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+89,CrossAttention89-Q-89,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention89Q89MatMulQ,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention89Q89MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +89,CrossAttention89-K-89,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention89K89MatMulK,MXU,1,Compute,1149,1149,817,0,0,0,0,0,0,0,0,1149,188,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention89K89MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,1149,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,245,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,438.04741514360313,1965.448895778938,0.9604415780223955,0.7108314270448239,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +89,CrossAttention89-V-89,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention89V89MatMulV,MXU,1,Compute,1149,1149,817,0,0,0,0,0,0,0,0,1149,188,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention89V89MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 
80]]",1,2424832,120,1,512,640,768,0,1149,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,245,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,438.04741514360313,1965.448895778938,0.9604415780223955,0.7108314270448239,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +90,CrossAttention89-FlashAttention-90,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention89FlashAttention90FlashAttention,MXU,1,Compute,4858,4858,1325,0,0,0,0,0,0,0,0,4858,2408,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention89FlashAttention90FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,446464,512,8,512,1024,80,1606,4858,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,897,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.08170934540963,753.8306659118979,0.0681482527674598,0.2726331522285345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +91,CrossAttention89-Attention_output-91,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention89Attentionoutput91MatMulattnOutputattnAvgWo,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention89Attentionoutput91MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+92,CrossAttention89-Attention_layernorm-92,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention89Attentionlayernorm92YnormLayerNormy,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention89Attentionlayernorm92YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +93,BasicTransformerBlock-Attn_output_layernorm93,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm93XnormLayerNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm93XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +94,BasicTransformerBlock-FFN94Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN94FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,7549,7549,3312,0,0,0,0,0,0,0,0,7549,1255,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN94FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 
2560]]",1,9830400,800,1,1024,2560,640,0,7549,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,1212.7796314081336,0.9745638478807193,0.43861831154001213,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +95,BasicTransformerBlock-FFN94Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN94FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,7549,7549,3312,0,0,0,0,0,0,0,0,7549,1255,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN94FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,7549,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,1212.7796314081336,0.9745638478807193,0.43861831154001213,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +96,BasicTransformerBlock-FFN94Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN94FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,883,63,883,0,0,0,0,0,0,0,0,0,63,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN94FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,63,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,74,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7421970554926387,2764.8994903737257,0.0710590011769147,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+97,SpatialTransformer-Proj_out97,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout97einsum,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout97einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +98,Time-Embed-MLP-Einsum98,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum98einsum,MXU,1,Memory,554,490,554,0,0,0,0,0,0,0,0,490,78,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum98einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1314048,50,1,1,640,1280,0,490,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,118,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9574007220216605,2760.7494312933636,0.006484253800177993,0.9984627237950682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+99,Conv2d-GroupNorm99,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm99XnormGroupNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm99XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +100,Conv2d99Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d99Conv2dconv2d,MXU,1,Compute,16960,16960,3367,0,0,0,0,0,0,0,0,16960,2823,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d99Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,1800,1,640,1024,5760,0,16960,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3049,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.15018867924533,548.8125783092571,0.9760147757792447,0.19848556177549986,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +101,Conv2d-GroupNorm101,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm101XnormGroupNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm101XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+102,Conv2d101Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d101Conv2dconv2d,MXU,1,Compute,16960,16960,3367,0,0,0,0,0,0,0,0,16960,2823,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d101Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,1800,1,640,1024,5760,0,16960,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3049,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.15018867924533,548.8125783092571,0.9760147757792447,0.19848556177549986,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +103,SpatialTransformer-Input_GroupNorm103,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm103XnormGroupNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm103XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +104,SpatialTransformer-Proj_in104,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin104einsum,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin104einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 
640]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +105,BasicTransformerBlock-Input_layernorm105,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm105XnormLayerNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm105XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +106,SelfAttention106-Q-106,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention106Q106MatMulQ,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention106Q106MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+106,SelfAttention106-K-106,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention106K106MatMulK,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention106K106MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +106,SelfAttention106-V-106,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention106V106MatMulV,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention106V106MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +107,SelfAttention106-FlashAttention-107,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention106FlashAttention107FlashAttention,MXU,1,Compute,9676,9676,1766,0,0,0,0,0,0,0,0,9676,4818,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention106FlashAttention107FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 
128]",,872448,1024,8,1024,1024,80,3212,9676,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1729,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.210199255890863,504.6313042579578,0.06842997353127733,0.18250680081662127,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +108,SelfAttention106-Attention_output-108,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention106Attentionoutput108MatMulattnOutputattnAvgWo,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention106Attentionoutput108MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +109,SelfAttention106-Attention_layernorm-109,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention106Attentionlayernorm109YnormLayerNormy,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention106Attentionlayernorm109YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+110,GatedSelfAttention-Linear110,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear110XLinearcontext,MXU,1,Memory,500,302,500,0,0,0,0,0,0,0,0,302,47,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear110XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 640]]",1,1005568,30,1,8,640,768,0,302,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,83,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.72864,1873.016357421875,0.03448585541086664,0.677401937584765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +111,GatedSelfAttention-Attn110-Q-111,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110Q111MatMulQ,MXU,1,Compute,2137,2137,1166,0,0,0,0,0,0,0,0,2137,352,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn110Q111MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,2137,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,395.6080486663547,1508.385207979937,0.8673910754955928,0.5455281041518759,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +111,GatedSelfAttention-Attn110-K-111,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110K111MatMulK,MXU,1,Compute,2137,2137,1166,0,0,0,0,0,0,0,0,2137,352,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn110K111MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 
80]]",1,3461120,225,1,1032,640,640,0,2137,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,395.6080486663547,1508.385207979937,0.8673910754955928,0.5455281041518759,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +111,GatedSelfAttention-Attn110-V-111,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110V111MatMulV,MXU,1,Compute,2137,2137,1166,0,0,0,0,0,0,0,0,2137,352,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn110V111MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,2137,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,395.6080486663547,1508.385207979937,0.8673910754955928,0.5455281041518759,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +112,GatedSelfAttention-Attn110-FlashAttention-112,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn110FlashAttention112FlashAttention,MXU,1,Compute,12236,12236,1780,0,0,0,0,0,0,0,0,12236,5294,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn110FlashAttention112FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 128]",,879104,1296,8,1032,1032,80,3262,12236,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2157,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,25.067580254985288,402.1706009035837,0.054961964173235456,0.14545048857272466,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+113,GatedSelfAttention-Attn110-Attention_output-113,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110Attentionoutput113MatMulattnOutputattnAvgWo,MXU,1,Compute,2137,2137,1166,0,0,0,0,0,0,0,0,2137,352,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn110Attentionoutput113MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,225,1,1032,640,640,0,2137,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,395.6080486663547,1508.385207979937,0.8673910754955928,0.5455281041518759,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +114,GatedSelfAttention-Attn110-Attention_layernorm-114,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110Attentionlayernorm114YnormLayerNormy,VPU,1,Memory,890,506,890,0,0,0,0,0,0,0,0,0,506,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn110Attentionlayernorm114YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,506,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.936898876404494,2764.5839734023875,0.5684071381361533,0.9998495383010443,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +115,GatedSelfAttention-FFN110Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN110FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,7549,7549,3312,0,0,0,0,0,0,0,0,7549,1255,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN110FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 
640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,7549,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,1212.7796314081336,0.9745638478807193,0.43861831154001213,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +116,GatedSelfAttention-FFN110Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN110FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,7549,7549,3312,0,0,0,0,0,0,0,0,7549,1255,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN110FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,7549,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,1212.7796314081336,0.9745638478807193,0.43861831154001213,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +117,GatedSelfAttention-FFN110Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN110FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,883,63,883,0,0,0,0,0,0,0,0,0,63,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN110FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,63,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,74,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7421970554926387,2764.8994903737257,0.0710590011769147,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+118,BasicTransformerBlock-Fuser_output_layernorm118,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm118XnormLayerNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm118XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +119,CrossAttention119-Q-119,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention119Q119MatMulQ,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention119Q119MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +119,CrossAttention119-K-119,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention119K119MatMulK,MXU,1,Compute,1149,1149,817,0,0,0,0,0,0,0,0,1149,188,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention119K119MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 
80]]",1,2424832,120,1,512,640,768,0,1149,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,245,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,438.04741514360313,1965.448895778938,0.9604415780223955,0.7108314270448239,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +119,CrossAttention119-V-119,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention119V119MatMulV,MXU,1,Compute,1149,1149,817,0,0,0,0,0,0,0,0,1149,188,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention119V119MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,1149,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,245,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,438.04741514360313,1965.448895778938,0.9604415780223955,0.7108314270448239,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +120,CrossAttention119-FlashAttention-120,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention119FlashAttention120FlashAttention,MXU,1,Compute,4858,4858,1325,0,0,0,0,0,0,0,0,4858,2408,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention119FlashAttention120FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,446464,512,8,512,1024,80,1606,4858,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,897,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.08170934540963,753.8306659118979,0.0681482527674598,0.2726331522285345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+121,CrossAttention119-Attention_output-121,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention119Attentionoutput121MatMulattnOutputattnAvgWo,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention119Attentionoutput121MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +122,CrossAttention119-Attention_layernorm-122,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention119Attentionlayernorm122YnormLayerNormy,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention119Attentionlayernorm122YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+123,BasicTransformerBlock-Attn_output_layernorm123,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm123XnormLayerNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm123XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +124,BasicTransformerBlock-FFN124Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN124FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,7549,7549,3312,0,0,0,0,0,0,0,0,7549,1255,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN124FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,7549,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,1212.7796314081336,0.9745638478807193,0.43861831154001213,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +125,BasicTransformerBlock-FFN124Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN124FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,7549,7549,3312,0,0,0,0,0,0,0,0,7549,1255,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN124FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 
2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,7549,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,1212.7796314081336,0.9745638478807193,0.43861831154001213,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +126,BasicTransformerBlock-FFN124Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN124FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,883,63,883,0,0,0,0,0,0,0,0,0,63,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN124FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,63,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,74,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7421970554926387,2764.8994903737257,0.0710590011769147,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +127,SpatialTransformer-Proj_out127,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout127einsum,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout127einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +128,Downsample-Conv2d128Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=2x2 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",DownsampleConv2d128Conv2dconv2d,MXU,1,Compute,4255,4255,3036,0,0,0,0,0,0,0,0,4255,705,0,0,9011200,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,16,16)]",1887436800,DownsampleConv2d128Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 16, 16]]",1,9180160,450,1,640,256,5760,0,4255,9011200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,910,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,443.5809165687426,1972.3464123090482,0.9725740656413623,0.7133260080683719,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +129,Time-Embed-MLP-Einsum129,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum129einsum,MXU,1,Memory,1106,960,1106,0,0,0,0,0,0,0,0,960,157,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum129einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,960,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9627486437613015,2763.5860615569904,0.006495979394753359,0.9994886298578627,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +130,Conv2d-GroupNorm130,"GroupNorm(x=1x640x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm130XnormGroupNormX,VPU,1,Memory,500,126,500,0,0,0,0,0,0,0,0,0,126,0,0,655360,"DT_BFLOAT16:[1,640,16,16]","[DT_BFLOAT16:(1,640,16,16)]",1310720,Conv2dGroupNorm130XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,126,655360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.62144,1220.703125,0.25098039215686274,0.4414839511754069,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+131,Conv2d130Conv2d,"Conv2D(a=1x640x16x16,b=640x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d130Conv2dconv2d,MXU,1,Compute,8490,8490,5298,0,0,0,0,0,0,0,0,8490,1411,0,0,15728640,"DT_BFLOAT16:[1,640,16,16],DT_BFLOAT16:[640,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",3774873600,Conv2d130Conv2dconv2d,Conv2D,14745600,[],Conv2D,bf01;io01->bf01,"[[1, 640, 16, 16], [640, 1280, 3, 3], [1, 1280, 16, 16]]",1,15815680,900,1,1280,256,5760,0,8490,15728640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1765,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.62586572438164,1725.375441696113,0.9748651706252054,0.6240055847002217,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +132,Conv2d-GroupNorm132,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm132XnormGroupNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm132XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +133,Conv2d132Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d132Conv2dconv2d,MXU,1,Compute,16960,16960,10375,0,0,0,0,0,0,0,0,16960,2823,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d132Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 
16]]",1,24911872,1800,1,1280,256,11520,0,16960,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3513,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.15018867924533,1691.4223724941037,0.9760147757792447,0.6117259936687536,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +134,SkipConnection-Einsum129,"XlaEinsum(a=1x16x16x640,b=640x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum129einsum,MXU,1,Compute,960,960,883,0,0,0,0,0,0,0,0,960,157,0,0,2621440,"DT_BFLOAT16:[1,16,16,640],DT_BFLOAT16:[640,1280]","[DT_BFLOAT16:(1,16,16,1280)]",419430400,SkipConnectionEinsum129einsum,Einsum,1638400,[],Einsum,"BHWC,CO->BHWO","[[1, 16, 16, 640], [640, 1280], [1, 16, 16, 1280]]",1,2621440,100,1,256,1280,640,0,960,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,218,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,436.9066666666667,2543.1315104166665,0.957940428079629,0.9197582316154309,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +135,SpatialTransformer-Input_GroupNorm135,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm135XnormGroupNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm135XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+136,SpatialTransformer-Proj_in136,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin136einsum,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin136einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +137,BasicTransformerBlock-Input_layernorm137,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm137XnormLayerNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm137XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +138,SelfAttention138-Q-138,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention138Q138MatMulQ,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention138Q138MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +138,SelfAttention138-K-138,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention138K138MatMulK,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention138K138MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +138,SelfAttention138-V-138,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention138V138MatMulV,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention138V138MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +139,SelfAttention138-FlashAttention-139,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",SelfAttention138FlashAttention139FlashAttention,MXU,1,Compute,1244,1244,883,0,0,0,0,0,0,0,0,1244,401,0,0,2621440,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention138FlashAttention139FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 128]",,335872,128,8,256,256,160,201,1244,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,265,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.17232154340836,1962.5452170418007,0.03326609846707393,0.7097812719861847,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +140,SelfAttention138-Attention_output-140,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention138Attentionoutput140MatMulattnOutputattnAvgWo,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention138Attentionoutput140MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+141,SelfAttention138-Attention_layernorm-141,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention138Attentionlayernorm141YnormLayerNormy,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention138Attentionlayernorm141YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +142,GatedSelfAttention-Linear142,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear142XLinearcontext,MXU,1,Memory,674,584,674,0,0,0,0,0,0,0,0,584,94,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear142XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 1280]]",1,1998848,60,1,8,1280,768,0,584,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,141,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,23.336261127596437,2761.97665522997,0.05116595758288819,0.9989065660867885,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +143,GatedSelfAttention-Attn142-Q-143,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142Q143MatMulQ,MXU,1,Compute,2843,2843,1559,0,0,0,0,0,0,0,0,2843,470,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn142Q143MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 
160]]",1,3837952,300,1,264,1280,1280,0,2843,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,577,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,304.28251846640876,1516.2180478917517,0.6671551345753308,0.5483609576462032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +143,GatedSelfAttention-Attn142-K-143,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142K143MatMulK,MXU,1,Compute,2843,2843,1559,0,0,0,0,0,0,0,0,2843,470,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn142K143MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,2843,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,577,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,304.28251846640876,1516.2180478917517,0.6671551345753308,0.5483609576462032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +143,GatedSelfAttention-Attn142-V-143,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142V143MatMulV,MXU,1,Compute,2843,2843,1559,0,0,0,0,0,0,0,0,2843,470,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn142V143MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,2843,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,577,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,304.28251846640876,1516.2180478917517,0.6671551345753308,0.5483609576462032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +144,GatedSelfAttention-Attn142-FlashAttention-144,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",GatedSelfAttentionAttn142FlashAttention144FlashAttention,MXU,1,Compute,2750,2750,911,0,0,0,0,0,0,0,0,2750,663,0,0,2703360,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn142FlashAttention144FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 128]",,345088,288,8,264,264,160,213,2750,2703360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,518,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.299072,915.52734375,0.016003592276605298,0.33111296338155516,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +145,GatedSelfAttention-Attn142-Attention_output-145,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142Attentionoutput145MatMulattnOutputattnAvgWo,MXU,1,Compute,2843,2843,1559,0,0,0,0,0,0,0,0,2843,470,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn142Attentionoutput145MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,3837952,300,1,264,1280,1280,0,2843,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,577,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,304.28251846640876,1516.2180478917517,0.6671551345753308,0.5483609576462032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+146,GatedSelfAttention-Attn142-Attention_layernorm-146,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142Attentionlayernorm146YnormLayerNormy,VPU,1,Memory,500,259,500,0,0,0,0,0,0,0,0,0,259,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn142Attentionlayernorm146YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,259,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,97,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.40672,2517.7001953125,0.5176470588235293,0.9105606492992767,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +147,GatedSelfAttention-FFN142Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN142FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,7549,7549,5519,0,0,0,0,0,0,0,0,7549,1255,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN142FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,7549,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1623,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,2021.2993856802225,0.9745638478807193,0.7310305192333535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +148,GatedSelfAttention-FFN142Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN142FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,7549,7549,5519,0,0,0,0,0,0,0,0,7549,1255,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN142FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 
1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,7549,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1623,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,2021.2993856802225,0.9745638478807193,0.7310305192333535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +149,GatedSelfAttention-FFN142Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN142FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,32,500,0,0,0,0,0,0,0,0,0,32,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN142FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,32,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,41,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.65536,2441.40625,0.06274509803921569,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +150,BasicTransformerBlock-Fuser_output_layernorm150,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm150XnormLayerNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm150XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+151,CrossAttention151-Q-151,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention151Q151MatMulQ,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention151Q151MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +151,CrossAttention151-K-151,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention151K151MatMulK,MXU,1,Compute,2278,2278,1369,0,0,0,0,0,0,0,0,2278,376,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention151K151MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,2278,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,470,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.8933099209833,1661.1851130377524,0.9688739009198705,0.6007902759630208,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +151,CrossAttention151-V-151,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention151V151MatMulV,MXU,1,Compute,2278,2278,1369,0,0,0,0,0,0,0,0,2278,376,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention151V151MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 
160]]",1,4063232,240,1,512,1280,768,0,2278,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,470,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.8933099209833,1661.1851130377524,0.9688739009198705,0.6007902759630208,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +152,CrossAttention151-FlashAttention-152,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention151FlashAttention152FlashAttention,MXU,1,Compute,2448,2448,1325,0,0,0,0,0,0,0,0,2448,803,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention151FlashAttention152FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,256,8,512,256,160,401,2448,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,495,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.420235294117646,1495.959712009804,0.03380966216751631,0.54103425389143,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +153,CrossAttention151-Attention_output-153,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention151Attentionoutput153MatMulattnOutputattnAvgWo,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention151Attentionoutput153MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+154,CrossAttention151-Attention_layernorm-154,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention151Attentionlayernorm154YnormLayerNormy,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention151Attentionlayernorm154YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +155,BasicTransformerBlock-Attn_output_layernorm155,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm155XnormLayerNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm155XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +156,BasicTransformerBlock-FFN156Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN156FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,7549,7549,5519,0,0,0,0,0,0,0,0,7549,1255,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN156FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 
5120]]",1,13631488,800,1,256,5120,1280,0,7549,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1623,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,2021.2993856802225,0.9745638478807193,0.7310305192333535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +157,BasicTransformerBlock-FFN156Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN156FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,7549,7549,5519,0,0,0,0,0,0,0,0,7549,1255,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN156FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,7549,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1623,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,2021.2993856802225,0.9745638478807193,0.7310305192333535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +158,BasicTransformerBlock-FFN156Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN156FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,32,500,0,0,0,0,0,0,0,0,0,32,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN156FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,32,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,41,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.65536,2441.40625,0.06274509803921569,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+159,SpatialTransformer-Proj_out159,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout159einsum,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout159einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +160,Time-Embed-MLP-Einsum160,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum160einsum,MXU,1,Memory,1106,960,1106,0,0,0,0,0,0,0,0,960,157,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum160einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,960,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9627486437613015,2763.5860615569904,0.006495979394753359,0.9994886298578627,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+161,Conv2d-GroupNorm161,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm161XnormGroupNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm161XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +162,Conv2d161Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d161Conv2dconv2d,MXU,1,Compute,16960,16960,10375,0,0,0,0,0,0,0,0,16960,2823,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d161Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,16960,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3513,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.15018867924533,1691.4223724941037,0.9760147757792447,0.6117259936687536,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +163,Conv2d-GroupNorm163,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm163XnormGroupNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm163XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+164,Conv2d163Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d163Conv2dconv2d,MXU,1,Compute,16960,16960,10375,0,0,0,0,0,0,0,0,16960,2823,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d163Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,16960,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3513,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.15018867924533,1691.4223724941037,0.9760147757792447,0.6117259936687536,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +165,SpatialTransformer-Input_GroupNorm165,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm165XnormGroupNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm165XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +166,SpatialTransformer-Proj_in166,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin166einsum,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin166einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 
1280]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +167,BasicTransformerBlock-Input_layernorm167,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm167XnormLayerNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm167XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +168,SelfAttention168-Q-168,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention168Q168MatMulQ,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention168Q168MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+168,SelfAttention168-K-168,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention168K168MatMulK,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention168K168MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +168,SelfAttention168-V-168,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention168V168MatMulV,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention168V168MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +169,SelfAttention168-FlashAttention-169,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention168FlashAttention169FlashAttention,MXU,1,Compute,1244,1244,883,0,0,0,0,0,0,0,0,1244,401,0,0,2621440,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention168FlashAttention169FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 
128]",,335872,128,8,256,256,160,201,1244,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,265,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.17232154340836,1962.5452170418007,0.03326609846707393,0.7097812719861847,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +170,SelfAttention168-Attention_output-170,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention168Attentionoutput170MatMulattnOutputattnAvgWo,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention168Attentionoutput170MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +171,SelfAttention168-Attention_layernorm-171,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention168Attentionlayernorm171YnormLayerNormy,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention168Attentionlayernorm171YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+172,GatedSelfAttention-Linear172,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear172XLinearcontext,MXU,1,Memory,674,584,674,0,0,0,0,0,0,0,0,584,94,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear172XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 1280]]",1,1998848,60,1,8,1280,768,0,584,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,141,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,23.336261127596437,2761.97665522997,0.05116595758288819,0.9989065660867885,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +173,GatedSelfAttention-Attn172-Q-173,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172Q173MatMulQ,MXU,1,Compute,2843,2843,1559,0,0,0,0,0,0,0,0,2843,470,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn172Q173MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,2843,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,577,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,304.28251846640876,1516.2180478917517,0.6671551345753308,0.5483609576462032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +173,GatedSelfAttention-Attn172-K-173,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172K173MatMulK,MXU,1,Compute,2843,2843,1559,0,0,0,0,0,0,0,0,2843,470,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn172K173MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 
160]]",1,3837952,300,1,264,1280,1280,0,2843,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,577,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,304.28251846640876,1516.2180478917517,0.6671551345753308,0.5483609576462032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +173,GatedSelfAttention-Attn172-V-173,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172V173MatMulV,MXU,1,Compute,2843,2843,1559,0,0,0,0,0,0,0,0,2843,470,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn172V173MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,2843,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,577,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,304.28251846640876,1516.2180478917517,0.6671551345753308,0.5483609576462032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +174,GatedSelfAttention-Attn172-FlashAttention-174,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn172FlashAttention174FlashAttention,MXU,1,Compute,2750,2750,911,0,0,0,0,0,0,0,0,2750,663,0,0,2703360,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn172FlashAttention174FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 128]",,345088,288,8,264,264,160,213,2750,2703360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,518,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.299072,915.52734375,0.016003592276605298,0.33111296338155516,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+175,GatedSelfAttention-Attn172-Attention_output-175,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172Attentionoutput175MatMulattnOutputattnAvgWo,MXU,1,Compute,2843,2843,1559,0,0,0,0,0,0,0,0,2843,470,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn172Attentionoutput175MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,3837952,300,1,264,1280,1280,0,2843,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,577,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,304.28251846640876,1516.2180478917517,0.6671551345753308,0.5483609576462032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +176,GatedSelfAttention-Attn172-Attention_layernorm-176,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172Attentionlayernorm176YnormLayerNormy,VPU,1,Memory,500,259,500,0,0,0,0,0,0,0,0,0,259,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn172Attentionlayernorm176YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,259,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,97,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.40672,2517.7001953125,0.5176470588235293,0.9105606492992767,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +177,GatedSelfAttention-FFN172Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN172FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,7549,7549,5519,0,0,0,0,0,0,0,0,7549,1255,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN172FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 
1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,7549,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1623,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,2021.2993856802225,0.9745638478807193,0.7310305192333535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +178,GatedSelfAttention-FFN172Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN172FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,7549,7549,5519,0,0,0,0,0,0,0,0,7549,1255,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN172FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,7549,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1623,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,2021.2993856802225,0.9745638478807193,0.7310305192333535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +179,GatedSelfAttention-FFN172Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN172FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,32,500,0,0,0,0,0,0,0,0,0,32,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN172FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,32,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,41,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.65536,2441.40625,0.06274509803921569,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+180,BasicTransformerBlock-Fuser_output_layernorm180,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm180XnormLayerNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm180XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +181,CrossAttention181-Q-181,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention181Q181MatMulQ,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention181Q181MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +181,CrossAttention181-K-181,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention181K181MatMulK,MXU,1,Compute,2278,2278,1369,0,0,0,0,0,0,0,0,2278,376,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention181K181MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 
160]]",1,4063232,240,1,512,1280,768,0,2278,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,470,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.8933099209833,1661.1851130377524,0.9688739009198705,0.6007902759630208,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +181,CrossAttention181-V-181,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention181V181MatMulV,MXU,1,Compute,2278,2278,1369,0,0,0,0,0,0,0,0,2278,376,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention181V181MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,2278,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,470,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.8933099209833,1661.1851130377524,0.9688739009198705,0.6007902759630208,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +182,CrossAttention181-FlashAttention-182,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention181FlashAttention182FlashAttention,MXU,1,Compute,2448,2448,1325,0,0,0,0,0,0,0,0,2448,803,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention181FlashAttention182FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,256,8,512,256,160,401,2448,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,495,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.420235294117646,1495.959712009804,0.03380966216751631,0.54103425389143,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+183,CrossAttention181-Attention_output-183,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention181Attentionoutput183MatMulattnOutputattnAvgWo,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention181Attentionoutput183MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +184,CrossAttention181-Attention_layernorm-184,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention181Attentionlayernorm184YnormLayerNormy,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention181Attentionlayernorm184YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+185,BasicTransformerBlock-Attn_output_layernorm185,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm185XnormLayerNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm185XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +186,BasicTransformerBlock-FFN186Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN186FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,7549,7549,5519,0,0,0,0,0,0,0,0,7549,1255,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN186FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,7549,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1623,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,2021.2993856802225,0.9745638478807193,0.7310305192333535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +187,BasicTransformerBlock-FFN186Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN186FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,7549,7549,5519,0,0,0,0,0,0,0,0,7549,1255,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN186FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 
1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,7549,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1623,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,2021.2993856802225,0.9745638478807193,0.7310305192333535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +188,BasicTransformerBlock-FFN186Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN186FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,32,500,0,0,0,0,0,0,0,0,0,32,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN186FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,32,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,41,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.65536,2441.40625,0.06274509803921569,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +189,SpatialTransformer-Proj_out189,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout189einsum,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout189einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +190,Downsample-Conv2d190Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=2x2 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",DownsampleConv2d190Conv2dconv2d,MXU,1,Memory,10210,8490,10210,0,0,0,0,0,0,0,0,8490,1411,0,0,30310400,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,DownsampleConv2d190Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,24420352,900,1,1280,64,11520,0,8490,30310400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2090,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,184.86158667972575,2764.8148644098924,0.4053185748583738,0.9999330431862179,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +191,Time-Embed-MLP-Einsum191,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum191einsum,MXU,1,Memory,1106,960,1106,0,0,0,0,0,0,0,0,960,157,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum191einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,960,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9627486437613015,2763.5860615569904,0.006495979394753359,0.9994886298578627,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +192,Conv2d-GroupNorm192,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm192XnormGroupNormX,VPU,1,Memory,500,63,500,0,0,0,0,0,0,0,0,0,63,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm192XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,63,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.12549019607843137,0.22074197558770345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+193,Conv2d192Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d192Conv2dconv2d,MXU,1,Memory,10044,8490,10044,0,0,0,0,0,0,0,0,8490,1411,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d192Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,8490,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2079,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.9168458781362,2764.933900214058,0.4120173884213458,0.9999760941099668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +194,Conv2d-GroupNorm194,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm194XnormGroupNormX,VPU,1,Memory,500,63,500,0,0,0,0,0,0,0,0,0,63,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm194XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,63,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.12549019607843137,0.22074197558770345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +195,Conv2d194Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d194Conv2dconv2d,MXU,1,Memory,10044,8490,10044,0,0,0,0,0,0,0,0,8490,1411,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d194Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 
8]]",1,23961600,900,1,1280,64,11520,0,8490,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2079,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.9168458781362,2764.933900214058,0.4120173884213458,0.9999760941099668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +196,Time-Embed-MLP-Einsum196,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum196einsum,MXU,1,Memory,1106,960,1106,0,0,0,0,0,0,0,0,960,157,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum196einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,960,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9627486437613015,2763.5860615569904,0.006495979394753359,0.9994886298578627,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +197,Conv2d-GroupNorm197,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm197XnormGroupNormX,VPU,1,Memory,500,63,500,0,0,0,0,0,0,0,0,0,63,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm197XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,63,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.12549019607843137,0.22074197558770345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +198,Conv2d197Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d197Conv2dconv2d,MXU,1,Memory,10044,8490,10044,0,0,0,0,0,0,0,0,8490,1411,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d197Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,8490,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2079,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.9168458781362,2764.933900214058,0.4120173884213458,0.9999760941099668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +199,Conv2d-GroupNorm199,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm199XnormGroupNormX,VPU,1,Memory,500,63,500,0,0,0,0,0,0,0,0,0,63,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm199XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,63,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.12549019607843137,0.22074197558770345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +200,Conv2d199Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d199Conv2dconv2d,MXU,1,Memory,10044,8490,10044,0,0,0,0,0,0,0,0,8490,1411,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d199Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 
8]]",1,23961600,900,1,1280,64,11520,0,8490,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2079,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.9168458781362,2764.933900214058,0.4120173884213458,0.9999760941099668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +201,Time-Embed-MLP-Einsum201,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum201einsum,MXU,1,Memory,1106,960,1106,0,0,0,0,0,0,0,0,960,157,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum201einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,960,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9627486437613015,2763.5860615569904,0.006495979394753359,0.9994886298578627,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +202,Conv2d-GroupNorm202,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm202XnormGroupNormX,VPU,1,Memory,500,63,500,0,0,0,0,0,0,0,0,0,63,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm202XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,63,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.12549019607843137,0.22074197558770345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +203,Conv2d202Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d202Conv2dconv2d,MXU,1,Memory,10044,8490,10044,0,0,0,0,0,0,0,0,8490,1411,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d202Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,8490,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2079,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.9168458781362,2764.933900214058,0.4120173884213458,0.9999760941099668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +204,Conv2d-GroupNorm204,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm204XnormGroupNormX,VPU,1,Memory,500,63,500,0,0,0,0,0,0,0,0,0,63,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm204XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,63,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.12549019607843137,0.22074197558770345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +205,Conv2d204Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d204Conv2dconv2d,MXU,1,Memory,10044,8490,10044,0,0,0,0,0,0,0,0,8490,1411,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d204Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 
8]]",1,23961600,900,1,1280,64,11520,0,8490,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2079,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.9168458781362,2764.933900214058,0.4120173884213458,0.9999760941099668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +206,SpatialTransformer-Input_GroupNorm206,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm206XnormGroupNormX,VPU,1,Memory,500,63,500,0,0,0,0,0,0,0,0,0,63,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,SpatialTransformerInputGroupNorm206XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,63,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.12549019607843137,0.22074197558770345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +207,SpatialTransformer-Proj_in207,"XlaEinsum(a=1x64x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin207einsum,MXU,1,Memory,1215,960,1215,0,0,0,0,0,0,0,0,960,157,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,64,1280)]",209715200,SpatialTransformerProjin207einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 64, 1280], [1280, 1280], [1, 64, 1280]]",1,2916352,100,1,64,1280,1280,0,960,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,240,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,172.60510288065845,2762.9083076131687,0.37844560121664356,0.9992435108908386,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+208,BasicTransformerBlock-Input_layernorm208,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm208XnormLayerNormX,VPU,1,Memory,500,63,500,0,0,0,0,0,0,0,0,0,63,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,BasicTransformerBlockInputlayernorm208XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,63,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.12549019607843137,0.22074197558770345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +209,SelfAttention209-Q-209,"XlaEinsum(a=1x64x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention209Q209MatMulQ,MXU,1,Memory,1215,960,1215,0,0,0,0,0,0,0,0,960,157,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,64,8,160)]",209715200,SelfAttention209Q209MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 64, 1280], [1280, 8, 160], [1, 64, 8, 160]]",1,2916352,100,1,64,1280,1280,0,960,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,240,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,172.60510288065845,2762.9083076131687,0.37844560121664356,0.9992435108908386,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +209,SelfAttention209-K-209,"XlaEinsum(a=1x64x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention209K209MatMulK,MXU,1,Memory,1215,960,1215,0,0,0,0,0,0,0,0,960,157,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,64,8,160)]",209715200,SelfAttention209K209MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 64, 1280], [1280, 8, 160], [1, 64, 8, 
160]]",1,2916352,100,1,64,1280,1280,0,960,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,240,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,172.60510288065845,2762.9083076131687,0.37844560121664356,0.9992435108908386,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +209,SelfAttention209-V-209,"XlaEinsum(a=1x64x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention209V209MatMulV,MXU,1,Memory,1215,960,1215,0,0,0,0,0,0,0,0,960,157,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,64,8,160)]",209715200,SelfAttention209V209MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 64, 1280], [1280, 8, 160], [1, 64, 8, 160]]",1,2916352,100,1,64,1280,1280,0,960,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,240,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,172.60510288065845,2762.9083076131687,0.37844560121664356,0.9992435108908386,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +210,SelfAttention209-FlashAttention-210,"FlashAttention(q=1x64x8x160,k=1x64x8x160,v=1x64x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention209FlashAttention210FlashAttention,MXU,1,Memory,3753,340,3753,0,0,0,0,0,0,0,0,340,62,0,0,11141120,"DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[1,64,8,160]","[DT_BFLOAT16:(1,64,8,8)]",1179648,SelfAttention209FlashAttention210FlashAttention,FlashAttention,0,[],FlashAttention,,"[64, 64]",,11141120,32,8,64,64,160,12,340,11141120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,305,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.3143213429256595,2764.7153110844656,0.0006891657755968553,0.9998970383668954,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+211,SelfAttention209-Attention_output-211,"XlaEinsum(a=1x64x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention209Attentionoutput211MatMulattnOutputattnAvgWo,MXU,1,Memory,1215,960,1215,0,0,0,0,0,0,0,0,960,157,0,0,3604480,"DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,64,1280)]",209715200,SelfAttention209Attentionoutput211MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 64, 8, 160], [8, 160, 1280], [1, 64, 1280]]",1,2916352,100,1,64,1280,1280,0,960,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,240,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,172.60510288065845,2762.9083076131687,0.37844560121664356,0.9992435108908386,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +212,SelfAttention209-Attention_layernorm-212,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention209Attentionlayernorm212YnormLayerNormy,VPU,1,Memory,500,63,500,0,0,0,0,0,0,0,0,0,63,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,SelfAttention209Attentionlayernorm212YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,63,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.12549019607843137,0.22074197558770345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +213,GatedSelfAttention-Linear213,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear213XLinearcontext,MXU,1,Memory,674,584,674,0,0,0,0,0,0,0,0,584,94,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear213XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 
1280]]",1,1998848,60,1,8,1280,768,0,584,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,141,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,23.336261127596437,2761.97665522997,0.05116595758288819,0.9989065660867885,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +214,GatedSelfAttention-Attn213-Q-214,"XlaEinsum(a=1x72x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213Q214MatMulQ,MXU,1,Memory,1228,960,1228,0,0,0,0,0,0,0,0,960,157,0,0,3645440,"DT_BFLOAT16:[1,72,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,72,8,160)]",235929600,GatedSelfAttentionAttn213Q214MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 72, 1280], [1280, 8, 160], [1, 72, 8, 160]]",1,2953216,100,1,72,1280,1280,0,960,3645440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,241,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,192.12508143322475,2764.7235882787054,0.42124416218485305,0.999900031927199,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +214,GatedSelfAttention-Attn213-K-214,"XlaEinsum(a=1x72x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213K214MatMulK,MXU,1,Memory,1228,960,1228,0,0,0,0,0,0,0,0,960,157,0,0,3645440,"DT_BFLOAT16:[1,72,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,72,8,160)]",235929600,GatedSelfAttentionAttn213K214MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 72, 1280], [1280, 8, 160], [1, 72, 8, 160]]",1,2953216,100,1,72,1280,1280,0,960,3645440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,241,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,192.12508143322475,2764.7235882787054,0.42124416218485305,0.999900031927199,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+214,GatedSelfAttention-Attn213-V-214,"XlaEinsum(a=1x72x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213V214MatMulV,MXU,1,Memory,1228,960,1228,0,0,0,0,0,0,0,0,960,157,0,0,3645440,"DT_BFLOAT16:[1,72,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,72,8,160)]",235929600,GatedSelfAttentionAttn213V214MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 72, 1280], [1280, 8, 160], [1, 72, 8, 160]]",1,2953216,100,1,72,1280,1280,0,960,3645440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,241,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,192.12508143322475,2764.7235882787054,0.42124416218485305,0.999900031927199,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +215,GatedSelfAttention-Attn213-FlashAttention-215,"FlashAttention(q=1x72x8x160,k=1x72x8x160,v=1x72x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn213FlashAttention215FlashAttention,MXU,1,Memory,4719,340,4719,0,0,0,0,0,0,0,0,340,65,0,0,14008320,"DT_BFLOAT16:[1,72,8,160],DT_BFLOAT16:[1,72,8,160],DT_BFLOAT16:[1,72,8,160]","[DT_BFLOAT16:(1,72,8,8)]",1492992,GatedSelfAttentionAttn213FlashAttention215FlashAttention,FlashAttention,0,[],FlashAttention,,"[72, 72]",,14008320,32,8,72,72,160,15,340,14008320,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.31637889383343926,2764.6248460346474,0.0006936770622119849,0.9998643204465271,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+216,GatedSelfAttention-Attn213-Attention_output-216,"XlaEinsum(a=1x72x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213Attentionoutput216MatMulattnOutputattnAvgWo,MXU,1,Memory,1228,960,1228,0,0,0,0,0,0,0,0,960,157,0,0,3645440,"DT_BFLOAT16:[1,72,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,72,1280)]",235929600,GatedSelfAttentionAttn213Attentionoutput216MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 72, 8, 160], [8, 160, 1280], [1, 72, 1280]]",1,2953216,100,1,72,1280,1280,0,960,3645440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,241,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,192.12508143322475,2764.7235882787054,0.42124416218485305,0.999900031927199,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +217,GatedSelfAttention-Attn213-Attention_layernorm-217,"LayerNorm(x=1x72x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213Attentionlayernorm217YnormLayerNormy,VPU,1,Memory,500,71,500,0,0,0,0,0,0,0,0,0,71,0,0,368640,"DT_BFLOAT16:[1,72,1280]","[DT_BFLOAT16:(1,72,1280)]",737280,GatedSelfAttentionAttn213Attentionlayernorm217YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,71,368640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,50,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.47456,686.6455078125,0.1411764705882353,0.24833472253616637,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +218,GatedSelfAttention-FFN213Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x64x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN213FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,4691,3784,4691,0,0,0,0,0,0,0,0,3784,627,0,0,13926400,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,64,5120)]",838860800,GatedSelfAttentionFFN213FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 64, 1280], [1280, 5120], [1, 
64, 5120]]",1,11272192,400,1,64,5120,1280,0,3784,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,941,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,178.82344915796207,2764.8626525527607,0.39207964653866717,0.9999503264205283,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +219,GatedSelfAttention-FFN213Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x64x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN213FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,4691,3784,4691,0,0,0,0,0,0,0,0,3784,627,0,0,13926400,"DT_BFLOAT16:[1,64,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,64,1280)]",838860800,GatedSelfAttentionFFN213FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 64, 5120], [5120, 1280], [1, 64, 1280]]",1,2916352,400,1,64,1280,5120,0,3784,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,941,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,178.82344915796207,2764.8626525527607,0.39207964653866717,0.9999503264205283,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +220,GatedSelfAttention-FFN213Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x64x1280,b=1x64x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN213FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,9,500,0,0,0,0,0,0,0,0,0,9,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",81920,GatedSelfAttentionFFN213FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,9,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16384,610.3515625,0.01568627450980392,0.22074197558770345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+221,BasicTransformerBlock-Fuser_output_layernorm221,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm221XnormLayerNormX,VPU,1,Memory,500,63,500,0,0,0,0,0,0,0,0,0,63,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,BasicTransformerBlockFuseroutputlayernorm221XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,63,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.12549019607843137,0.22074197558770345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +222,CrossAttention222-Q-222,"XlaEinsum(a=1x64x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention222Q222MatMulQ,MXU,1,Memory,1215,960,1215,0,0,0,0,0,0,0,0,960,157,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,64,8,160)]",209715200,CrossAttention222Q222MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 64, 1280], [1280, 8, 160], [1, 64, 8, 160]]",1,2916352,100,1,64,1280,1280,0,960,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,240,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,172.60510288065845,2762.9083076131687,0.37844560121664356,0.9992435108908386,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +222,CrossAttention222-K-222,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention222K222MatMulK,MXU,1,Compute,2278,2278,1369,0,0,0,0,0,0,0,0,2278,376,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention222K222MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 
160]]",1,4063232,240,1,512,1280,768,0,2278,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,470,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.8933099209833,1661.1851130377524,0.9688739009198705,0.6007902759630208,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +222,CrossAttention222-V-222,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention222V222MatMulV,MXU,1,Compute,2278,2278,1369,0,0,0,0,0,0,0,0,2278,376,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention222V222MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,2278,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,470,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.8933099209833,1661.1851130377524,0.9688739009198705,0.6007902759630208,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +223,CrossAttention222-FlashAttention-223,"FlashAttention(q=1x64x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention222FlashAttention223FlashAttention,MXU,1,Memory,29249,1244,29249,0,0,0,0,0,0,0,0,1244,300,0,0,86835200,"DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,64,8,8)]",9437184,CrossAttention222FlashAttention223FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 64]",,86835200,128,8,512,64,160,100,1244,86835200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2143,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.3226497999931622,2764.934939015009,0.0007074263477903512,0.9999764698065132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+224,CrossAttention222-Attention_output-224,"XlaEinsum(a=1x64x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention222Attentionoutput224MatMulattnOutputattnAvgWo,MXU,1,Memory,1215,960,1215,0,0,0,0,0,0,0,0,960,157,0,0,3604480,"DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,64,1280)]",209715200,CrossAttention222Attentionoutput224MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 64, 8, 160], [8, 160, 1280], [1, 64, 1280]]",1,2916352,100,1,64,1280,1280,0,960,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,240,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,172.60510288065845,2762.9083076131687,0.37844560121664356,0.9992435108908386,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +225,CrossAttention222-Attention_layernorm-225,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention222Attentionlayernorm225YnormLayerNormy,VPU,1,Memory,500,63,500,0,0,0,0,0,0,0,0,0,63,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,CrossAttention222Attentionlayernorm225YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,63,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.12549019607843137,0.22074197558770345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+226,BasicTransformerBlock-Attn_output_layernorm226,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm226XnormLayerNormX,VPU,1,Memory,500,63,500,0,0,0,0,0,0,0,0,0,63,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,BasicTransformerBlockAttnoutputlayernorm226XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,63,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.12549019607843137,0.22074197558770345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +227,BasicTransformerBlock-FFN227Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x64x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN227FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,4691,3784,4691,0,0,0,0,0,0,0,0,3784,627,0,0,13926400,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,64,5120)]",838860800,BasicTransformerBlockFFN227FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 64, 1280], [1280, 5120], [1, 64, 5120]]",1,11272192,400,1,64,5120,1280,0,3784,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,941,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,178.82344915796207,2764.8626525527607,0.39207964653866717,0.9999503264205283,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +228,BasicTransformerBlock-FFN227Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x64x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN227FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,4691,3784,4691,0,0,0,0,0,0,0,0,3784,627,0,0,13926400,"DT_BFLOAT16:[1,64,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,64,1280)]",838860800,BasicTransformerBlockFFN227FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 64, 5120], [5120, 1280], [1, 64, 
1280]]",1,2916352,400,1,64,1280,5120,0,3784,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,941,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,178.82344915796207,2764.8626525527607,0.39207964653866717,0.9999503264205283,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +229,BasicTransformerBlock-FFN227Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x64x1280,b=1x64x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN227FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,9,500,0,0,0,0,0,0,0,0,0,9,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",81920,BasicTransformerBlockFFN227FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,9,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16384,610.3515625,0.01568627450980392,0.22074197558770345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +230,SpatialTransformer-Proj_out230,"XlaEinsum(a=1x64x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout230einsum,MXU,1,Memory,1215,960,1215,0,0,0,0,0,0,0,0,960,157,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,64,1280)]",209715200,SpatialTransformerProjout230einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 64, 1280], [1280, 1280], [1, 64, 1280]]",1,2916352,100,1,64,1280,1280,0,960,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,240,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,172.60510288065845,2762.9083076131687,0.37844560121664356,0.9992435108908386,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+231,Time-Embed-MLP-Einsum231,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum231einsum,MXU,1,Memory,1106,960,1106,0,0,0,0,0,0,0,0,960,157,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum231einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,960,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9627486437613015,2763.5860615569904,0.006495979394753359,0.9994886298578627,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +232,Conv2d-GroupNorm232,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm232XnormGroupNormX,VPU,1,Memory,500,63,500,0,0,0,0,0,0,0,0,0,63,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm232XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,63,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.12549019607843137,0.22074197558770345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +233,Conv2d232Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d232Conv2dconv2d,MXU,1,Memory,10044,8490,10044,0,0,0,0,0,0,0,0,8490,1411,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d232Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 
8]]",1,23961600,900,1,1280,64,11520,0,8490,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2079,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.9168458781362,2764.933900214058,0.4120173884213458,0.9999760941099668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +234,Conv2d-GroupNorm234,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm234XnormGroupNormX,VPU,1,Memory,500,63,500,0,0,0,0,0,0,0,0,0,63,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm234XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,63,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.12549019607843137,0.22074197558770345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +235,Conv2d234Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d234Conv2dconv2d,MXU,1,Memory,10044,8490,10044,0,0,0,0,0,0,0,0,8490,1411,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d234Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,8490,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2079,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.9168458781362,2764.933900214058,0.4120173884213458,0.9999760941099668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+236,Time-Embed-MLP-Einsum236,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum236einsum,MXU,1,Memory,1106,960,1106,0,0,0,0,0,0,0,0,960,157,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum236einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,960,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9627486437613015,2763.5860615569904,0.006495979394753359,0.9994886298578627,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +237,Conv2d-GroupNorm237,"GroupNorm(x=1x2560x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm237XnormGroupNormX,VPU,1,Memory,500,126,500,0,0,0,0,0,0,0,0,0,126,0,0,655360,"DT_BFLOAT16:[1,2560,8,8]","[DT_BFLOAT16:(1,2560,8,8)]",1310720,Conv2dGroupNorm237XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,126,655360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.62144,1220.703125,0.25098039215686274,0.4414839511754069,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +238,Conv2d237Conv2d,"Conv2D(a=1x2560x8x8,b=2560x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d237Conv2dconv2d,MXU,1,Memory,20033,16960,20033,0,0,0,0,0,0,0,0,16960,2823,0,0,59473920,"DT_BFLOAT16:[1,2560,8,8],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",3774873600,Conv2d237Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 2560, 8, 8], [2560, 1280, 3, 3], [1, 1280, 8, 
8]]",1,23961600,1800,1,1280,64,23040,0,16960,59473920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4152,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,188.43276593620524,2764.9081164516047,0.4131485697902457,0.9999667690602548,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +239,Conv2d-GroupNorm239,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm239XnormGroupNormX,VPU,1,Memory,500,63,500,0,0,0,0,0,0,0,0,0,63,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm239XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,63,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.12549019607843137,0.22074197558770345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +240,Conv2d239Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d239Conv2dconv2d,MXU,1,Memory,10044,8490,10044,0,0,0,0,0,0,0,0,8490,1411,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d239Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,8490,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2079,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.9168458781362,2764.933900214058,0.4120173884213458,0.9999760941099668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+241,SkipConnection-Einsum236,"XlaEinsum(a=1x8x8x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum236einsum,MXU,1,Memory,2373,1902,2373,0,0,0,0,0,0,0,0,1902,314,0,0,7045120,"DT_BFLOAT16:[1,8,8,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,8,8,1280)]",419430400,SkipConnectionEinsum236einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 8, 8, 2560], [2560, 1280], [1, 8, 8, 1280]]",1,2916352,200,1,64,1280,2560,0,1902,7045120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,474,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,176.7511167298778,2764.9723122102823,0.38753595067696744,0.9999899863328326,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +242,Time-Embed-MLP-Einsum242,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum242einsum,MXU,1,Memory,1106,960,1106,0,0,0,0,0,0,0,0,960,157,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum242einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,960,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9627486437613015,2763.5860615569904,0.006495979394753359,0.9994886298578627,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+243,Conv2d-GroupNorm243,"GroupNorm(x=1x2560x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm243XnormGroupNormX,VPU,1,Memory,500,126,500,0,0,0,0,0,0,0,0,0,126,0,0,655360,"DT_BFLOAT16:[1,2560,8,8]","[DT_BFLOAT16:(1,2560,8,8)]",1310720,Conv2dGroupNorm243XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,126,655360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.62144,1220.703125,0.25098039215686274,0.4414839511754069,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +244,Conv2d243Conv2d,"Conv2D(a=1x2560x8x8,b=2560x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d243Conv2dconv2d,MXU,1,Memory,20033,16960,20033,0,0,0,0,0,0,0,0,16960,2823,0,0,59473920,"DT_BFLOAT16:[1,2560,8,8],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",3774873600,Conv2d243Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 2560, 8, 8], [2560, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,1800,1,1280,64,23040,0,16960,59473920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4152,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,188.43276593620524,2764.9081164516047,0.4131485697902457,0.9999667690602548,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +245,Conv2d-GroupNorm245,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm245XnormGroupNormX,VPU,1,Memory,500,63,500,0,0,0,0,0,0,0,0,0,63,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm245XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,63,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.12549019607843137,0.22074197558770345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+246,Conv2d245Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d245Conv2dconv2d,MXU,1,Memory,10044,8490,10044,0,0,0,0,0,0,0,0,8490,1411,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d245Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,8490,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2079,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.9168458781362,2764.933900214058,0.4120173884213458,0.9999760941099668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +247,SkipConnection-Einsum242,"XlaEinsum(a=1x8x8x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum242einsum,MXU,1,Memory,2373,1902,2373,0,0,0,0,0,0,0,0,1902,314,0,0,7045120,"DT_BFLOAT16:[1,8,8,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,8,8,1280)]",419430400,SkipConnectionEinsum242einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 8, 8, 2560], [2560, 1280], [1, 8, 8, 1280]]",1,2916352,200,1,64,1280,2560,0,1902,7045120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,474,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,176.7511167298778,2764.9723122102823,0.38753595067696744,0.9999899863328326,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +248,Time-Embed-MLP-Einsum248,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum248einsum,MXU,1,Memory,1106,960,1106,0,0,0,0,0,0,0,0,960,157,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum248einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 
1280]]",1,2626048,100,1,1,1280,1280,0,960,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9627486437613015,2763.5860615569904,0.006495979394753359,0.9994886298578627,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +249,Conv2d-GroupNorm249,"GroupNorm(x=1x2560x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm249XnormGroupNormX,VPU,1,Memory,500,126,500,0,0,0,0,0,0,0,0,0,126,0,0,655360,"DT_BFLOAT16:[1,2560,8,8]","[DT_BFLOAT16:(1,2560,8,8)]",1310720,Conv2dGroupNorm249XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,126,655360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.62144,1220.703125,0.25098039215686274,0.4414839511754069,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +250,Conv2d249Conv2d,"Conv2D(a=1x2560x8x8,b=2560x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d249Conv2dconv2d,MXU,1,Memory,20033,16960,20033,0,0,0,0,0,0,0,0,16960,2823,0,0,59473920,"DT_BFLOAT16:[1,2560,8,8],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",3774873600,Conv2d249Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 2560, 8, 8], [2560, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,1800,1,1280,64,23040,0,16960,59473920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4152,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,188.43276593620524,2764.9081164516047,0.4131485697902457,0.9999667690602548,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+251,Conv2d-GroupNorm251,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm251XnormGroupNormX,VPU,1,Memory,500,63,500,0,0,0,0,0,0,0,0,0,63,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm251XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,63,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.12549019607843137,0.22074197558770345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +252,Conv2d251Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d251Conv2dconv2d,MXU,1,Memory,10044,8490,10044,0,0,0,0,0,0,0,0,8490,1411,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d251Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,8490,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2079,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.9168458781362,2764.933900214058,0.4120173884213458,0.9999760941099668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +253,SkipConnection-Einsum248,"XlaEinsum(a=1x8x8x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum248einsum,MXU,1,Memory,2373,1902,2373,0,0,0,0,0,0,0,0,1902,314,0,0,7045120,"DT_BFLOAT16:[1,8,8,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,8,8,1280)]",419430400,SkipConnectionEinsum248einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 8, 8, 2560], [2560, 1280], [1, 8, 8, 
1280]]",1,2916352,200,1,64,1280,2560,0,1902,7045120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,474,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,176.7511167298778,2764.9723122102823,0.38753595067696744,0.9999899863328326,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +254,Upsample254,"Upsample(a=1x1280x8x8,scale_factor=2,memory_placements=0_0_0,type=DT_BFLOAT16)",Upsample254Upsample,VPU,1,Memory,500,0,500,0,0,0,0,0,0,0,0,0,0,0,0,819200,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,16,16)]",0,Upsample254Upsample,Upsample,0,[],Upsample,,,,,0,,,,,0,0,819200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,1525.87890625,0.0,0.5518549389692586,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +254,Upsample-Conv2d254Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",UpsampleConv2d254Conv2dconv2d,MXU,1,Memory,10044,8490,10044,0,0,0,0,0,0,0,0,8490,1411,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,UpsampleConv2d254Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,23961600,900,1,1280,64,11520,0,8490,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2079,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,187.9168458781362,2764.933900214058,0.4120173884213458,0.9999760941099668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+255,Time-Embed-MLP-Einsum255,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum255einsum,MXU,1,Memory,1106,960,1106,0,0,0,0,0,0,0,0,960,157,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum255einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,960,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9627486437613015,2763.5860615569904,0.006495979394753359,0.9994886298578627,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +256,Conv2d-GroupNorm256,"GroupNorm(x=1x2560x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm256XnormGroupNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,2560,16,16]","[DT_BFLOAT16:(1,2560,16,16)]",5242880,Conv2dGroupNorm256XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +257,Conv2d256Conv2d,"Conv2D(a=1x2560x16x16,b=2560x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d256Conv2dconv2d,MXU,1,Compute,33902,33902,20530,0,0,0,0,0,0,0,0,33902,5647,0,0,60948480,"DT_BFLOAT16:[1,2560,16,16],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",15099494400,Conv2d256Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 2560, 16, 16], [2560, 1280, 3, 3], [1, 1280, 16, 
16]]",1,24911872,3600,1,1280,256,23040,0,33902,60948480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7009,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.38653766739424,1674.3170111645331,0.9765329831405809,0.6055396062077878,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +258,Conv2d-GroupNorm258,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm258XnormGroupNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm258XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +259,Conv2d258Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d258Conv2dconv2d,MXU,1,Compute,16960,16960,10375,0,0,0,0,0,0,0,0,16960,2823,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d258Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,16960,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3513,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.15018867924533,1691.4223724941037,0.9760147757792447,0.6117259936687536,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+260,SkipConnection-Einsum255,"XlaEinsum(a=1x16x16x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum255einsum,MXU,1,Compute,3784,3784,2870,0,0,0,0,0,0,0,0,3784,627,0,0,8519680,"DT_BFLOAT16:[1,16,16,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,16,16,1280)]",1677721600,SkipConnectionEinsum255einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 16, 16, 2560], [2560, 1280], [1, 16, 16, 1280]]",1,3801088,400,1,256,1280,2560,0,3784,8519680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,820,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,443.3725158562368,2096.8737612315012,0.9721171363175939,0.7583630239535267,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +261,SpatialTransformer-Input_GroupNorm261,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm261XnormGroupNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm261XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +262,SpatialTransformer-Proj_in262,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin262einsum,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin262einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 
1280]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +263,BasicTransformerBlock-Input_layernorm263,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm263XnormLayerNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm263XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +264,SelfAttention264-Q-264,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention264Q264MatMulQ,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention264Q264MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+264,SelfAttention264-K-264,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention264K264MatMulK,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention264K264MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +264,SelfAttention264-V-264,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention264V264MatMulV,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention264V264MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +265,SelfAttention264-FlashAttention-265,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention264FlashAttention265FlashAttention,MXU,1,Compute,1244,1244,883,0,0,0,0,0,0,0,0,1244,401,0,0,2621440,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention264FlashAttention265FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 
128]",,335872,128,8,256,256,160,201,1244,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,265,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.17232154340836,1962.5452170418007,0.03326609846707393,0.7097812719861847,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +266,SelfAttention264-Attention_output-266,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention264Attentionoutput266MatMulattnOutputattnAvgWo,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention264Attentionoutput266MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +267,SelfAttention264-Attention_layernorm-267,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention264Attentionlayernorm267YnormLayerNormy,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention264Attentionlayernorm267YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+268,GatedSelfAttention-Linear268,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear268XLinearcontext,MXU,1,Memory,674,584,674,0,0,0,0,0,0,0,0,584,94,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear268XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 1280]]",1,1998848,60,1,8,1280,768,0,584,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,141,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,23.336261127596437,2761.97665522997,0.05116595758288819,0.9989065660867885,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +269,GatedSelfAttention-Attn268-Q-269,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268Q269MatMulQ,MXU,1,Compute,2843,2843,1559,0,0,0,0,0,0,0,0,2843,470,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn268Q269MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,2843,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,577,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,304.28251846640876,1516.2180478917517,0.6671551345753308,0.5483609576462032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +269,GatedSelfAttention-Attn268-K-269,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268K269MatMulK,MXU,1,Compute,2843,2843,1559,0,0,0,0,0,0,0,0,2843,470,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn268K269MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 
160]]",1,3837952,300,1,264,1280,1280,0,2843,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,577,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,304.28251846640876,1516.2180478917517,0.6671551345753308,0.5483609576462032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +269,GatedSelfAttention-Attn268-V-269,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268V269MatMulV,MXU,1,Compute,2843,2843,1559,0,0,0,0,0,0,0,0,2843,470,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn268V269MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,2843,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,577,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,304.28251846640876,1516.2180478917517,0.6671551345753308,0.5483609576462032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +270,GatedSelfAttention-Attn268-FlashAttention-270,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn268FlashAttention270FlashAttention,MXU,1,Compute,2750,2750,911,0,0,0,0,0,0,0,0,2750,663,0,0,2703360,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn268FlashAttention270FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 128]",,345088,288,8,264,264,160,213,2750,2703360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,518,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.299072,915.52734375,0.016003592276605298,0.33111296338155516,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+271,GatedSelfAttention-Attn268-Attention_output-271,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268Attentionoutput271MatMulattnOutputattnAvgWo,MXU,1,Compute,2843,2843,1559,0,0,0,0,0,0,0,0,2843,470,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn268Attentionoutput271MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,3837952,300,1,264,1280,1280,0,2843,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,577,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,304.28251846640876,1516.2180478917517,0.6671551345753308,0.5483609576462032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +272,GatedSelfAttention-Attn268-Attention_layernorm-272,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268Attentionlayernorm272YnormLayerNormy,VPU,1,Memory,500,259,500,0,0,0,0,0,0,0,0,0,259,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn268Attentionlayernorm272YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,259,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,97,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.40672,2517.7001953125,0.5176470588235293,0.9105606492992767,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +273,GatedSelfAttention-FFN268Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN268FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,7549,7549,5519,0,0,0,0,0,0,0,0,7549,1255,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN268FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 
1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,7549,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1623,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,2021.2993856802225,0.9745638478807193,0.7310305192333535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +274,GatedSelfAttention-FFN268Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN268FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,7549,7549,5519,0,0,0,0,0,0,0,0,7549,1255,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN268FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,7549,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1623,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,2021.2993856802225,0.9745638478807193,0.7310305192333535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +275,GatedSelfAttention-FFN268Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN268FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,32,500,0,0,0,0,0,0,0,0,0,32,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN268FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,32,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,41,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.65536,2441.40625,0.06274509803921569,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+276,BasicTransformerBlock-Fuser_output_layernorm276,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm276XnormLayerNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm276XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +277,CrossAttention277-Q-277,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention277Q277MatMulQ,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention277Q277MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +277,CrossAttention277-K-277,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention277K277MatMulK,MXU,1,Compute,2278,2278,1369,0,0,0,0,0,0,0,0,2278,376,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention277K277MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 
160]]",1,4063232,240,1,512,1280,768,0,2278,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,470,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.8933099209833,1661.1851130377524,0.9688739009198705,0.6007902759630208,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +277,CrossAttention277-V-277,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention277V277MatMulV,MXU,1,Compute,2278,2278,1369,0,0,0,0,0,0,0,0,2278,376,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention277V277MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,2278,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,470,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.8933099209833,1661.1851130377524,0.9688739009198705,0.6007902759630208,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +278,CrossAttention277-FlashAttention-278,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention277FlashAttention278FlashAttention,MXU,1,Compute,2448,2448,1325,0,0,0,0,0,0,0,0,2448,803,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention277FlashAttention278FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,256,8,512,256,160,401,2448,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,495,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.420235294117646,1495.959712009804,0.03380966216751631,0.54103425389143,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+279,CrossAttention277-Attention_output-279,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention277Attentionoutput279MatMulattnOutputattnAvgWo,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention277Attentionoutput279MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +280,CrossAttention277-Attention_layernorm-280,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention277Attentionlayernorm280YnormLayerNormy,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention277Attentionlayernorm280YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+281,BasicTransformerBlock-Attn_output_layernorm281,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm281XnormLayerNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm281XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +282,BasicTransformerBlock-FFN282Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN282FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,7549,7549,5519,0,0,0,0,0,0,0,0,7549,1255,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN282FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,7549,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1623,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,2021.2993856802225,0.9745638478807193,0.7310305192333535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +283,BasicTransformerBlock-FFN282Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN282FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,7549,7549,5519,0,0,0,0,0,0,0,0,7549,1255,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN282FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 
1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,7549,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1623,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,2021.2993856802225,0.9745638478807193,0.7310305192333535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +284,BasicTransformerBlock-FFN282Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN282FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,32,500,0,0,0,0,0,0,0,0,0,32,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN282FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,32,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,41,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.65536,2441.40625,0.06274509803921569,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +285,SpatialTransformer-Proj_out285,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout285einsum,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout285einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+286,Time-Embed-MLP-Einsum286,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum286einsum,MXU,1,Memory,1106,960,1106,0,0,0,0,0,0,0,0,960,157,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum286einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,960,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9627486437613015,2763.5860615569904,0.006495979394753359,0.9994886298578627,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +287,Conv2d-GroupNorm287,"GroupNorm(x=1x2560x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm287XnormGroupNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,2560,16,16]","[DT_BFLOAT16:(1,2560,16,16)]",5242880,Conv2dGroupNorm287XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +288,Conv2d287Conv2d,"Conv2D(a=1x2560x16x16,b=2560x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d287Conv2dconv2d,MXU,1,Compute,33902,33902,20530,0,0,0,0,0,0,0,0,33902,5647,0,0,60948480,"DT_BFLOAT16:[1,2560,16,16],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",15099494400,Conv2d287Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 2560, 16, 16], [2560, 1280, 3, 3], [1, 1280, 16, 
16]]",1,24911872,3600,1,1280,256,23040,0,33902,60948480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7009,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.38653766739424,1674.3170111645331,0.9765329831405809,0.6055396062077878,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +289,Conv2d-GroupNorm289,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm289XnormGroupNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm289XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +290,Conv2d289Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d289Conv2dconv2d,MXU,1,Compute,16960,16960,10375,0,0,0,0,0,0,0,0,16960,2823,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d289Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,16960,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3513,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.15018867924533,1691.4223724941037,0.9760147757792447,0.6117259936687536,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+291,SkipConnection-Einsum286,"XlaEinsum(a=1x16x16x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum286einsum,MXU,1,Compute,3784,3784,2870,0,0,0,0,0,0,0,0,3784,627,0,0,8519680,"DT_BFLOAT16:[1,16,16,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,16,16,1280)]",1677721600,SkipConnectionEinsum286einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 16, 16, 2560], [2560, 1280], [1, 16, 16, 1280]]",1,3801088,400,1,256,1280,2560,0,3784,8519680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,820,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,443.3725158562368,2096.8737612315012,0.9721171363175939,0.7583630239535267,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +292,SpatialTransformer-Input_GroupNorm292,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm292XnormGroupNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm292XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +293,SpatialTransformer-Proj_in293,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin293einsum,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin293einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 
1280]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +294,BasicTransformerBlock-Input_layernorm294,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm294XnormLayerNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm294XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +295,SelfAttention295-Q-295,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention295Q295MatMulQ,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention295Q295MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+295,SelfAttention295-K-295,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention295K295MatMulK,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention295K295MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +295,SelfAttention295-V-295,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention295V295MatMulV,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention295V295MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +296,SelfAttention295-FlashAttention-296,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention295FlashAttention296FlashAttention,MXU,1,Compute,1244,1244,883,0,0,0,0,0,0,0,0,1244,401,0,0,2621440,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention295FlashAttention296FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 
128]",,335872,128,8,256,256,160,201,1244,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,265,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.17232154340836,1962.5452170418007,0.03326609846707393,0.7097812719861847,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +297,SelfAttention295-Attention_output-297,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention295Attentionoutput297MatMulattnOutputattnAvgWo,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention295Attentionoutput297MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +298,SelfAttention295-Attention_layernorm-298,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention295Attentionlayernorm298YnormLayerNormy,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention295Attentionlayernorm298YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+299,GatedSelfAttention-Linear299,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear299XLinearcontext,MXU,1,Memory,674,584,674,0,0,0,0,0,0,0,0,584,94,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear299XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 1280]]",1,1998848,60,1,8,1280,768,0,584,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,141,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,23.336261127596437,2761.97665522997,0.05116595758288819,0.9989065660867885,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +300,GatedSelfAttention-Attn299-Q-300,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299Q300MatMulQ,MXU,1,Compute,2843,2843,1559,0,0,0,0,0,0,0,0,2843,470,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn299Q300MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,2843,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,577,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,304.28251846640876,1516.2180478917517,0.6671551345753308,0.5483609576462032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +300,GatedSelfAttention-Attn299-K-300,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299K300MatMulK,MXU,1,Compute,2843,2843,1559,0,0,0,0,0,0,0,0,2843,470,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn299K300MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 
160]]",1,3837952,300,1,264,1280,1280,0,2843,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,577,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,304.28251846640876,1516.2180478917517,0.6671551345753308,0.5483609576462032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +300,GatedSelfAttention-Attn299-V-300,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299V300MatMulV,MXU,1,Compute,2843,2843,1559,0,0,0,0,0,0,0,0,2843,470,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn299V300MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,2843,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,577,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,304.28251846640876,1516.2180478917517,0.6671551345753308,0.5483609576462032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +301,GatedSelfAttention-Attn299-FlashAttention-301,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn299FlashAttention301FlashAttention,MXU,1,Compute,2750,2750,911,0,0,0,0,0,0,0,0,2750,663,0,0,2703360,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn299FlashAttention301FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 128]",,345088,288,8,264,264,160,213,2750,2703360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,518,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.299072,915.52734375,0.016003592276605298,0.33111296338155516,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+302,GatedSelfAttention-Attn299-Attention_output-302,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299Attentionoutput302MatMulattnOutputattnAvgWo,MXU,1,Compute,2843,2843,1559,0,0,0,0,0,0,0,0,2843,470,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn299Attentionoutput302MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,3837952,300,1,264,1280,1280,0,2843,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,577,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,304.28251846640876,1516.2180478917517,0.6671551345753308,0.5483609576462032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +303,GatedSelfAttention-Attn299-Attention_layernorm-303,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299Attentionlayernorm303YnormLayerNormy,VPU,1,Memory,500,259,500,0,0,0,0,0,0,0,0,0,259,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn299Attentionlayernorm303YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,259,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,97,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.40672,2517.7001953125,0.5176470588235293,0.9105606492992767,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +304,GatedSelfAttention-FFN299Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN299FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,7549,7549,5519,0,0,0,0,0,0,0,0,7549,1255,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN299FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 
1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,7549,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1623,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,2021.2993856802225,0.9745638478807193,0.7310305192333535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +305,GatedSelfAttention-FFN299Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN299FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,7549,7549,5519,0,0,0,0,0,0,0,0,7549,1255,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN299FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,7549,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1623,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,2021.2993856802225,0.9745638478807193,0.7310305192333535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +306,GatedSelfAttention-FFN299Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN299FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,32,500,0,0,0,0,0,0,0,0,0,32,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN299FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,32,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,41,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.65536,2441.40625,0.06274509803921569,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+307,BasicTransformerBlock-Fuser_output_layernorm307,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm307XnormLayerNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm307XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +308,CrossAttention308-Q-308,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention308Q308MatMulQ,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention308Q308MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +308,CrossAttention308-K-308,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention308K308MatMulK,MXU,1,Compute,2278,2278,1369,0,0,0,0,0,0,0,0,2278,376,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention308K308MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 
160]]",1,4063232,240,1,512,1280,768,0,2278,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,470,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.8933099209833,1661.1851130377524,0.9688739009198705,0.6007902759630208,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +308,CrossAttention308-V-308,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention308V308MatMulV,MXU,1,Compute,2278,2278,1369,0,0,0,0,0,0,0,0,2278,376,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention308V308MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,2278,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,470,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.8933099209833,1661.1851130377524,0.9688739009198705,0.6007902759630208,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +309,CrossAttention308-FlashAttention-309,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention308FlashAttention309FlashAttention,MXU,1,Compute,2448,2448,1325,0,0,0,0,0,0,0,0,2448,803,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention308FlashAttention309FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,256,8,512,256,160,401,2448,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,495,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.420235294117646,1495.959712009804,0.03380966216751631,0.54103425389143,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+310,CrossAttention308-Attention_output-310,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention308Attentionoutput310MatMulattnOutputattnAvgWo,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention308Attentionoutput310MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +311,CrossAttention308-Attention_layernorm-311,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention308Attentionlayernorm311YnormLayerNormy,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention308Attentionlayernorm311YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+312,BasicTransformerBlock-Attn_output_layernorm312,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm312XnormLayerNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm312XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +313,BasicTransformerBlock-FFN313Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN313FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,7549,7549,5519,0,0,0,0,0,0,0,0,7549,1255,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN313FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,7549,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1623,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,2021.2993856802225,0.9745638478807193,0.7310305192333535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +314,BasicTransformerBlock-FFN313Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN313FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,7549,7549,5519,0,0,0,0,0,0,0,0,7549,1255,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN313FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 
1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,7549,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1623,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,2021.2993856802225,0.9745638478807193,0.7310305192333535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +315,BasicTransformerBlock-FFN313Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN313FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,32,500,0,0,0,0,0,0,0,0,0,32,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN313FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,32,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,41,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.65536,2441.40625,0.06274509803921569,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +316,SpatialTransformer-Proj_out316,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout316einsum,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout316einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+317,Time-Embed-MLP-Einsum317,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum317einsum,MXU,1,Memory,1106,960,1106,0,0,0,0,0,0,0,0,960,157,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum317einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,2626048,100,1,1,1280,1280,0,960,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9627486437613015,2763.5860615569904,0.006495979394753359,0.9994886298578627,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +318,Conv2d-GroupNorm318,"GroupNorm(x=1x1920x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm318XnormGroupNormX,VPU,1,Memory,663,377,663,0,0,0,0,0,0,0,0,0,377,0,0,1966080,"DT_BFLOAT16:[1,1920,16,16]","[DT_BFLOAT16:(1,1920,16,16)]",3932160,Conv2dGroupNorm318XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,377,1966080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,138,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.930859728506787,2761.7717760180994,0.5678289415313637,0.99883246872264,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +319,Conv2d318Conv2d,"Conv2D(a=1x1920x16x16,b=1920x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d318Conv2dconv2d,MXU,1,Compute,25431,25431,15452,0,0,0,0,0,0,0,0,25431,4235,0,0,45875200,"DT_BFLOAT16:[1,1920,16,16],DT_BFLOAT16:[1920,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",11324620800,Conv2d318Conv2dconv2d,Conv2D,44236800,[],Conv2D,bf01;io01->bf01,"[[1, 1920, 16, 16], [1920, 1280, 3, 3], [1, 1280, 16, 
16]]",1,24911872,2700,1,1280,256,17280,0,25431,45875200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5261,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.307726790138,1680.0208161299201,0.9763601862224836,0.6076024651464449,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +320,Conv2d-GroupNorm320,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm320XnormGroupNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm320XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +321,Conv2d320Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d320Conv2dconv2d,MXU,1,Compute,16960,16960,10375,0,0,0,0,0,0,0,0,16960,2823,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d320Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,16960,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3513,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.15018867924533,1691.4223724941037,0.9760147757792447,0.6117259936687536,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+322,SkipConnection-Einsum317,"XlaEinsum(a=1x16x16x1920,b=1920x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum317einsum,MXU,1,Compute,2843,2843,2208,0,0,0,0,0,0,0,0,2843,470,0,0,6553600,"DT_BFLOAT16:[1,16,16,1920],DT_BFLOAT16:[1920,1280]","[DT_BFLOAT16:(1,16,16,1280)]",1258291200,SkipConnectionEinsum317einsum,Einsum,4915200,[],Einsum,"BHWC,CO->BHWO","[[1, 16, 16, 1920], [1920, 1280], [1, 16, 16, 1280]]",1,3801088,300,1,256,1280,1920,0,2843,6553600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,619,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,442.59275413295813,2146.857412944073,0.9704074684732082,0.776440294012323,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +323,SpatialTransformer-Input_GroupNorm323,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm323XnormGroupNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm323XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +324,SpatialTransformer-Proj_in324,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin324einsum,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin324einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 
1280]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +325,BasicTransformerBlock-Input_layernorm325,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm325XnormLayerNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm325XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +326,SelfAttention326-Q-326,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention326Q326MatMulQ,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention326Q326MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+326,SelfAttention326-K-326,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention326K326MatMulK,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention326K326MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +326,SelfAttention326-V-326,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention326V326MatMulV,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention326V326MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +327,SelfAttention326-FlashAttention-327,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention326FlashAttention327FlashAttention,MXU,1,Compute,1244,1244,883,0,0,0,0,0,0,0,0,1244,401,0,0,2621440,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention326FlashAttention327FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 
128]",,335872,128,8,256,256,160,201,1244,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,265,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.17232154340836,1962.5452170418007,0.03326609846707393,0.7097812719861847,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +328,SelfAttention326-Attention_output-328,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention326Attentionoutput328MatMulattnOutputattnAvgWo,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention326Attentionoutput328MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +329,SelfAttention326-Attention_layernorm-329,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention326Attentionlayernorm329YnormLayerNormy,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention326Attentionlayernorm329YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+330,GatedSelfAttention-Linear330,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear330XLinearcontext,MXU,1,Memory,674,584,674,0,0,0,0,0,0,0,0,584,94,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear330XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 1280]]",1,1998848,60,1,8,1280,768,0,584,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,141,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,23.336261127596437,2761.97665522997,0.05116595758288819,0.9989065660867885,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +331,GatedSelfAttention-Attn330-Q-331,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330Q331MatMulQ,MXU,1,Compute,2843,2843,1559,0,0,0,0,0,0,0,0,2843,470,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn330Q331MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,2843,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,577,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,304.28251846640876,1516.2180478917517,0.6671551345753308,0.5483609576462032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +331,GatedSelfAttention-Attn330-K-331,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330K331MatMulK,MXU,1,Compute,2843,2843,1559,0,0,0,0,0,0,0,0,2843,470,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn330K331MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 
160]]",1,3837952,300,1,264,1280,1280,0,2843,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,577,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,304.28251846640876,1516.2180478917517,0.6671551345753308,0.5483609576462032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +331,GatedSelfAttention-Attn330-V-331,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330V331MatMulV,MXU,1,Compute,2843,2843,1559,0,0,0,0,0,0,0,0,2843,470,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn330V331MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,3837952,300,1,264,1280,1280,0,2843,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,577,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,304.28251846640876,1516.2180478917517,0.6671551345753308,0.5483609576462032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +332,GatedSelfAttention-Attn330-FlashAttention-332,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn330FlashAttention332FlashAttention,MXU,1,Compute,2750,2750,911,0,0,0,0,0,0,0,0,2750,663,0,0,2703360,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn330FlashAttention332FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 128]",,345088,288,8,264,264,160,213,2750,2703360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,518,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.299072,915.52734375,0.016003592276605298,0.33111296338155516,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+333,GatedSelfAttention-Attn330-Attention_output-333,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330Attentionoutput333MatMulattnOutputattnAvgWo,MXU,1,Compute,2843,2843,1559,0,0,0,0,0,0,0,0,2843,470,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn330Attentionoutput333MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,3837952,300,1,264,1280,1280,0,2843,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,577,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,304.28251846640876,1516.2180478917517,0.6671551345753308,0.5483609576462032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +334,GatedSelfAttention-Attn330-Attention_layernorm-334,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330Attentionlayernorm334YnormLayerNormy,VPU,1,Memory,500,259,500,0,0,0,0,0,0,0,0,0,259,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn330Attentionlayernorm334YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,259,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,97,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.40672,2517.7001953125,0.5176470588235293,0.9105606492992767,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +335,GatedSelfAttention-FFN330Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN330FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,7549,7549,5519,0,0,0,0,0,0,0,0,7549,1255,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN330FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 
1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,7549,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1623,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,2021.2993856802225,0.9745638478807193,0.7310305192333535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +336,GatedSelfAttention-FFN330Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN330FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,7549,7549,5519,0,0,0,0,0,0,0,0,7549,1255,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN330FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,7549,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1623,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,2021.2993856802225,0.9745638478807193,0.7310305192333535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +337,GatedSelfAttention-FFN330Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN330FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,32,500,0,0,0,0,0,0,0,0,0,32,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN330FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,32,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,41,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.65536,2441.40625,0.06274509803921569,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+338,BasicTransformerBlock-Fuser_output_layernorm338,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm338XnormLayerNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm338XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +339,CrossAttention339-Q-339,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention339Q339MatMulQ,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention339Q339MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +339,CrossAttention339-K-339,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention339K339MatMulK,MXU,1,Compute,2278,2278,1369,0,0,0,0,0,0,0,0,2278,376,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention339K339MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 
160]]",1,4063232,240,1,512,1280,768,0,2278,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,470,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.8933099209833,1661.1851130377524,0.9688739009198705,0.6007902759630208,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +339,CrossAttention339-V-339,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention339V339MatMulV,MXU,1,Compute,2278,2278,1369,0,0,0,0,0,0,0,0,2278,376,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention339V339MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,240,1,512,1280,768,0,2278,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,470,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.8933099209833,1661.1851130377524,0.9688739009198705,0.6007902759630208,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +340,CrossAttention339-FlashAttention-340,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention339FlashAttention340FlashAttention,MXU,1,Compute,2448,2448,1325,0,0,0,0,0,0,0,0,2448,803,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention339FlashAttention340FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,630784,256,8,512,256,160,401,2448,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,495,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.420235294117646,1495.959712009804,0.03380966216751631,0.54103425389143,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+341,CrossAttention339-Attention_output-341,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention339Attentionoutput341MatMulattnOutputattnAvgWo,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention339Attentionoutput341MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +342,CrossAttention339-Attention_layernorm-342,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention339Attentionlayernorm342YnormLayerNormy,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention339Attentionlayernorm342YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+343,BasicTransformerBlock-Attn_output_layernorm343,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm343XnormLayerNormX,VPU,1,Memory,500,252,500,0,0,0,0,0,0,0,0,0,252,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm343XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,252,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.5019607843137255,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +344,BasicTransformerBlock-FFN344Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN344FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,7549,7549,5519,0,0,0,0,0,0,0,0,7549,1255,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN344FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,13631488,800,1,256,5120,1280,0,7549,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1623,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,2021.2993856802225,0.9745638478807193,0.7310305192333535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +345,BasicTransformerBlock-FFN344Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN344FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,7549,7549,5519,0,0,0,0,0,0,0,0,7549,1255,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN344FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 
1280], [1, 256, 1280]]",1,3801088,800,1,256,1280,5120,0,7549,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1623,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,2021.2993856802225,0.9745638478807193,0.7310305192333535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +346,BasicTransformerBlock-FFN344Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN344FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,32,500,0,0,0,0,0,0,0,0,0,32,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN344FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,32,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,41,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.65536,2441.40625,0.06274509803921569,0.8829679023508138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +347,SpatialTransformer-Proj_out347,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout347einsum,MXU,1,Compute,1902,1902,1546,0,0,0,0,0,0,0,0,1902,314,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout347einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,3801088,200,1,256,1280,1280,0,1902,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,2246.299125920084,0.9670061103642942,0.812404747168204,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+348,Upsample348,"Upsample(a=1x1280x16x16,scale_factor=2,memory_placements=0_0_0,type=DT_BFLOAT16)",Upsample348Upsample,VPU,1,Memory,1104,0,1104,0,0,0,0,0,0,0,0,0,0,0,0,3276800,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,32,32)]",0,Upsample348Upsample,Upsample,0,[],Upsample,,,,,0,,,,,0,0,3276800,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,2764.273380887681,0.0,0.9997372082776423,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +348,Upsample-Conv2d348Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",UpsampleConv2d348Conv2dconv2d,MXU,1,Compute,16960,16960,10375,0,0,0,0,0,0,0,0,16960,2823,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,UpsampleConv2d348Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,24911872,1800,1,1280,256,11520,0,16960,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3513,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.15018867924533,1691.4223724941037,0.9760147757792447,0.6117259936687536,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +349,Time-Embed-MLP-Einsum349,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum349einsum,MXU,1,Memory,554,490,554,0,0,0,0,0,0,0,0,490,78,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum349einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 
640]]",1,1314048,50,1,1,640,1280,0,490,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,118,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9574007220216605,2760.7494312933636,0.006484253800177993,0.9984627237950682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +350,Conv2d-GroupNorm350,"GroupNorm(x=1x1920x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm350XnormGroupNormX,VPU,1,Memory,2649,1506,2649,0,0,0,0,0,0,0,0,0,1506,0,0,7864320,"DT_BFLOAT16:[1,1920,32,32]","[DT_BFLOAT16:(1,1920,32,32)]",15728640,Conv2dGroupNorm350XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1506,7864320,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,551,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +351,Conv2d350Conv2d,"Conv2D(a=1x1920x32x32,b=1920x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d350Conv2dconv2d,MXU,1,Compute,50843,50843,9216,0,0,0,0,0,0,0,0,50843,8470,0,0,27361280,"DT_BFLOAT16:[1,1920,32,32],DT_BFLOAT16:[1920,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",22649241600,Conv2d350Conv2dconv2d,Conv2D,22118400,[],Conv2D,bf01;io01->bf01,"[[1, 1920, 32, 32], [1920, 640, 3, 3], [1, 640, 32, 32]]",1,15474688,5400,1,640,1024,17280,0,50843,27361280,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9083,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.47413803276754,501.19343340036977,0.976725051465255,0.18126344788440135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+352,Conv2d-GroupNorm352,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm352XnormGroupNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm352XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +353,Conv2d352Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d352Conv2dconv2d,MXU,1,Compute,16960,16960,3367,0,0,0,0,0,0,0,0,16960,2823,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d352Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,1800,1,640,1024,5760,0,16960,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3049,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.15018867924533,548.8125783092571,0.9760147757792447,0.19848556177549986,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +354,SkipConnection-Einsum349,"XlaEinsum(a=1x32x32x1920,b=1920x640,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum349einsum,MXU,1,Compute,5666,5666,2594,0,0,0,0,0,0,0,0,5666,941,0,0,7700480,"DT_BFLOAT16:[1,32,32,1920],DT_BFLOAT16:[1920,640]","[DT_BFLOAT16:(1,32,32,640)]",2516582400,SkipConnectionEinsum349einsum,Einsum,2457600,[],Einsum,"BHWC,CO->BHWO","[[1, 32, 32, 1920], [1920, 640], [1, 32, 32, 
640]]",1,4718592,600,1,1024,640,1920,0,5666,7700480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1116,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.1550300035298,1265.7308258692199,0.9738328389937632,0.45776883394908496,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +355,SpatialTransformer-Input_GroupNorm355,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm355XnormGroupNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm355XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +356,SpatialTransformer-Proj_in356,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin356einsum,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin356einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+357,BasicTransformerBlock-Input_layernorm357,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm357XnormLayerNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm357XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +358,SelfAttention358-Q-358,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention358Q358MatMulQ,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention358Q358MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +358,SelfAttention358-K-358,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention358K358MatMulK,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention358K358MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +358,SelfAttention358-V-358,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention358V358MatMulV,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention358V358MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +359,SelfAttention358-FlashAttention-359,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention358FlashAttention359FlashAttention,MXU,1,Compute,9676,9676,1766,0,0,0,0,0,0,0,0,9676,4818,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention358FlashAttention359FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 128]",,872448,1024,8,1024,1024,80,3212,9676,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1729,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.210199255890863,504.6313042579578,0.06842997353127733,0.18250680081662127,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+360,SelfAttention358-Attention_output-360,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention358Attentionoutput360MatMulattnOutputattnAvgWo,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention358Attentionoutput360MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +361,SelfAttention358-Attention_layernorm-361,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention358Attentionlayernorm361YnormLayerNormy,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention358Attentionlayernorm361YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +362,GatedSelfAttention-Linear362,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear362XLinearcontext,MXU,1,Memory,500,302,500,0,0,0,0,0,0,0,0,302,47,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear362XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 
640]]",1,1005568,30,1,8,640,768,0,302,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,83,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.72864,1873.016357421875,0.03448585541086664,0.677401937584765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +363,GatedSelfAttention-Attn362-Q-363,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362Q363MatMulQ,MXU,1,Compute,2137,2137,1166,0,0,0,0,0,0,0,0,2137,352,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn362Q363MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,2137,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,395.6080486663547,1508.385207979937,0.8673910754955928,0.5455281041518759,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +363,GatedSelfAttention-Attn362-K-363,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362K363MatMulK,MXU,1,Compute,2137,2137,1166,0,0,0,0,0,0,0,0,2137,352,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn362K363MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,2137,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,395.6080486663547,1508.385207979937,0.8673910754955928,0.5455281041518759,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+363,GatedSelfAttention-Attn362-V-363,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362V363MatMulV,MXU,1,Compute,2137,2137,1166,0,0,0,0,0,0,0,0,2137,352,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn362V363MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,2137,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,395.6080486663547,1508.385207979937,0.8673910754955928,0.5455281041518759,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +364,GatedSelfAttention-Attn362-FlashAttention-364,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn362FlashAttention364FlashAttention,MXU,1,Compute,12236,12236,1780,0,0,0,0,0,0,0,0,12236,5294,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn362FlashAttention364FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 128]",,879104,1296,8,1032,1032,80,3262,12236,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2157,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,25.067580254985288,402.1706009035837,0.054961964173235456,0.14545048857272466,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+365,GatedSelfAttention-Attn362-Attention_output-365,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362Attentionoutput365MatMulattnOutputattnAvgWo,MXU,1,Compute,2137,2137,1166,0,0,0,0,0,0,0,0,2137,352,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn362Attentionoutput365MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,225,1,1032,640,640,0,2137,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,395.6080486663547,1508.385207979937,0.8673910754955928,0.5455281041518759,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +366,GatedSelfAttention-Attn362-Attention_layernorm-366,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362Attentionlayernorm366YnormLayerNormy,VPU,1,Memory,890,506,890,0,0,0,0,0,0,0,0,0,506,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn362Attentionlayernorm366YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,506,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.936898876404494,2764.5839734023875,0.5684071381361533,0.9998495383010443,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +367,GatedSelfAttention-FFN362Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN362FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,7549,7549,3312,0,0,0,0,0,0,0,0,7549,1255,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN362FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 
640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,7549,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,1212.7796314081336,0.9745638478807193,0.43861831154001213,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +368,GatedSelfAttention-FFN362Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN362FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,7549,7549,3312,0,0,0,0,0,0,0,0,7549,1255,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN362FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,7549,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,1212.7796314081336,0.9745638478807193,0.43861831154001213,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +369,GatedSelfAttention-FFN362Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN362FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,883,63,883,0,0,0,0,0,0,0,0,0,63,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN362FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,63,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,74,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7421970554926387,2764.8994903737257,0.0710590011769147,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+370,BasicTransformerBlock-Fuser_output_layernorm370,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm370XnormLayerNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm370XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +371,CrossAttention371-Q-371,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention371Q371MatMulQ,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention371Q371MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +371,CrossAttention371-K-371,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention371K371MatMulK,MXU,1,Compute,1149,1149,817,0,0,0,0,0,0,0,0,1149,188,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention371K371MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 
80]]",1,2424832,120,1,512,640,768,0,1149,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,245,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,438.04741514360313,1965.448895778938,0.9604415780223955,0.7108314270448239,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +371,CrossAttention371-V-371,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention371V371MatMulV,MXU,1,Compute,1149,1149,817,0,0,0,0,0,0,0,0,1149,188,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention371V371MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,1149,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,245,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,438.04741514360313,1965.448895778938,0.9604415780223955,0.7108314270448239,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +372,CrossAttention371-FlashAttention-372,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention371FlashAttention372FlashAttention,MXU,1,Compute,4858,4858,1325,0,0,0,0,0,0,0,0,4858,2408,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention371FlashAttention372FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,446464,512,8,512,1024,80,1606,4858,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,897,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.08170934540963,753.8306659118979,0.0681482527674598,0.2726331522285345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+373,CrossAttention371-Attention_output-373,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention371Attentionoutput373MatMulattnOutputattnAvgWo,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention371Attentionoutput373MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +374,CrossAttention371-Attention_layernorm-374,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention371Attentionlayernorm374YnormLayerNormy,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention371Attentionlayernorm374YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+375,BasicTransformerBlock-Attn_output_layernorm375,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm375XnormLayerNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm375XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +376,BasicTransformerBlock-FFN376Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN376FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,7549,7549,3312,0,0,0,0,0,0,0,0,7549,1255,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN376FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,7549,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,1212.7796314081336,0.9745638478807193,0.43861831154001213,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +377,BasicTransformerBlock-FFN376Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN376FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,7549,7549,3312,0,0,0,0,0,0,0,0,7549,1255,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN376FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 
2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,7549,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,1212.7796314081336,0.9745638478807193,0.43861831154001213,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +378,BasicTransformerBlock-FFN376Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN376FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,883,63,883,0,0,0,0,0,0,0,0,0,63,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN376FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,63,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,74,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7421970554926387,2764.8994903737257,0.0710590011769147,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +379,SpatialTransformer-Proj_out379,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout379einsum,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout379einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+380,Time-Embed-MLP-Einsum380,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum380einsum,MXU,1,Memory,554,490,554,0,0,0,0,0,0,0,0,490,78,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum380einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1314048,50,1,1,640,1280,0,490,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,118,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9574007220216605,2760.7494312933636,0.006484253800177993,0.9984627237950682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +381,Conv2d-GroupNorm381,"GroupNorm(x=1x1280x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm381XnormGroupNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,1280,32,32]","[DT_BFLOAT16:(1,1280,32,32)]",10485760,Conv2dGroupNorm381XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +382,Conv2d381Conv2d,"Conv2D(a=1x1280x32x32,b=1280x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d381Conv2dconv2d,MXU,1,Compute,33902,33902,6292,0,0,0,0,0,0,0,0,33902,5647,0,0,18677760,"DT_BFLOAT16:[1,1280,32,32],DT_BFLOAT16:[1280,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",15099494400,Conv2d381Conv2dconv2d,Conv2D,14745600,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 32, 32], [1280, 640, 3, 3], [1, 640, 32, 
32]]",1,15474688,3600,1,640,1024,11520,0,33902,18677760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6066,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.38653766739424,513.0971485826794,0.9765329831405809,0.18556858899916073,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +383,Conv2d-GroupNorm383,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm383XnormGroupNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm383XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +384,Conv2d383Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d383Conv2dconv2d,MXU,1,Compute,16960,16960,3367,0,0,0,0,0,0,0,0,16960,2823,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d383Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,1800,1,640,1024,5760,0,16960,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3049,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.15018867924533,548.8125783092571,0.9760147757792447,0.19848556177549986,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+385,SkipConnection-Einsum380,"XlaEinsum(a=1x32x32x1280,b=1280x640,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum380einsum,MXU,1,Compute,3784,3784,1877,0,0,0,0,0,0,0,0,3784,627,0,0,5570560,"DT_BFLOAT16:[1,32,32,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,32,32,640)]",1677721600,SkipConnectionEinsum380einsum,Einsum,1638400,[],Einsum,"BHWC,CO->BHWO","[[1, 32, 32, 1280], [1280, 640], [1, 32, 32, 640]]",1,4718592,400,1,1024,640,1280,0,3784,5570560,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,754,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,443.3725158562368,1371.0328438821352,0.9721171363175939,0.49585274643115196,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +386,SpatialTransformer-Input_GroupNorm386,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm386XnormGroupNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm386XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +387,SpatialTransformer-Proj_in387,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin387einsum,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin387einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 
640]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +388,BasicTransformerBlock-Input_layernorm388,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm388XnormLayerNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm388XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +389,SelfAttention389-Q-389,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention389Q389MatMulQ,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention389Q389MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+389,SelfAttention389-K-389,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention389K389MatMulK,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention389K389MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +389,SelfAttention389-V-389,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention389V389MatMulV,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention389V389MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +390,SelfAttention389-FlashAttention-390,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention389FlashAttention390FlashAttention,MXU,1,Compute,9676,9676,1766,0,0,0,0,0,0,0,0,9676,4818,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention389FlashAttention390FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 
128]",,872448,1024,8,1024,1024,80,3212,9676,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1729,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.210199255890863,504.6313042579578,0.06842997353127733,0.18250680081662127,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +391,SelfAttention389-Attention_output-391,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention389Attentionoutput391MatMulattnOutputattnAvgWo,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention389Attentionoutput391MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +392,SelfAttention389-Attention_layernorm-392,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention389Attentionlayernorm392YnormLayerNormy,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention389Attentionlayernorm392YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+393,GatedSelfAttention-Linear393,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear393XLinearcontext,MXU,1,Memory,500,302,500,0,0,0,0,0,0,0,0,302,47,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear393XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 640]]",1,1005568,30,1,8,640,768,0,302,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,83,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.72864,1873.016357421875,0.03448585541086664,0.677401937584765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +394,GatedSelfAttention-Attn393-Q-394,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393Q394MatMulQ,MXU,1,Compute,2137,2137,1166,0,0,0,0,0,0,0,0,2137,352,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn393Q394MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,2137,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,395.6080486663547,1508.385207979937,0.8673910754955928,0.5455281041518759,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +394,GatedSelfAttention-Attn393-K-394,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393K394MatMulK,MXU,1,Compute,2137,2137,1166,0,0,0,0,0,0,0,0,2137,352,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn393K394MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 
80]]",1,3461120,225,1,1032,640,640,0,2137,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,395.6080486663547,1508.385207979937,0.8673910754955928,0.5455281041518759,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +394,GatedSelfAttention-Attn393-V-394,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393V394MatMulV,MXU,1,Compute,2137,2137,1166,0,0,0,0,0,0,0,0,2137,352,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn393V394MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,2137,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,395.6080486663547,1508.385207979937,0.8673910754955928,0.5455281041518759,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +395,GatedSelfAttention-Attn393-FlashAttention-395,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn393FlashAttention395FlashAttention,MXU,1,Compute,12236,12236,1780,0,0,0,0,0,0,0,0,12236,5294,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn393FlashAttention395FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 128]",,879104,1296,8,1032,1032,80,3262,12236,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2157,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,25.067580254985288,402.1706009035837,0.054961964173235456,0.14545048857272466,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+396,GatedSelfAttention-Attn393-Attention_output-396,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393Attentionoutput396MatMulattnOutputattnAvgWo,MXU,1,Compute,2137,2137,1166,0,0,0,0,0,0,0,0,2137,352,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn393Attentionoutput396MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,225,1,1032,640,640,0,2137,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,395.6080486663547,1508.385207979937,0.8673910754955928,0.5455281041518759,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +397,GatedSelfAttention-Attn393-Attention_layernorm-397,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393Attentionlayernorm397YnormLayerNormy,VPU,1,Memory,890,506,890,0,0,0,0,0,0,0,0,0,506,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn393Attentionlayernorm397YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,506,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.936898876404494,2764.5839734023875,0.5684071381361533,0.9998495383010443,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +398,GatedSelfAttention-FFN393Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN393FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,7549,7549,3312,0,0,0,0,0,0,0,0,7549,1255,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN393FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 
640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,7549,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,1212.7796314081336,0.9745638478807193,0.43861831154001213,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +399,GatedSelfAttention-FFN393Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN393FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,7549,7549,3312,0,0,0,0,0,0,0,0,7549,1255,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN393FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,7549,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,1212.7796314081336,0.9745638478807193,0.43861831154001213,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +400,GatedSelfAttention-FFN393Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN393FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,883,63,883,0,0,0,0,0,0,0,0,0,63,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN393FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,63,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,74,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7421970554926387,2764.8994903737257,0.0710590011769147,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+401,BasicTransformerBlock-Fuser_output_layernorm401,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm401XnormLayerNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm401XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +402,CrossAttention402-Q-402,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention402Q402MatMulQ,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention402Q402MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +402,CrossAttention402-K-402,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention402K402MatMulK,MXU,1,Compute,1149,1149,817,0,0,0,0,0,0,0,0,1149,188,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention402K402MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 
80]]",1,2424832,120,1,512,640,768,0,1149,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,245,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,438.04741514360313,1965.448895778938,0.9604415780223955,0.7108314270448239,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +402,CrossAttention402-V-402,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention402V402MatMulV,MXU,1,Compute,1149,1149,817,0,0,0,0,0,0,0,0,1149,188,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention402V402MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,1149,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,245,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,438.04741514360313,1965.448895778938,0.9604415780223955,0.7108314270448239,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +403,CrossAttention402-FlashAttention-403,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention402FlashAttention403FlashAttention,MXU,1,Compute,4858,4858,1325,0,0,0,0,0,0,0,0,4858,2408,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention402FlashAttention403FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,446464,512,8,512,1024,80,1606,4858,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,897,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.08170934540963,753.8306659118979,0.0681482527674598,0.2726331522285345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+404,CrossAttention402-Attention_output-404,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention402Attentionoutput404MatMulattnOutputattnAvgWo,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention402Attentionoutput404MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +405,CrossAttention402-Attention_layernorm-405,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention402Attentionlayernorm405YnormLayerNormy,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention402Attentionlayernorm405YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+406,BasicTransformerBlock-Attn_output_layernorm406,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm406XnormLayerNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm406XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +407,BasicTransformerBlock-FFN407Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN407FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,7549,7549,3312,0,0,0,0,0,0,0,0,7549,1255,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN407FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,7549,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,1212.7796314081336,0.9745638478807193,0.43861831154001213,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +408,BasicTransformerBlock-FFN407Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN407FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,7549,7549,3312,0,0,0,0,0,0,0,0,7549,1255,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN407FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 
2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,7549,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,1212.7796314081336,0.9745638478807193,0.43861831154001213,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +409,BasicTransformerBlock-FFN407Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN407FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,883,63,883,0,0,0,0,0,0,0,0,0,63,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN407FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,63,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,74,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7421970554926387,2764.8994903737257,0.0710590011769147,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +410,SpatialTransformer-Proj_out410,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout410einsum,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout410einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+411,Time-Embed-MLP-Einsum411,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum411einsum,MXU,1,Memory,554,490,554,0,0,0,0,0,0,0,0,490,78,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum411einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1314048,50,1,1,640,1280,0,490,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,118,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9574007220216605,2760.7494312933636,0.006484253800177993,0.9984627237950682,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +412,Conv2d-GroupNorm412,"GroupNorm(x=1x960x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm412XnormGroupNormX,VPU,1,Memory,1325,753,1325,0,0,0,0,0,0,0,0,0,753,0,0,3932160,"DT_BFLOAT16:[1,960,32,32]","[DT_BFLOAT16:(1,960,32,32)]",7864320,Conv2dGroupNorm412XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,753,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,275,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.935335849056604,2763.8561320754716,0.5682574916759157,0.9995863045480909,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +413,Conv2d412Conv2d,"Conv2D(a=1x960x32x32,b=960x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d412Conv2dconv2d,MXU,1,Compute,25619,25619,4829,0,0,0,0,0,0,0,0,25619,4267,0,0,14336000,"DT_BFLOAT16:[1,960,32,32],DT_BFLOAT16:[960,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",11324620800,Conv2d412Conv2dconv2d,Conv2D,11059200,[],Conv2D,bf01;io01->bf01,"[[1, 960, 32, 32], [960, 640, 3, 3], [1, 640, 32, 
32]]",1,14589440,2720,1,640,1024,8640,0,25619,14336000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4589,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,442.03992349428154,521.1538479131699,0.9691953587503016,0.18848240430856053,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +414,Conv2d-GroupNorm414,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm414XnormGroupNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm414XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +415,Conv2d414Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d414Conv2dconv2d,MXU,1,Compute,16960,16960,3367,0,0,0,0,0,0,0,0,16960,2823,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d414Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,1800,1,640,1024,5760,0,16960,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3049,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.15018867924533,548.8125783092571,0.9760147757792447,0.19848556177549986,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+416,SkipConnection-Einsum411,"XlaEinsum(a=1x32x32x960,b=960x640,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum411einsum,MXU,1,Compute,3031,3031,1518,0,0,0,0,0,0,0,0,3031,502,0,0,4505600,"DT_BFLOAT16:[1,32,32,960],DT_BFLOAT16:[960,640]","[DT_BFLOAT16:(1,32,32,640)]",1258291200,SkipConnectionEinsum411einsum,Einsum,1228800,[],Einsum,"BHWC,CO->BHWO","[[1, 32, 32, 960], [960, 640], [1, 32, 32, 640]]",1,4505600,320,1,1024,640,960,0,3031,4505600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,605,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,415.14061365885846,1384.416691582811,0.9102172328833161,0.500693197679136,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +417,SpatialTransformer-Input_GroupNorm417,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm417XnormGroupNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm417XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +418,SpatialTransformer-Proj_in418,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin418einsum,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin418einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 
640]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +419,BasicTransformerBlock-Input_layernorm419,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm419XnormLayerNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm419XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +420,SelfAttention420-Q-420,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention420Q420MatMulQ,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention420Q420MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+420,SelfAttention420-K-420,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention420K420MatMulK,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention420K420MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +420,SelfAttention420-V-420,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention420V420MatMulV,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention420V420MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +421,SelfAttention420-FlashAttention-421,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention420FlashAttention421FlashAttention,MXU,1,Compute,9676,9676,1766,0,0,0,0,0,0,0,0,9676,4818,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention420FlashAttention421FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 
128]",,872448,1024,8,1024,1024,80,3212,9676,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1729,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.210199255890863,504.6313042579578,0.06842997353127733,0.18250680081662127,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +422,SelfAttention420-Attention_output-422,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention420Attentionoutput422MatMulattnOutputattnAvgWo,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention420Attentionoutput422MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +423,SelfAttention420-Attention_layernorm-423,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention420Attentionlayernorm423YnormLayerNormy,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention420Attentionlayernorm423YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+424,GatedSelfAttention-Linear424,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear424XLinearcontext,MXU,1,Memory,500,302,500,0,0,0,0,0,0,0,0,302,47,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear424XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 640]]",1,1005568,30,1,8,640,768,0,302,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,83,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.72864,1873.016357421875,0.03448585541086664,0.677401937584765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +425,GatedSelfAttention-Attn424-Q-425,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424Q425MatMulQ,MXU,1,Compute,2137,2137,1166,0,0,0,0,0,0,0,0,2137,352,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn424Q425MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,2137,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,395.6080486663547,1508.385207979937,0.8673910754955928,0.5455281041518759,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +425,GatedSelfAttention-Attn424-K-425,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424K425MatMulK,MXU,1,Compute,2137,2137,1166,0,0,0,0,0,0,0,0,2137,352,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn424K425MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 
80]]",1,3461120,225,1,1032,640,640,0,2137,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,395.6080486663547,1508.385207979937,0.8673910754955928,0.5455281041518759,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +425,GatedSelfAttention-Attn424-V-425,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424V425MatMulV,MXU,1,Compute,2137,2137,1166,0,0,0,0,0,0,0,0,2137,352,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn424V425MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,225,1,1032,640,640,0,2137,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,395.6080486663547,1508.385207979937,0.8673910754955928,0.5455281041518759,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +426,GatedSelfAttention-Attn424-FlashAttention-426,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn424FlashAttention426FlashAttention,MXU,1,Compute,12236,12236,1780,0,0,0,0,0,0,0,0,12236,5294,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn424FlashAttention426FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 128]",,879104,1296,8,1032,1032,80,3262,12236,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2157,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,25.067580254985288,402.1706009035837,0.054961964173235456,0.14545048857272466,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+427,GatedSelfAttention-Attn424-Attention_output-427,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424Attentionoutput427MatMulattnOutputattnAvgWo,MXU,1,Compute,2137,2137,1166,0,0,0,0,0,0,0,0,2137,352,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn424Attentionoutput427MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,225,1,1032,640,640,0,2137,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,395.6080486663547,1508.385207979937,0.8673910754955928,0.5455281041518759,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +428,GatedSelfAttention-Attn424-Attention_layernorm-428,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424Attentionlayernorm428YnormLayerNormy,VPU,1,Memory,890,506,890,0,0,0,0,0,0,0,0,0,506,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn424Attentionlayernorm428YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,506,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.936898876404494,2764.5839734023875,0.5684071381361533,0.9998495383010443,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +429,GatedSelfAttention-FFN424Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN424FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,7549,7549,3312,0,0,0,0,0,0,0,0,7549,1255,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN424FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 
640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,7549,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,1212.7796314081336,0.9745638478807193,0.43861831154001213,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +430,GatedSelfAttention-FFN424Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN424FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,7549,7549,3312,0,0,0,0,0,0,0,0,7549,1255,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN424FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,7549,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,1212.7796314081336,0.9745638478807193,0.43861831154001213,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +431,GatedSelfAttention-FFN424Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN424FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,883,63,883,0,0,0,0,0,0,0,0,0,63,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN424FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,63,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,74,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7421970554926387,2764.8994903737257,0.0710590011769147,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+432,BasicTransformerBlock-Fuser_output_layernorm432,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm432XnormLayerNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm432XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +433,CrossAttention433-Q-433,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention433Q433MatMulQ,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention433Q433MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +433,CrossAttention433-K-433,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention433K433MatMulK,MXU,1,Compute,1149,1149,817,0,0,0,0,0,0,0,0,1149,188,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention433K433MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 
80]]",1,2424832,120,1,512,640,768,0,1149,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,245,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,438.04741514360313,1965.448895778938,0.9604415780223955,0.7108314270448239,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +433,CrossAttention433-V-433,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention433V433MatMulV,MXU,1,Compute,1149,1149,817,0,0,0,0,0,0,0,0,1149,188,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention433V433MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,120,1,512,640,768,0,1149,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,245,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,438.04741514360313,1965.448895778938,0.9604415780223955,0.7108314270448239,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +434,CrossAttention433-FlashAttention-434,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention433FlashAttention434FlashAttention,MXU,1,Compute,4858,4858,1325,0,0,0,0,0,0,0,0,4858,2408,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention433FlashAttention434FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,446464,512,8,512,1024,80,1606,4858,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,897,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.08170934540963,753.8306659118979,0.0681482527674598,0.2726331522285345,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+435,CrossAttention433-Attention_output-435,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention433Attentionoutput435MatMulattnOutputattnAvgWo,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention433Attentionoutput435MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +436,CrossAttention433-Attention_layernorm-436,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention433Attentionlayernorm436YnormLayerNormy,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention433Attentionlayernorm436YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+437,BasicTransformerBlock-Attn_output_layernorm437,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm437XnormLayerNormX,VPU,1,Memory,883,503,883,0,0,0,0,0,0,0,0,0,503,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm437XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,503,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,184,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +438,BasicTransformerBlock-FFN438Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN438FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,7549,7549,3312,0,0,0,0,0,0,0,0,7549,1255,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN438FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,800,1,1024,2560,640,0,7549,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,1212.7796314081336,0.9745638478807193,0.43861831154001213,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +439,BasicTransformerBlock-FFN438Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN438FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,7549,7549,3312,0,0,0,0,0,0,0,0,7549,1255,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN438FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 
2560], [2560, 640], [1, 1024, 640]]",1,4718592,800,1,1024,640,2560,0,7549,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,444.48843555437804,1212.7796314081336,0.9745638478807193,0.43861831154001213,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +440,BasicTransformerBlock-FFN438Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN438FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,883,63,883,0,0,0,0,0,0,0,0,0,63,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN438FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,63,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,74,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7421970554926387,2764.8994903737257,0.0710590011769147,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +441,SpatialTransformer-Proj_out441,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout441einsum,MXU,1,Compute,1902,1902,1159,0,0,0,0,0,0,0,0,1902,314,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout441einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,200,1,1024,640,640,0,1902,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,393,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,441.04143007360676,1684.724344440063,0.9670061103642942,0.609303560376153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+442,Upsample442,"Upsample(a=1x640x32x32,scale_factor=2,memory_placements=0_0_0,type=DT_BFLOAT16)",Upsample442Upsample,VPU,1,Memory,2208,0,2208,0,0,0,0,0,0,0,0,0,0,0,0,6553600,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,64,64)]",0,Upsample442Upsample,Upsample,0,[],Upsample,,,,,0,,,,,0,0,6553600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,2764.273380887681,0.0,0.9997372082776423,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +442,Upsample-Conv2d442Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",UpsampleConv2d442Conv2dconv2d,MXU,1,Compute,16960,16960,3367,0,0,0,0,0,0,0,0,16960,2823,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,UpsampleConv2d442Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,1800,1,640,1024,5760,0,16960,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3049,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.15018867924533,548.8125783092571,0.9760147757792447,0.19848556177549986,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +443,Time-Embed-MLP-Einsum443,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum443einsum,MXU,1,Memory,500,302,500,0,0,0,0,0,0,0,0,302,47,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum443einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 
320]]",1,658048,30,1,1,320,1280,0,302,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,83,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.6384,1531.839370727539,0.0035922766052986083,0.5540106223246073,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +444,Conv2d-GroupNorm444,"GroupNorm(x=1x960x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm444XnormGroupNormX,VPU,1,Memory,5298,3012,5298,0,0,0,0,0,0,0,0,0,3012,0,0,15728640,"DT_BFLOAT16:[1,960,64,64]","[DT_BFLOAT16:(1,960,64,64)]",31457280,Conv2dGroupNorm444XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,3012,15728640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1103,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +445,Conv2d444Conv2d,"Conv2D(a=1x960x64x64,b=960x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d444Conv2dconv2d,MXU,1,Compute,61459,61459,5395,0,0,0,0,0,0,0,0,61459,10240,0,0,16015360,"DT_BFLOAT16:[1,960,64,64],DT_BFLOAT16:[960,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",22649241600,Conv2d444Conv2dconv2d,Conv2D,5529600,[],Conv2D,bf01;io01->bf01,"[[1, 960, 64, 64], [960, 320, 3, 3], [1, 320, 64, 64]]",1,16514560,6528,1,320,4096,8640,0,61459,16015360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10600,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,368.5260352430076,242.68970059053598,0.8080123625774576,0.08777204361321374,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+446,Conv2d-GroupNorm446,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm446XnormGroupNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm446XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +447,Conv2d446Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d446Conv2dconv2d,MXU,1,Compute,20800,20800,2387,0,0,0,0,0,0,0,0,20800,3463,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d446Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,2208,1,320,4096,2880,0,20800,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3624,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,362.96861538461536,317.28010911207934,0.7958274325584609,0.11474868322317516,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +448,SkipConnection-Einsum443,"XlaEinsum(a=1x64x64x960,b=960x320,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum443einsum,MXU,1,Compute,7248,7248,3739,0,0,0,0,0,0,0,0,7248,1204,0,0,11100160,"DT_BFLOAT16:[1,64,64,960],DT_BFLOAT16:[960,320]","[DT_BFLOAT16:(1,64,64,320)]",2516582400,SkipConnectionEinsum443einsum,Einsum,614400,[],Einsum,"BHWC,CO->BHWO","[[1, 64, 64, 960], [960, 320], [1, 64, 64, 
320]]",1,11100160,768,1,4096,320,960,0,7248,11100160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1455,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,347.21059602649007,1426.3009919762349,0.7612771613877846,0.5158412267545153,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +449,SpatialTransformer-Input_GroupNorm449,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm449XnormGroupNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm449XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +450,SpatialTransformer-Proj_in450,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin450einsum,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin450einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+451,BasicTransformerBlock-Input_layernorm451,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm451XnormLayerNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm451XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +452,SelfAttention452-Q-452,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention452Q452MatMulQ,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention452Q452MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +452,SelfAttention452-K-452,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention452K452MatMulK,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention452K452MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +452,SelfAttention452-V-452,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention452V452MatMulV,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention452V452MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +453,SelfAttention452-FlashAttention-453,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention452FlashAttention453FlashAttention,MXU,1,Compute,154240,154240,3532,0,0,0,0,0,0,0,0,154240,77101,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention452FlashAttention453FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,2762752,16384,8,4096,4096,40,51401,154240,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,25940,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.32675186721992,63.31447743775934,0.0686855211502738,0.022898545185446414,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+454,SelfAttention452-Attention_output-454,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention452Attentionoutput454MatMulattnOutputattnAvgWo,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention452Attentionoutput454MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +455,SelfAttention452-Attention_layernorm-455,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention452Attentionlayernorm455YnormLayerNormy,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention452Attentionlayernorm455YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +456,GatedSelfAttention-Linear456,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear456XLinearcontext,MXU,1,Memory,500,47,500,0,0,0,0,0,0,0,0,0,47,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear456XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 
320]]",1,508928,18,1,8,320,768,0,47,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.86432,947.9522705078125,0.01724292770543332,0.3428398808346519,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +457,GatedSelfAttention-Attn456-Q-457,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456Q457MatMulQ,MXU,1,Compute,2815,2815,1839,0,0,0,0,0,0,0,0,2815,465,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn456Q457MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,2815,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,590,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,298.578756660746,1805.7137145454042,0.654649342279995,0.6530610179187718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +457,GatedSelfAttention-Attn456-K-457,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456K457MatMulK,MXU,1,Compute,2815,2815,1839,0,0,0,0,0,0,0,0,2815,465,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn456K457MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,2815,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,590,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,298.578756660746,1805.7137145454042,0.654649342279995,0.6530610179187718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+457,GatedSelfAttention-Attn456-V-457,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456V457MatMulV,MXU,1,Compute,2815,2815,1839,0,0,0,0,0,0,0,0,2815,465,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn456V457MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,2815,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,590,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,298.578756660746,1805.7137145454042,0.654649342279995,0.6530610179187718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +458,GatedSelfAttention-Attn456-FlashAttention-458,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn456FlashAttention458FlashAttention,MXU,1,Compute,164030,164030,3539,0,0,0,0,0,0,0,0,164030,78931,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn456FlashAttention458FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 128]",,2768128,17424,8,4104,4104,40,51601,164030,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,27572,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,29.57221854538804,59.65188371839374,0.06483861623985296,0.021573918162167718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+459,GatedSelfAttention-Attn456-Attention_output-459,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456Attentionoutput459MatMulattnOutputattnAvgWo,MXU,1,Compute,2815,2815,1839,0,0,0,0,0,0,0,0,2815,465,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn456Attentionoutput459MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,297,1,4104,320,320,0,2815,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,590,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,298.578756660746,1805.7137145454042,0.654649342279995,0.6530610179187718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +460,GatedSelfAttention-Attn456-Attention_layernorm-460,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456Attentionlayernorm460YnormLayerNormy,VPU,1,Memory,1770,1006,1770,0,0,0,0,0,0,0,0,0,1006,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn456Attentionlayernorm460YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1006,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.935728813559322,2764.039120431674,0.5682951146560319,0.9996524847854155,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +461,GatedSelfAttention-FFN456Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN456FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,9055,9055,4691,0,0,0,0,0,0,0,0,9055,1505,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN456FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 
4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,9055,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,370.562473771397,1432.3545779265598,0.8124773592105521,0.5180305887618661,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +462,GatedSelfAttention-FFN456Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN456FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,9055,9055,4691,0,0,0,0,0,0,0,0,9055,1505,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN456FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,9055,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,370.562473771397,1432.3545779265598,0.8124773592105521,0.5180305887618661,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +463,GatedSelfAttention-FFN456Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN456FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1766,126,1766,0,0,0,0,0,0,0,0,0,126,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN456FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,126,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,148,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7421970554926387,2764.8994903737257,0.0710590011769147,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+464,BasicTransformerBlock-Fuser_output_layernorm464,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm464XnormLayerNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm464XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +465,CrossAttention465-Q-465,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention465Q465MatMulQ,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention465Q465MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +465,CrossAttention465-K-465,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention465K465MatMulK,MXU,1,Compute,697,697,541,0,0,0,0,0,0,0,0,697,112,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention465K465MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 
40]]",1,1605632,72,1,512,320,768,0,697,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,361.05916786226686,2145.4251479555237,0.7916408702637966,0.7759222958247826,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +465,CrossAttention465-V-465,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention465V465MatMulV,MXU,1,Compute,697,697,541,0,0,0,0,0,0,0,0,697,112,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention465V465MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,697,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,361.05916786226686,2145.4251479555237,0.7916408702637966,0.7759222958247826,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +466,CrossAttention465-FlashAttention-466,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention465FlashAttention466FlashAttention,MXU,1,Compute,19314,19314,1987,0,0,0,0,0,0,0,0,19314,9637,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention465FlashAttention466FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,354304,2048,8,512,4096,40,6425,19314,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.271604846225536,284.4135892357875,0.06856460845900791,0.10286205758979658,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+467,CrossAttention465-Attention_output-467,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention465Attentionoutput467MatMulattnOutputattnAvgWo,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention465Attentionoutput467MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +468,CrossAttention465-Attention_layernorm-468,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention465Attentionlayernorm468YnormLayerNormy,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention465Attentionlayernorm468YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+469,BasicTransformerBlock-Attn_output_layernorm469,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm469XnormLayerNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm469XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +470,BasicTransformerBlock-FFN470Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN470FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,9055,9055,4691,0,0,0,0,0,0,0,0,9055,1505,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN470FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,9055,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,370.562473771397,1432.3545779265598,0.8124773592105521,0.5180305887618661,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +471,BasicTransformerBlock-FFN470Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN470FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,9055,9055,4691,0,0,0,0,0,0,0,0,9055,1505,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN470FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 
4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,9055,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,370.562473771397,1432.3545779265598,0.8124773592105521,0.5180305887618661,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +472,BasicTransformerBlock-FFN470Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN470FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1766,126,1766,0,0,0,0,0,0,0,0,0,126,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN470FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,126,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,148,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7421970554926387,2764.8994903737257,0.0710590011769147,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +473,SpatialTransformer-Proj_out473,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout473einsum,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout473einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+474,Time-Embed-MLP-Einsum474,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum474einsum,MXU,1,Memory,500,302,500,0,0,0,0,0,0,0,0,302,47,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum474einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 320]]",1,658048,30,1,1,320,1280,0,302,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,83,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.6384,1531.839370727539,0.0035922766052986083,0.5540106223246073,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +475,Conv2d-GroupNorm475,"GroupNorm(x=1x640x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm475XnormGroupNormX,VPU,1,Memory,3532,2009,3532,0,0,0,0,0,0,0,0,0,2009,0,0,10485760,"DT_BFLOAT16:[1,640,64,64]","[DT_BFLOAT16:(1,640,64,64)]",20971520,Conv2dGroupNorm475XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2009,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,736,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +476,Conv2d475Conv2d,"Conv2D(a=1x640x64x64,b=640x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d475Conv2dconv2d,MXU,1,Compute,40678,40678,3891,0,0,0,0,0,0,0,0,40678,6776,0,0,11550720,"DT_BFLOAT16:[1,640,64,64],DT_BFLOAT16:[640,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",15099494400,Conv2d475Conv2dconv2d,Conv2D,3686400,[],Conv2D,bf01;io01->bf01,"[[1, 640, 64, 64], [640, 320, 3, 3], [1, 320, 64, 
64]]",1,11883520,4320,1,320,4096,5760,0,40678,11550720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7037,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,371.19559467033775,264.4536675613968,0.8138655094751948,0.09564327940737678,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +477,Conv2d-GroupNorm477,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm477XnormGroupNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm477XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +478,Conv2d477Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d477Conv2dconv2d,MXU,1,Compute,20800,20800,2387,0,0,0,0,0,0,0,0,20800,3463,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d477Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,2208,1,320,4096,2880,0,20800,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3624,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,362.96861538461536,317.28010911207934,0.7958274325584609,0.11474868322317516,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+479,SkipConnection-Einsum474,"XlaEinsum(a=1x64x64x640,b=640x320,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum474einsum,MXU,1,Compute,4537,4537,2787,0,0,0,0,0,0,0,0,4537,752,0,0,8273920,"DT_BFLOAT16:[1,64,64,640],DT_BFLOAT16:[640,320]","[DT_BFLOAT16:(1,64,64,320)]",1677721600,SkipConnectionEinsum474einsum,Einsum,409600,[],Einsum,"BHWC,CO->BHWO","[[1, 64, 64, 640], [640, 320], [1, 64, 64, 320]]",1,8273920,480,1,4096,320,640,0,4537,8273920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,940,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,369.78655499228563,1698.4105083893542,0.8107761172196991,0.6142533484229129,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +480,SpatialTransformer-Input_GroupNorm480,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm480XnormGroupNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm480XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +481,SpatialTransformer-Proj_in481,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin481einsum,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin481einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 
320]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +482,BasicTransformerBlock-Input_layernorm482,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm482XnormLayerNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm482XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +483,SelfAttention483-Q-483,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention483Q483MatMulQ,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention483Q483MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+483,SelfAttention483-K-483,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention483K483MatMulK,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention483K483MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +483,SelfAttention483-V-483,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention483V483MatMulV,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention483V483MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +484,SelfAttention483-FlashAttention-484,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention483FlashAttention484FlashAttention,MXU,1,Compute,154240,154240,3532,0,0,0,0,0,0,0,0,154240,77101,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention483FlashAttention484FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 
128]",,2762752,16384,8,4096,4096,40,51401,154240,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,25940,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.32675186721992,63.31447743775934,0.0686855211502738,0.022898545185446414,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +485,SelfAttention483-Attention_output-485,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention483Attentionoutput485MatMulattnOutputattnAvgWo,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention483Attentionoutput485MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +486,SelfAttention483-Attention_layernorm-486,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention483Attentionlayernorm486YnormLayerNormy,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention483Attentionlayernorm486YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+487,GatedSelfAttention-Linear487,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear487XLinearcontext,MXU,1,Memory,500,47,500,0,0,0,0,0,0,0,0,0,47,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear487XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 320]]",1,508928,18,1,8,320,768,0,47,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.86432,947.9522705078125,0.01724292770543332,0.3428398808346519,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +488,GatedSelfAttention-Attn487-Q-488,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487Q488MatMulQ,MXU,1,Compute,2815,2815,1839,0,0,0,0,0,0,0,0,2815,465,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn487Q488MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,2815,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,590,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,298.578756660746,1805.7137145454042,0.654649342279995,0.6530610179187718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +488,GatedSelfAttention-Attn487-K-488,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487K488MatMulK,MXU,1,Compute,2815,2815,1839,0,0,0,0,0,0,0,0,2815,465,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn487K488MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 
40]]",1,5457920,297,1,4104,320,320,0,2815,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,590,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,298.578756660746,1805.7137145454042,0.654649342279995,0.6530610179187718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +488,GatedSelfAttention-Attn487-V-488,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487V488MatMulV,MXU,1,Compute,2815,2815,1839,0,0,0,0,0,0,0,0,2815,465,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn487V488MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,2815,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,590,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,298.578756660746,1805.7137145454042,0.654649342279995,0.6530610179187718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +489,GatedSelfAttention-Attn487-FlashAttention-489,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn487FlashAttention489FlashAttention,MXU,1,Compute,164030,164030,3539,0,0,0,0,0,0,0,0,164030,78931,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn487FlashAttention489FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 128]",,2768128,17424,8,4104,4104,40,51601,164030,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,27572,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,29.57221854538804,59.65188371839374,0.06483861623985296,0.021573918162167718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+490,GatedSelfAttention-Attn487-Attention_output-490,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487Attentionoutput490MatMulattnOutputattnAvgWo,MXU,1,Compute,2815,2815,1839,0,0,0,0,0,0,0,0,2815,465,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn487Attentionoutput490MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,297,1,4104,320,320,0,2815,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,590,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,298.578756660746,1805.7137145454042,0.654649342279995,0.6530610179187718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +491,GatedSelfAttention-Attn487-Attention_layernorm-491,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487Attentionlayernorm491YnormLayerNormy,VPU,1,Memory,1770,1006,1770,0,0,0,0,0,0,0,0,0,1006,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn487Attentionlayernorm491YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1006,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.935728813559322,2764.039120431674,0.5682951146560319,0.9996524847854155,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +492,GatedSelfAttention-FFN487Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN487FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,9055,9055,4691,0,0,0,0,0,0,0,0,9055,1505,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN487FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 
4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,9055,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,370.562473771397,1432.3545779265598,0.8124773592105521,0.5180305887618661,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +493,GatedSelfAttention-FFN487Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN487FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,9055,9055,4691,0,0,0,0,0,0,0,0,9055,1505,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN487FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,9055,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,370.562473771397,1432.3545779265598,0.8124773592105521,0.5180305887618661,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +494,GatedSelfAttention-FFN487Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN487FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1766,126,1766,0,0,0,0,0,0,0,0,0,126,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN487FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,126,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,148,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7421970554926387,2764.8994903737257,0.0710590011769147,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+495,BasicTransformerBlock-Fuser_output_layernorm495,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm495XnormLayerNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm495XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +496,CrossAttention496-Q-496,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention496Q496MatMulQ,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention496Q496MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +496,CrossAttention496-K-496,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention496K496MatMulK,MXU,1,Compute,697,697,541,0,0,0,0,0,0,0,0,697,112,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention496K496MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 
40]]",1,1605632,72,1,512,320,768,0,697,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,361.05916786226686,2145.4251479555237,0.7916408702637966,0.7759222958247826,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +496,CrossAttention496-V-496,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention496V496MatMulV,MXU,1,Compute,697,697,541,0,0,0,0,0,0,0,0,697,112,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention496V496MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,697,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,361.05916786226686,2145.4251479555237,0.7916408702637966,0.7759222958247826,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +497,CrossAttention496-FlashAttention-497,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention496FlashAttention497FlashAttention,MXU,1,Compute,19314,19314,1987,0,0,0,0,0,0,0,0,19314,9637,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention496FlashAttention497FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,354304,2048,8,512,4096,40,6425,19314,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.271604846225536,284.4135892357875,0.06856460845900791,0.10286205758979658,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+498,CrossAttention496-Attention_output-498,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention496Attentionoutput498MatMulattnOutputattnAvgWo,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention496Attentionoutput498MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +499,CrossAttention496-Attention_layernorm-499,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention496Attentionlayernorm499YnormLayerNormy,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention496Attentionlayernorm499YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+500,BasicTransformerBlock-Attn_output_layernorm500,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm500XnormLayerNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm500XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +501,BasicTransformerBlock-FFN501Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN501FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,9055,9055,4691,0,0,0,0,0,0,0,0,9055,1505,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN501FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,9055,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,370.562473771397,1432.3545779265598,0.8124773592105521,0.5180305887618661,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +502,BasicTransformerBlock-FFN501Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN501FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,9055,9055,4691,0,0,0,0,0,0,0,0,9055,1505,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN501FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 
4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,9055,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,370.562473771397,1432.3545779265598,0.8124773592105521,0.5180305887618661,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +503,BasicTransformerBlock-FFN501Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN501FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1766,126,1766,0,0,0,0,0,0,0,0,0,126,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN501FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,126,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,148,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7421970554926387,2764.8994903737257,0.0710590011769147,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +504,SpatialTransformer-Proj_out504,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout504einsum,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout504einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+505,Time-Embed-MLP-Einsum505,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum505einsum,MXU,1,Memory,500,302,500,0,0,0,0,0,0,0,0,302,47,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum505einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 320]]",1,658048,30,1,1,320,1280,0,302,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,83,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.6384,1531.839370727539,0.0035922766052986083,0.5540106223246073,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +506,Conv2d-GroupNorm506,"GroupNorm(x=1x640x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm506XnormGroupNormX,VPU,1,Memory,3532,2009,3532,0,0,0,0,0,0,0,0,0,2009,0,0,10485760,"DT_BFLOAT16:[1,640,64,64]","[DT_BFLOAT16:(1,640,64,64)]",20971520,Conv2dGroupNorm506XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,2009,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,736,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +507,Conv2d506Conv2d,"Conv2D(a=1x640x64x64,b=640x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d506Conv2dconv2d,MXU,1,Compute,40678,40678,3891,0,0,0,0,0,0,0,0,40678,6776,0,0,11550720,"DT_BFLOAT16:[1,640,64,64],DT_BFLOAT16:[640,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",15099494400,Conv2d506Conv2dconv2d,Conv2D,3686400,[],Conv2D,bf01;io01->bf01,"[[1, 640, 64, 64], [640, 320, 3, 3], [1, 320, 64, 
64]]",1,11883520,4320,1,320,4096,5760,0,40678,11550720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7037,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,371.19559467033775,264.4536675613968,0.8138655094751948,0.09564327940737678,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +508,Conv2d-GroupNorm508,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm508XnormGroupNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm508XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +509,Conv2d508Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d508Conv2dconv2d,MXU,1,Compute,20800,20800,2387,0,0,0,0,0,0,0,0,20800,3463,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d508Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,2208,1,320,4096,2880,0,20800,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3624,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,362.96861538461536,317.28010911207934,0.7958274325584609,0.11474868322317516,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+510,SkipConnection-Einsum505,"XlaEinsum(a=1x64x64x640,b=640x320,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum505einsum,MXU,1,Compute,4537,4537,2787,0,0,0,0,0,0,0,0,4537,752,0,0,8273920,"DT_BFLOAT16:[1,64,64,640],DT_BFLOAT16:[640,320]","[DT_BFLOAT16:(1,64,64,320)]",1677721600,SkipConnectionEinsum505einsum,Einsum,409600,[],Einsum,"BHWC,CO->BHWO","[[1, 64, 64, 640], [640, 320], [1, 64, 64, 320]]",1,8273920,480,1,4096,320,640,0,4537,8273920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,940,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,369.78655499228563,1698.4105083893542,0.8107761172196991,0.6142533484229129,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +511,SpatialTransformer-Input_GroupNorm511,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm511XnormGroupNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm511XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +512,SpatialTransformer-Proj_in512,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin512einsum,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin512einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 
320]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +513,BasicTransformerBlock-Input_layernorm513,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm513XnormLayerNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm513XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +514,SelfAttention514-Q-514,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention514Q514MatMulQ,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention514Q514MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+514,SelfAttention514-K-514,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention514K514MatMulK,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention514K514MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +514,SelfAttention514-V-514,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention514V514MatMulV,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention514V514MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +515,SelfAttention514-FlashAttention-515,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention514FlashAttention515FlashAttention,MXU,1,Compute,154240,154240,3532,0,0,0,0,0,0,0,0,154240,77101,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention514FlashAttention515FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 
128]",,2762752,16384,8,4096,4096,40,51401,154240,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,25940,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.32675186721992,63.31447743775934,0.0686855211502738,0.022898545185446414,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +516,SelfAttention514-Attention_output-516,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention514Attentionoutput516MatMulattnOutputattnAvgWo,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention514Attentionoutput516MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +517,SelfAttention514-Attention_layernorm-517,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention514Attentionlayernorm517YnormLayerNormy,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention514Attentionlayernorm517YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+518,GatedSelfAttention-Linear518,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear518XLinearcontext,MXU,1,Memory,500,47,500,0,0,0,0,0,0,0,0,0,47,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear518XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 320]]",1,508928,18,1,8,320,768,0,47,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.86432,947.9522705078125,0.01724292770543332,0.3428398808346519,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +519,GatedSelfAttention-Attn518-Q-519,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518Q519MatMulQ,MXU,1,Compute,2815,2815,1839,0,0,0,0,0,0,0,0,2815,465,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn518Q519MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,2815,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,590,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,298.578756660746,1805.7137145454042,0.654649342279995,0.6530610179187718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +519,GatedSelfAttention-Attn518-K-519,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518K519MatMulK,MXU,1,Compute,2815,2815,1839,0,0,0,0,0,0,0,0,2815,465,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn518K519MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 
40]]",1,5457920,297,1,4104,320,320,0,2815,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,590,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,298.578756660746,1805.7137145454042,0.654649342279995,0.6530610179187718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +519,GatedSelfAttention-Attn518-V-519,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518V519MatMulV,MXU,1,Compute,2815,2815,1839,0,0,0,0,0,0,0,0,2815,465,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn518V519MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,297,1,4104,320,320,0,2815,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,590,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,298.578756660746,1805.7137145454042,0.654649342279995,0.6530610179187718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +520,GatedSelfAttention-Attn518-FlashAttention-520,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn518FlashAttention520FlashAttention,MXU,1,Compute,164030,164030,3539,0,0,0,0,0,0,0,0,164030,78931,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn518FlashAttention520FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 128]",,2768128,17424,8,4104,4104,40,51601,164030,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,27572,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,29.57221854538804,59.65188371839374,0.06483861623985296,0.021573918162167718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+521,GatedSelfAttention-Attn518-Attention_output-521,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518Attentionoutput521MatMulattnOutputattnAvgWo,MXU,1,Compute,2815,2815,1839,0,0,0,0,0,0,0,0,2815,465,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn518Attentionoutput521MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,297,1,4104,320,320,0,2815,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,590,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,298.578756660746,1805.7137145454042,0.654649342279995,0.6530610179187718,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +522,GatedSelfAttention-Attn518-Attention_layernorm-522,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518Attentionlayernorm522YnormLayerNormy,VPU,1,Memory,1770,1006,1770,0,0,0,0,0,0,0,0,0,1006,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn518Attentionlayernorm522YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1006,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.935728813559322,2764.039120431674,0.5682951146560319,0.9996524847854155,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +523,GatedSelfAttention-FFN518Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN518FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,9055,9055,4691,0,0,0,0,0,0,0,0,9055,1505,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN518FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 
4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,9055,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,370.562473771397,1432.3545779265598,0.8124773592105521,0.5180305887618661,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +524,GatedSelfAttention-FFN518Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN518FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,9055,9055,4691,0,0,0,0,0,0,0,0,9055,1505,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN518FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,9055,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,370.562473771397,1432.3545779265598,0.8124773592105521,0.5180305887618661,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +525,GatedSelfAttention-FFN518Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN518FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1766,126,1766,0,0,0,0,0,0,0,0,0,126,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN518FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,126,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,148,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7421970554926387,2764.8994903737257,0.0710590011769147,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+526,BasicTransformerBlock-Fuser_output_layernorm526,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm526XnormLayerNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm526XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +527,CrossAttention527-Q-527,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention527Q527MatMulQ,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention527Q527MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +527,CrossAttention527-K-527,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention527K527MatMulK,MXU,1,Compute,697,697,541,0,0,0,0,0,0,0,0,697,112,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention527K527MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 
40]]",1,1605632,72,1,512,320,768,0,697,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,361.05916786226686,2145.4251479555237,0.7916408702637966,0.7759222958247826,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +527,CrossAttention527-V-527,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention527V527MatMulV,MXU,1,Compute,697,697,541,0,0,0,0,0,0,0,0,697,112,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention527V527MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,72,1,512,320,768,0,697,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,361.05916786226686,2145.4251479555237,0.7916408702637966,0.7759222958247826,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +528,CrossAttention527-FlashAttention-528,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention527FlashAttention528FlashAttention,MXU,1,Compute,19314,19314,1987,0,0,0,0,0,0,0,0,19314,9637,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention527FlashAttention528FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 128]",,354304,2048,8,512,4096,40,6425,19314,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3350,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.271604846225536,284.4135892357875,0.06856460845900791,0.10286205758979658,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+529,CrossAttention527-Attention_output-529,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention527Attentionoutput529MatMulattnOutputattnAvgWo,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention527Attentionoutput529MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +530,CrossAttention527-Attention_layernorm-530,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention527Attentionlayernorm530YnormLayerNormy,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention527Attentionlayernorm530YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+531,BasicTransformerBlock-Attn_output_layernorm531,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm531XnormLayerNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm531XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +532,BasicTransformerBlock-FFN532Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN532FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,9055,9055,4691,0,0,0,0,0,0,0,0,9055,1505,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN532FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,960,1,4096,1280,320,0,9055,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,370.562473771397,1432.3545779265598,0.8124773592105521,0.5180305887618661,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +533,BasicTransformerBlock-FFN532Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN532FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,9055,9055,4691,0,0,0,0,0,0,0,0,9055,1505,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN532FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 
4096, 1280], [1280, 320], [1, 4096, 320]]",1,11665408,960,1,4096,320,1280,0,9055,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1819,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,370.562473771397,1432.3545779265598,0.8124773592105521,0.5180305887618661,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +534,BasicTransformerBlock-FFN532Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN532FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,1766,126,1766,0,0,0,0,0,0,0,0,0,126,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN532FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,126,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,148,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7421970554926387,2764.8994903737257,0.0710590011769147,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +535,SpatialTransformer-Proj_out535,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout535einsum,MXU,1,Compute,2730,2730,1835,0,0,0,0,0,0,0,0,2730,451,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout535einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,288,1,4096,320,320,0,2730,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,307.2750183150183,1858.4422576121794,0.6737163450230357,0.6721310154112765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+536,Out536-GroupNorm,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Out536GroupNormXnormGroupNormX,VPU,1,Memory,1766,1005,1766,0,0,0,0,0,0,0,0,0,1005,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Out536GroupNormXnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1005,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.93757644394111,2764.8994903737257,0.5684720094153176,0.9999636493214198,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +537,Out537-Conv2d,"Conv2D(a=1x320x64x64,b=320x3x3x3,c=1x3x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Out537Conv2dconv2d,MXU,1,Compute,6946,6946,898,0,0,0,0,0,0,0,0,6946,1154,0,0,2663296,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,3,3,3]","[DT_BFLOAT16:(1,3,64,64)]",70778880,Out537Conv2dconv2d,Conv2D,17280,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 3, 3, 3], [1, 3, 64, 64]]",1,2829696,736,1,3,4096,2880,0,6946,2663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1217,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.189876187733947,357.095837558754,0.022341829736380634,0.1291485850122076,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.json b/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.json new file mode 100644 index 0000000..dc26228 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.json @@ -0,0 +1,184 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 4104674, + "overlapped_compute_time_non_pp_ns": 1207745, + "compute_only_time_non_pp_ns": 2705190, + 
"memory_only_time_non_pp_ns": 191739, + "ici_bound_time_non_pp_ns": 0, + "total_execution_time_chip_ns": 4104674, + "overlapped_compute_time_chip_ns": 1207745, + "compute_only_time_chip_ns": 2705190, + "memory_only_time_chip_ns": 191739, + "ici_bound_time_chip_ns": 0, + "bounded_by_pp_chip": false, + "throughput_requests_per_sec": 6.090617671464287, + "throughput_step_per_sec_per_request": 243.62470685857147, + "latency_sec": 0.16418696, + "latency_step_sec": 0.00010261685, + "mem_footprint_GB": 94.99999904632568, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "5p", + "num_sa": 8, + "num_vu": 6, + "num_vu_ports": 6, + "hbm_bw_GBps": 2765.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 128, + "freq_GHz": 1.7, + "sa_dim": 128, + "hbm_size_GB": 95, + "ici_bw_GBps": 200.0, + "dcn_bw_GBps": 25.0, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 350.0, + "min_power_W": 1.0, + "avg_power_W": 1.0, + "max_power_W": 331.0, + "HBM_GBps_per_W": 123.5, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 585.0, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 1.35868996, + "static_power_W_per_vu": 0.475076728, + "static_power_vmem_W": 24.21353615, + "static_power_ici_W": 6.114104803, + "static_power_hbm_mc_W": 10.264041296, + "static_power_hbm_phy_W": 15.396061944, + "static_power_other_W": 44.82811018, + "dynamic_power_W_per_SA": 28.19413333, + "dynamic_power_W_per_VU": 2.65216, + "dynamic_power_vmem_W": 50.18368, + "dynamic_power_ici_W_per_GBps": 0.01767315271, + "dynamic_power_hbm_W_per_GBps": 0.01261538462, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "gligen", + "model_type": "gligen", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 
3, + "num_tensor_parallel_axes": 0, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv", + "num_diffusion_steps": 1, + "total_num_diffusion_steps": 40, + "image_resolution": [ + 512, + 512 + ], + "image_num_channels": 3, + "use_flash_attention": true, + "fourier_embedder_config": { + "num_freqs": 64 + }, + "text_embedder_config": { + "d_model": 512, + "num_heads": 8, + "d_head": 64, + "d_ff": 2048, + "num_layers": 12, + "ffn_type": "default" + }, + "image_embedder_config": { + "model_type": "vit", + "patch_size": 2, + "d_model": 1024, + "num_heads": 16, + "d_head": 64, + "d_ff": 4096, + "num_layers": 24, + "ffn_type": "default" + }, + "spatial_condition_embedder_config": { + "model_type": "convnext", + "stem": { + "in_channels": 3, + "out_channels": 96, + "kernel_size": 4, + "stride": 4 + }, + "depths": [ + 3, + 3, + 9, + 3 + ], + "dims": [ + 96, + 192, + 384, + 768 + ] + }, + "grounding_input_config": { + "text": { + "input_seqlen": 512, + "feature_dim": 768 + }, + "bbox": { + "input_seqlen": 8, + "feature_dim": 4, + "grounding_token_feature_dim": 768 + }, + "image": { + "resolution": [ + 1024, + 1024 + ], + "image_num_channels": 3 + }, + "keypoint": { + "num_persons": 10, + "num_keypoints": 17, + "feature_dim": 256 + }, + "spatial_condition": { + "resolution": [ + 256, + 256 + ], + "num_channels": 1 + } + }, + "unet_config": { + "noisy_latent_resolution": [ + 64, + 64 + ], + "model_channels": 320, + "attention_resolutions": [ + 4, + 2, + 1 + ], + "num_res_blocks": 2, + "channel_mult": [ + 1, + 2, + 4, + 4 + ], + "num_heads": 8, + "context_dim": 768 + }, + "output_dir": "./llava_ops" + } +} \ No newline at end of file diff --git 
a/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv b/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv new file mode 100644 index 0000000..5bbb786 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv @@ -0,0 +1,635 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency 
(%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,Time-Embed-MLP-FFi2,"XlaEinsum(a=1x320,b=320x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPFFi2einsum,MXU,1,Memory,500,192,500,0,0,0,0,0,0,0,0,192,20,0,0,822400,"DT_BFLOAT16:[1,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,1280)]",819200,TimeEmbedMLPFFi2einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 320], [320, 1280], [1, 1280]]",1,822400,10,1,1,1280,320,0,192,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.6384,1531.839370727539,0.0007751937984496124,0.20700532036858635,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,Time-Embed-MLP-FFo2,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPFFo2einsum,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPFFo2einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,3281920,25,1,1,1280,1280,0,432,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,6.5536,6113.0523681640625,0.0031007751937984496,0.8260881578600084,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,Conv2d5Conv2d,"Conv2D(a=1x3x64x64,b=3x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d5Conv2dconv2d,MXU,1,Compute,544,544,500,0,0,0,0,0,0,0,0,544,64,0,0,2663296,"DT_BFLOAT16:[1,3,64,64],DT_BFLOAT16:[3,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",70778880,Conv2d5Conv2dconv2d,Conv2D,17280,[],Conv2D,bf01;io01->bf01,"[[1, 3, 64, 64], [3, 320, 3, 3], [1, 320, 64, 
64]]",1,2664856,32,1,320,4096,27,0,544,2663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,192,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,130.10823529411763,4559.5361905939435,0.0615595075239398,0.6161535392694518,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,Time-Embed-MLP-Einsum6,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum6einsum,MXU,1,Memory,500,192,500,0,0,0,0,0,0,0,0,192,20,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum6einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 320]]",1,822400,10,1,1,320,1280,0,192,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.6384,1531.839370727539,0.0007751937984496124,0.20700532036858635,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Conv2d-GroupNorm7,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm7XnormGroupNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm7XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Conv2d7Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d7Conv2dconv2d,MXU,1,Compute,6176,6176,892,0,0,0,0,0,0,0,0,6176,768,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d7Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 
320, 3, 3], [1, 320, 64, 64]]",1,7252480,384,1,320,4096,2880,0,6176,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1644,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1222.4331606217618,1068.5599529681429,0.5783829377033378,0.14439999364434364,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,Conv2d-GroupNorm9,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm9XnormGroupNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm9XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,Conv2d9Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d9Conv2dconv2d,MXU,1,Compute,6176,6176,892,0,0,0,0,0,0,0,0,6176,768,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d9Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,384,1,320,4096,2880,0,6176,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1644,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1222.4331606217618,1068.5599529681429,0.5783829377033378,0.14439999364434364,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+11,SpatialTransformer-Input_GroupNorm11,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm11XnormGroupNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm11XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +12,SpatialTransformer-Proj_in12,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin12einsum,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin12einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+13,BasicTransformerBlock-Input_layernorm13,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm13XnormLayerNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm13XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,SelfAttention14-Q-14,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention14Q14MatMulQ,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention14Q14MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,SelfAttention14-K-14,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention14K14MatMulK,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention14K14MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,SelfAttention14-V-14,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention14V14MatMulV,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention14V14MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,SelfAttention14-FlashAttention-15,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention14FlashAttention15FlashAttention,MXU,1,Compute,65600,65600,1320,0,0,0,0,0,0,0,0,65600,40960,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention14FlashAttention15FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 256]",,4870144,4096,8,4096,4096,40,32768,65600,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16549,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,73.65607024390243,148.86623475609755,0.03484968803176403,0.020117058750823993,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+16,SelfAttention14-Attention_output-16,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention14Attentionoutput16MatMulattnOutputattnAvgWo,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention14Attentionoutput16MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,SelfAttention14-Attention_layernorm-17,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention14Attentionlayernorm17YnormLayerNormy,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention14Attentionlayernorm17YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,GatedSelfAttention-Linear18,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear18XLinearcontext,MXU,1,Memory,500,30,500,0,0,0,0,0,0,0,0,0,30,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear18XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 
320]]",1,508928,6,1,8,320,768,0,30,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.86432,947.9522705078125,0.0037209302325581397,0.12810165817673141,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,GatedSelfAttention-Attn18-Q-19,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18Q19MatMulQ,MXU,1,Compute,1120,1120,687,0,0,0,0,0,0,0,0,1120,136,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn18Q19MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,68,1,4104,320,320,0,1120,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,750.4457142857143,4538.467952183315,0.35506644518272423,0.6133064800247723,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,GatedSelfAttention-Attn18-K-19,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18K19MatMulK,MXU,1,Compute,1120,1120,687,0,0,0,0,0,0,0,0,1120,136,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn18K19MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,68,1,4104,320,320,0,1120,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,750.4457142857143,4538.467952183315,0.35506644518272423,0.6133064800247723,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+19,GatedSelfAttention-Attn18-V-19,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18V19MatMulV,MXU,1,Compute,1120,1120,687,0,0,0,0,0,0,0,0,1120,136,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn18V19MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,68,1,4104,320,320,0,1120,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,750.4457142857143,4538.467952183315,0.35506644518272423,0.6133064800247723,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,GatedSelfAttention-Attn18-FlashAttention-20,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn18FlashAttention20FlashAttention,MXU,1,Compute,74048,74048,1323,0,0,0,0,0,0,0,0,74048,42144,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn18FlashAttention20FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 256]",,4879616,4624,8,4104,4104,40,32896,74048,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,18661,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,65.50792739844425,132.1399428253042,0.030994469646338767,0.01785674903044651,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+21,GatedSelfAttention-Attn18-Attention_output-21,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18Attentionoutput21MatMulattnOutputattnAvgWo,MXU,1,Compute,1120,1120,687,0,0,0,0,0,0,0,0,1120,136,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn18Attentionoutput21MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,68,1,4104,320,320,0,1120,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,750.4457142857143,4538.467952183315,0.35506644518272423,0.6133064800247723,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,GatedSelfAttention-Attn18-Attention_layernorm-22,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn18Attentionlayernorm22YnormLayerNormy,VPU,1,Memory,662,642,662,0,0,0,0,0,0,0,0,0,642,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn18Attentionlayernorm22YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,642,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,235,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.870453172205437,7390.255654326378,0.968655589123867,0.9986831965305917,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,GatedSelfAttention-FFN18Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN18FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,2592,2592,1753,0,0,0,0,0,0,0,0,2592,320,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN18FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], 
[1, 4096, 1280]]",1,13926400,160,1,4096,1280,320,0,2592,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1294.5382716049382,5003.846721884645,0.6124988037132739,0.6761955029573845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,GatedSelfAttention-FFN18Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN18FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,2592,2592,1753,0,0,0,0,0,0,0,0,2592,320,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN18FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,13926400,160,1,4096,320,1280,0,2592,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1294.5382716049382,5003.846721884645,0.6124988037132739,0.6761955029573845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +25,GatedSelfAttention-FFN18Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN18FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,660,80,660,0,0,0,0,0,0,0,0,0,80,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN18FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,80,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,94,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9859393939393941,7398.200757575758,0.12121212121212122,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+26,BasicTransformerBlock-Fuser_output_layernorm26,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm26XnormLayerNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm26XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,CrossAttention27-Q-27,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention27Q27MatMulQ,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention27Q27MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,CrossAttention27-K-27,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention27K27MatMulK,MXU,1,Memory,500,224,500,0,0,0,0,0,0,0,0,224,24,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention27K27MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 
40]]",1,1605632,12,1,512,320,768,0,224,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,112,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,503.31648,2990.72265625,0.23813953488372094,0.40415171030405406,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,CrossAttention27-V-27,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention27V27MatMulV,MXU,1,Memory,500,224,500,0,0,0,0,0,0,0,0,224,24,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention27V27MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,12,1,512,320,768,0,224,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,112,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,503.31648,2990.72265625,0.23813953488372094,0.40415171030405406,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,CrossAttention27-FlashAttention-28,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention27FlashAttention28FlashAttention,MXU,1,Compute,8256,8256,743,0,0,0,0,0,0,0,0,8256,5120,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention27FlashAttention28FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 256]",,626688,512,8,512,4096,40,4096,8256,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2147,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,73.15646511627907,665.3541742369187,0.03461330448891292,0.08991272624823225,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+29,CrossAttention27-Attention_output-29,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention27Attentionoutput29MatMulattnOutputattnAvgWo,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention27Attentionoutput29MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +30,CrossAttention27-Attention_layernorm-30,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention27Attentionlayernorm30YnormLayerNormy,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention27Attentionlayernorm30YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+31,BasicTransformerBlock-Attn_output_layernorm31,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm31XnormLayerNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm31XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +32,BasicTransformerBlock-FFN32Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN32FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,2592,2592,1753,0,0,0,0,0,0,0,0,2592,320,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN32FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,160,1,4096,1280,320,0,2592,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1294.5382716049382,5003.846721884645,0.6124988037132739,0.6761955029573845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +33,BasicTransformerBlock-FFN32Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN32FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,2592,2592,1753,0,0,0,0,0,0,0,0,2592,320,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN32FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 
320], [1, 4096, 320]]",1,13926400,160,1,4096,320,1280,0,2592,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1294.5382716049382,5003.846721884645,0.6124988037132739,0.6761955029573845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +34,BasicTransformerBlock-FFN32Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN32FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,660,80,660,0,0,0,0,0,0,0,0,0,80,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN32FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,80,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,94,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9859393939393941,7398.200757575758,0.12121212121212122,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +35,SpatialTransformer-Proj_out35,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout35einsum,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout35einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+36,Time-Embed-MLP-Einsum36,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum36einsum,MXU,1,Memory,500,192,500,0,0,0,0,0,0,0,0,192,20,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum36einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 320]]",1,822400,10,1,1,320,1280,0,192,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.6384,1531.839370727539,0.0007751937984496124,0.20700532036858635,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +37,Conv2d-GroupNorm37,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm37XnormGroupNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm37XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +38,Conv2d37Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d37Conv2dconv2d,MXU,1,Compute,6176,6176,892,0,0,0,0,0,0,0,0,6176,768,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d37Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 
64]]",1,7252480,384,1,320,4096,2880,0,6176,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1644,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1222.4331606217618,1068.5599529681429,0.5783829377033378,0.14439999364434364,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +39,Conv2d-GroupNorm39,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm39XnormGroupNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm39XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +40,Conv2d39Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d39Conv2dconv2d,MXU,1,Compute,6176,6176,892,0,0,0,0,0,0,0,0,6176,768,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d39Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,384,1,320,4096,2880,0,6176,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1644,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1222.4331606217618,1068.5599529681429,0.5783829377033378,0.14439999364434364,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+41,SpatialTransformer-Input_GroupNorm41,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm41XnormGroupNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm41XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +42,SpatialTransformer-Proj_in42,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin42einsum,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin42einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+43,BasicTransformerBlock-Input_layernorm43,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm43XnormLayerNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm43XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +44,SelfAttention44-Q-44,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention44Q44MatMulQ,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention44Q44MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +44,SelfAttention44-K-44,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention44K44MatMulK,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention44K44MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +44,SelfAttention44-V-44,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention44V44MatMulV,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention44V44MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +45,SelfAttention44-FlashAttention-45,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention44FlashAttention45FlashAttention,MXU,1,Compute,65600,65600,1320,0,0,0,0,0,0,0,0,65600,40960,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention44FlashAttention45FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 256]",,4870144,4096,8,4096,4096,40,32768,65600,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16549,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,73.65607024390243,148.86623475609755,0.03484968803176403,0.020117058750823993,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+46,SelfAttention44-Attention_output-46,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention44Attentionoutput46MatMulattnOutputattnAvgWo,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention44Attentionoutput46MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +47,SelfAttention44-Attention_layernorm-47,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention44Attentionlayernorm47YnormLayerNormy,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention44Attentionlayernorm47YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +48,GatedSelfAttention-Linear48,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear48XLinearcontext,MXU,1,Memory,500,30,500,0,0,0,0,0,0,0,0,0,30,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear48XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 
320]]",1,508928,6,1,8,320,768,0,30,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.86432,947.9522705078125,0.0037209302325581397,0.12810165817673141,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +49,GatedSelfAttention-Attn48-Q-49,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48Q49MatMulQ,MXU,1,Compute,1120,1120,687,0,0,0,0,0,0,0,0,1120,136,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn48Q49MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,68,1,4104,320,320,0,1120,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,750.4457142857143,4538.467952183315,0.35506644518272423,0.6133064800247723,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +49,GatedSelfAttention-Attn48-K-49,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48K49MatMulK,MXU,1,Compute,1120,1120,687,0,0,0,0,0,0,0,0,1120,136,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn48K49MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,68,1,4104,320,320,0,1120,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,750.4457142857143,4538.467952183315,0.35506644518272423,0.6133064800247723,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+49,GatedSelfAttention-Attn48-V-49,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48V49MatMulV,MXU,1,Compute,1120,1120,687,0,0,0,0,0,0,0,0,1120,136,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn48V49MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,68,1,4104,320,320,0,1120,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,750.4457142857143,4538.467952183315,0.35506644518272423,0.6133064800247723,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +50,GatedSelfAttention-Attn48-FlashAttention-50,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn48FlashAttention50FlashAttention,MXU,1,Compute,74048,74048,1323,0,0,0,0,0,0,0,0,74048,42144,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn48FlashAttention50FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 256]",,4879616,4624,8,4104,4104,40,32896,74048,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,18661,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,65.50792739844425,132.1399428253042,0.030994469646338767,0.01785674903044651,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+51,GatedSelfAttention-Attn48-Attention_output-51,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48Attentionoutput51MatMulattnOutputattnAvgWo,MXU,1,Compute,1120,1120,687,0,0,0,0,0,0,0,0,1120,136,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn48Attentionoutput51MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,68,1,4104,320,320,0,1120,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,750.4457142857143,4538.467952183315,0.35506644518272423,0.6133064800247723,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +52,GatedSelfAttention-Attn48-Attention_layernorm-52,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn48Attentionlayernorm52YnormLayerNormy,VPU,1,Memory,662,642,662,0,0,0,0,0,0,0,0,0,642,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn48Attentionlayernorm52YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,642,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,235,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.870453172205437,7390.255654326378,0.968655589123867,0.9986831965305917,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +53,GatedSelfAttention-FFN48Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN48FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,2592,2592,1753,0,0,0,0,0,0,0,0,2592,320,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN48FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], 
[1, 4096, 1280]]",1,13926400,160,1,4096,1280,320,0,2592,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1294.5382716049382,5003.846721884645,0.6124988037132739,0.6761955029573845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +54,GatedSelfAttention-FFN48Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN48FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,2592,2592,1753,0,0,0,0,0,0,0,0,2592,320,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN48FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,13926400,160,1,4096,320,1280,0,2592,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1294.5382716049382,5003.846721884645,0.6124988037132739,0.6761955029573845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +55,GatedSelfAttention-FFN48Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN48FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,660,80,660,0,0,0,0,0,0,0,0,0,80,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN48FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,80,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,94,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9859393939393941,7398.200757575758,0.12121212121212122,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+56,BasicTransformerBlock-Fuser_output_layernorm56,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm56XnormLayerNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm56XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +57,CrossAttention57-Q-57,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention57Q57MatMulQ,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention57Q57MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +57,CrossAttention57-K-57,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention57K57MatMulK,MXU,1,Memory,500,224,500,0,0,0,0,0,0,0,0,224,24,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention57K57MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 
40]]",1,1605632,12,1,512,320,768,0,224,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,112,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,503.31648,2990.72265625,0.23813953488372094,0.40415171030405406,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +57,CrossAttention57-V-57,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention57V57MatMulV,MXU,1,Memory,500,224,500,0,0,0,0,0,0,0,0,224,24,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention57V57MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,12,1,512,320,768,0,224,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,112,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,503.31648,2990.72265625,0.23813953488372094,0.40415171030405406,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +58,CrossAttention57-FlashAttention-58,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention57FlashAttention58FlashAttention,MXU,1,Compute,8256,8256,743,0,0,0,0,0,0,0,0,8256,5120,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention57FlashAttention58FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 256]",,626688,512,8,512,4096,40,4096,8256,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2147,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,73.15646511627907,665.3541742369187,0.03461330448891292,0.08991272624823225,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+59,CrossAttention57-Attention_output-59,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention57Attentionoutput59MatMulattnOutputattnAvgWo,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention57Attentionoutput59MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +60,CrossAttention57-Attention_layernorm-60,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention57Attentionlayernorm60YnormLayerNormy,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention57Attentionlayernorm60YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+61,BasicTransformerBlock-Attn_output_layernorm61,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm61XnormLayerNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm61XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +62,BasicTransformerBlock-FFN62Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN62FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,2592,2592,1753,0,0,0,0,0,0,0,0,2592,320,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN62FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,160,1,4096,1280,320,0,2592,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1294.5382716049382,5003.846721884645,0.6124988037132739,0.6761955029573845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +63,BasicTransformerBlock-FFN62Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN62FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,2592,2592,1753,0,0,0,0,0,0,0,0,2592,320,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN62FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 
320], [1, 4096, 320]]",1,13926400,160,1,4096,320,1280,0,2592,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1294.5382716049382,5003.846721884645,0.6124988037132739,0.6761955029573845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +64,BasicTransformerBlock-FFN62Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN62FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,660,80,660,0,0,0,0,0,0,0,0,0,80,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN62FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,80,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,94,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9859393939393941,7398.200757575758,0.12121212121212122,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +65,SpatialTransformer-Proj_out65,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout65einsum,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout65einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +66,Downsample-Conv2d66Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=2x2 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",DownsampleConv2d66Conv2dconv2d,MXU,1,Compute,1568,1568,645,0,0,0,0,0,0,0,0,1568,192,0,0,5120000,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,32,32)]",1887436800,DownsampleConv2d66Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 32, 32]]",1,5286400,96,1,320,1024,2880,0,1568,5120000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,464,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1203.7224489795917,3041.0533048668685,0.5695301376364499,0.4109531493063336,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +67,Time-Embed-MLP-Einsum67,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum67einsum,MXU,1,Memory,500,272,500,0,0,0,0,0,0,0,0,272,30,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum67einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1642240,15,1,1,640,1280,0,272,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,124,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.2768,3058.910369873047,0.0015503875968992248,0.41336626619906036,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +68,Conv2d-GroupNorm68,"GroupNorm(x=1x320x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm68XnormGroupNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,320,32,32]","[DT_BFLOAT16:(1,320,32,32)]",2621440,Conv2dGroupNorm68XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+69,Conv2d68Conv2d,"Conv2D(a=1x320x32x32,b=320x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d68Conv2dconv2d,MXU,1,Compute,2336,2336,712,0,0,0,0,0,0,0,0,2336,288,0,0,5652480,"DT_BFLOAT16:[1,320,32,32],DT_BFLOAT16:[320,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",3774873600,Conv2d68Conv2dconv2d,Conv2D,3686400,[],Conv2D,bf01;io01->bf01,"[[1, 320, 32, 32], [320, 640, 3, 3], [1, 640, 32, 32]]",1,5736960,144,1,640,1024,2880,0,2336,5652480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,664,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1615.9561643835618,2253.5454736997003,0.7645747053201657,0.3045331721215811,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +70,Conv2d-GroupNorm70,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm70XnormGroupNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm70XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +71,Conv2d70Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d70Conv2dconv2d,MXU,1,Compute,4448,4448,1258,0,0,0,0,0,0,0,0,4448,552,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d70Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 
32]]",1,10163200,276,1,640,1024,5760,0,4448,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1254,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1697.3352517985613,2092.5947230496854,0.8030784674585913,0.2827830706823899,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +72,SkipConnection-Einsum67,"XlaEinsum(a=1x32x32x320,b=320x640,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum67einsum,MXU,1,Memory,500,416,500,0,0,0,0,0,0,0,0,416,48,0,0,2375680,"DT_BFLOAT16:[1,32,32,320],DT_BFLOAT16:[320,640]","[DT_BFLOAT16:(1,32,32,640)]",419430400,SkipConnectionEinsum67einsum,Einsum,409600,[],Einsum,"BHWC,CO->BHWO","[[1, 32, 32, 320], [320, 640], [1, 32, 32, 640]]",1,2375680,24,1,1024,640,320,0,416,2375680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,160,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,838.8608,4425.048828125,0.39689922480620154,0.5979795713682432,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +73,SpatialTransformer-Input_GroupNorm73,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm73XnormGroupNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm73XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+74,SpatialTransformer-Proj_in74,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin74einsum,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin74einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +75,BasicTransformerBlock-Input_layernorm75,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm75XnormLayerNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm75XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +76,SelfAttention76-Q-76,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention76Q76MatMulQ,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention76Q76MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +76,SelfAttention76-K-76,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention76K76MatMulK,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention76K76MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +76,SelfAttention76-V-76,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention76V76MatMulV,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention76V76MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +77,SelfAttention76-FlashAttention-77,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",SelfAttention76FlashAttention77FlashAttention,MXU,1,Compute,4160,4160,660,0,0,0,0,0,0,0,0,4160,2560,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention76FlashAttention77FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 256]",,1417216,256,8,1024,1024,80,2048,4160,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1114,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,72.59372307692308,1173.7530048076924,0.03434704830053668,0.15861527091995842,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +78,SelfAttention76-Attention_output-78,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention76Attentionoutput78MatMulattnOutputattnAvgWo,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention76Attentionoutput78MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+79,SelfAttention76-Attention_layernorm-79,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention76Attentionlayernorm79YnormLayerNormy,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention76Attentionlayernorm79YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +80,GatedSelfAttention-Linear80,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear80XLinearcontext,MXU,1,Memory,500,176,500,0,0,0,0,0,0,0,0,176,18,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear80XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 640]]",1,1005568,9,1,8,640,768,0,176,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.72864,1873.016357421875,0.0074418604651162795,0.25311031857052363,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +81,GatedSelfAttention-Attn80-Q-81,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80Q81MatMulQ,MXU,1,Compute,752,752,500,0,0,0,0,0,0,0,0,752,90,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn80Q81MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 
80]]",1,3461120,45,1,1032,640,640,0,752,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1124.2212765957447,4286.4616881025595,0.5319148936170213,0.5792515794733188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +81,GatedSelfAttention-Attn80-K-81,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80K81MatMulK,MXU,1,Compute,752,752,500,0,0,0,0,0,0,0,0,752,90,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn80K81MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,45,1,1032,640,640,0,752,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1124.2212765957447,4286.4616881025595,0.5319148936170213,0.5792515794733188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +81,GatedSelfAttention-Attn80-V-81,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80V81MatMulV,MXU,1,Compute,752,752,500,0,0,0,0,0,0,0,0,752,90,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn80V81MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,45,1,1032,640,640,0,752,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1124.2212765957447,4286.4616881025595,0.5319148936170213,0.5792515794733188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +82,GatedSelfAttention-Attn80-FlashAttention-82,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",GatedSelfAttentionAttn80FlashAttention82FlashAttention,MXU,1,Compute,6464,6464,665,0,0,0,0,0,0,0,0,6464,2880,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn80FlashAttention82FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 256]",,1427968,400,8,1032,1032,80,2080,6464,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1691,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,47.45156435643564,761.2870471312268,0.022451268564356433,0.10287662799070632,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +83,GatedSelfAttention-Attn80-Attention_output-83,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80Attentionoutput83MatMulattnOutputattnAvgWo,MXU,1,Compute,752,752,500,0,0,0,0,0,0,0,0,752,90,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn80Attentionoutput83MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,45,1,1032,640,640,0,752,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1124.2212765957447,4286.4616881025595,0.5319148936170213,0.5792515794733188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+84,GatedSelfAttention-Attn80-Attention_layernorm-84,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn80Attentionlayernorm84YnormLayerNormy,VPU,1,Memory,500,323,500,0,0,0,0,0,0,0,0,0,323,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn80Attentionlayernorm84YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,323,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,137,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.56768,4920.95947265625,0.6449999999999999,0.6649945233319257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +85,GatedSelfAttention-FFN80Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN80FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,1952,1952,1238,0,0,0,0,0,0,0,0,1952,240,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN80FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,120,1,1024,2560,640,0,1952,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,627,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1718.9770491803276,4690.201556096312,0.8133180836192654,0.6338110210940961,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +86,GatedSelfAttention-FFN80Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN80FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,1952,1952,1238,0,0,0,0,0,0,0,0,1952,240,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN80FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 
640]]",1,8126464,120,1,1024,640,2560,0,1952,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,627,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1718.9770491803276,4690.201556096312,0.8133180836192654,0.6338110210940961,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +87,GatedSelfAttention-FFN80Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN80FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN80FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,40,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,4882.8125,0.08,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +88,BasicTransformerBlock-Fuser_output_layernorm88,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm88XnormLayerNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm88XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +89,CrossAttention89-Q-89,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention89Q89MatMulQ,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention89Q89MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], 
[640, 8, 80], [1, 1024, 8, 80]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +89,CrossAttention89-K-89,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention89K89MatMulK,MXU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,320,36,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention89K89MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,18,1,512,640,768,0,320,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1006.63296,4516.6015625,0.4762790697674419,0.6103515625,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +89,CrossAttention89-V-89,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention89V89MatMulV,MXU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,320,36,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention89V89MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,18,1,512,640,768,0,320,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1006.63296,4516.6015625,0.4762790697674419,0.6103515625,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +90,CrossAttention89-FlashAttention-90,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention89FlashAttention90FlashAttention,MXU,1,Compute,2112,2112,500,0,0,0,0,0,0,0,0,2112,1280,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention89FlashAttention90FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 256]",,729088,128,8,512,1024,80,1024,2112,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,584,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,71.49381818181818,1733.9533025568182,0.03382663847780127,0.2343180138590295,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +91,CrossAttention89-Attention_output-91,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention89Attentionoutput91MatMulattnOutputattnAvgWo,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention89Attentionoutput91MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+92,CrossAttention89-Attention_layernorm-92,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention89Attentionlayernorm92YnormLayerNormy,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention89Attentionlayernorm92YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +93,BasicTransformerBlock-Attn_output_layernorm93,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm93XnormLayerNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm93XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +94,BasicTransformerBlock-FFN94Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN94FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,1952,1952,1238,0,0,0,0,0,0,0,0,1952,240,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN94FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 
2560]]",1,9830400,120,1,1024,2560,640,0,1952,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,627,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1718.9770491803276,4690.201556096312,0.8133180836192654,0.6338110210940961,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +95,BasicTransformerBlock-FFN94Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN94FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,1952,1952,1238,0,0,0,0,0,0,0,0,1952,240,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN94FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,8126464,120,1,1024,640,2560,0,1952,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,627,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1718.9770491803276,4690.201556096312,0.8133180836192654,0.6338110210940961,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +96,BasicTransformerBlock-FFN94Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN94FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN94FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,40,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,4882.8125,0.08,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+97,SpatialTransformer-Proj_out97,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout97einsum,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout97einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +98,Time-Embed-MLP-Einsum98,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum98einsum,MXU,1,Memory,500,272,500,0,0,0,0,0,0,0,0,272,30,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum98einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1642240,15,1,1,640,1280,0,272,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,124,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.2768,3058.910369873047,0.0015503875968992248,0.41336626619906036,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +99,Conv2d-GroupNorm99,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm99XnormGroupNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm99XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+100,Conv2d99Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d99Conv2dconv2d,MXU,1,Compute,4448,4448,1258,0,0,0,0,0,0,0,0,4448,552,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d99Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,276,1,640,1024,5760,0,4448,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1254,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1697.3352517985613,2092.5947230496854,0.8030784674585913,0.2827830706823899,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +101,Conv2d-GroupNorm101,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm101XnormGroupNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm101XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +102,Conv2d101Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d101Conv2dconv2d,MXU,1,Compute,4448,4448,1258,0,0,0,0,0,0,0,0,4448,552,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d101Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 
32]]",1,10163200,276,1,640,1024,5760,0,4448,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1254,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1697.3352517985613,2092.5947230496854,0.8030784674585913,0.2827830706823899,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +103,SpatialTransformer-Input_GroupNorm103,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm103XnormGroupNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm103XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +104,SpatialTransformer-Proj_in104,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin104einsum,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin104einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+105,BasicTransformerBlock-Input_layernorm105,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm105XnormLayerNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm105XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +106,SelfAttention106-Q-106,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention106Q106MatMulQ,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention106Q106MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +106,SelfAttention106-K-106,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention106K106MatMulK,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention106K106MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +106,SelfAttention106-V-106,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention106V106MatMulV,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention106V106MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +107,SelfAttention106-FlashAttention-107,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention106FlashAttention107FlashAttention,MXU,1,Compute,4160,4160,660,0,0,0,0,0,0,0,0,4160,2560,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention106FlashAttention107FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 256]",,1417216,256,8,1024,1024,80,2048,4160,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1114,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,72.59372307692308,1173.7530048076924,0.03434704830053668,0.15861527091995842,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+108,SelfAttention106-Attention_output-108,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention106Attentionoutput108MatMulattnOutputattnAvgWo,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention106Attentionoutput108MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +109,SelfAttention106-Attention_layernorm-109,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention106Attentionlayernorm109YnormLayerNormy,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention106Attentionlayernorm109YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +110,GatedSelfAttention-Linear110,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear110XLinearcontext,MXU,1,Memory,500,176,500,0,0,0,0,0,0,0,0,176,18,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear110XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 
640]]",1,1005568,9,1,8,640,768,0,176,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.72864,1873.016357421875,0.0074418604651162795,0.25311031857052363,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +111,GatedSelfAttention-Attn110-Q-111,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110Q111MatMulQ,MXU,1,Compute,752,752,500,0,0,0,0,0,0,0,0,752,90,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn110Q111MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,45,1,1032,640,640,0,752,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1124.2212765957447,4286.4616881025595,0.5319148936170213,0.5792515794733188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +111,GatedSelfAttention-Attn110-K-111,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110K111MatMulK,MXU,1,Compute,752,752,500,0,0,0,0,0,0,0,0,752,90,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn110K111MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,45,1,1032,640,640,0,752,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1124.2212765957447,4286.4616881025595,0.5319148936170213,0.5792515794733188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+111,GatedSelfAttention-Attn110-V-111,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110V111MatMulV,MXU,1,Compute,752,752,500,0,0,0,0,0,0,0,0,752,90,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn110V111MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,45,1,1032,640,640,0,752,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1124.2212765957447,4286.4616881025595,0.5319148936170213,0.5792515794733188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +112,GatedSelfAttention-Attn110-FlashAttention-112,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn110FlashAttention112FlashAttention,MXU,1,Compute,6464,6464,665,0,0,0,0,0,0,0,0,6464,2880,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn110FlashAttention112FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 256]",,1427968,400,8,1032,1032,80,2080,6464,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1691,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,47.45156435643564,761.2870471312268,0.022451268564356433,0.10287662799070632,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+113,GatedSelfAttention-Attn110-Attention_output-113,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110Attentionoutput113MatMulattnOutputattnAvgWo,MXU,1,Compute,752,752,500,0,0,0,0,0,0,0,0,752,90,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn110Attentionoutput113MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,45,1,1032,640,640,0,752,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1124.2212765957447,4286.4616881025595,0.5319148936170213,0.5792515794733188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +114,GatedSelfAttention-Attn110-Attention_layernorm-114,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn110Attentionlayernorm114YnormLayerNormy,VPU,1,Memory,500,323,500,0,0,0,0,0,0,0,0,0,323,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn110Attentionlayernorm114YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,323,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,137,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.56768,4920.95947265625,0.6449999999999999,0.6649945233319257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +115,GatedSelfAttention-FFN110Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN110FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,1952,1952,1238,0,0,0,0,0,0,0,0,1952,240,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN110FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 
2560], [1, 1024, 2560]]",1,9830400,120,1,1024,2560,640,0,1952,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,627,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1718.9770491803276,4690.201556096312,0.8133180836192654,0.6338110210940961,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +116,GatedSelfAttention-FFN110Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN110FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,1952,1952,1238,0,0,0,0,0,0,0,0,1952,240,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN110FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,8126464,120,1,1024,640,2560,0,1952,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,627,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1718.9770491803276,4690.201556096312,0.8133180836192654,0.6338110210940961,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +117,GatedSelfAttention-FFN110Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN110FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN110FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,40,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,4882.8125,0.08,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+118,BasicTransformerBlock-Fuser_output_layernorm118,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm118XnormLayerNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm118XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +119,CrossAttention119-Q-119,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention119Q119MatMulQ,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention119Q119MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +119,CrossAttention119-K-119,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention119K119MatMulK,MXU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,320,36,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention119K119MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 
80]]",1,2424832,18,1,512,640,768,0,320,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1006.63296,4516.6015625,0.4762790697674419,0.6103515625,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +119,CrossAttention119-V-119,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention119V119MatMulV,MXU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,320,36,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention119V119MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,18,1,512,640,768,0,320,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1006.63296,4516.6015625,0.4762790697674419,0.6103515625,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +120,CrossAttention119-FlashAttention-120,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention119FlashAttention120FlashAttention,MXU,1,Compute,2112,2112,500,0,0,0,0,0,0,0,0,2112,1280,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention119FlashAttention120FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 256]",,729088,128,8,512,1024,80,1024,2112,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,584,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,71.49381818181818,1733.9533025568182,0.03382663847780127,0.2343180138590295,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+121,CrossAttention119-Attention_output-121,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention119Attentionoutput121MatMulattnOutputattnAvgWo,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention119Attentionoutput121MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +122,CrossAttention119-Attention_layernorm-122,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention119Attentionlayernorm122YnormLayerNormy,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention119Attentionlayernorm122YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+123,BasicTransformerBlock-Attn_output_layernorm123,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm123XnormLayerNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm123XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +124,BasicTransformerBlock-FFN124Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN124FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,1952,1952,1238,0,0,0,0,0,0,0,0,1952,240,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN124FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,120,1,1024,2560,640,0,1952,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,627,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1718.9770491803276,4690.201556096312,0.8133180836192654,0.6338110210940961,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +125,BasicTransformerBlock-FFN124Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN124FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,1952,1952,1238,0,0,0,0,0,0,0,0,1952,240,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN124FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 
640]]",1,8126464,120,1,1024,640,2560,0,1952,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,627,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1718.9770491803276,4690.201556096312,0.8133180836192654,0.6338110210940961,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +126,BasicTransformerBlock-FFN124Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN124FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN124FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,40,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,4882.8125,0.08,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +127,SpatialTransformer-Proj_out127,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout127einsum,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout127einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +128,Downsample-Conv2d128Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=2x2 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",DownsampleConv2d128Conv2dconv2d,MXU,1,Compute,1136,1136,1135,0,0,0,0,0,0,0,0,1136,138,0,0,9011200,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,16,16)]",1887436800,DownsampleConv2d128Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 16, 16]]",1,9180160,69,1,640,256,5760,0,1136,9011200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,412,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1661.4760563380282,7387.617943992077,0.7861120209629873,0.9983267491881185,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +129,Time-Embed-MLP-Einsum129,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum129einsum,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum129einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,3281920,25,1,1,1280,1280,0,432,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,6.5536,6113.0523681640625,0.0031007751937984496,0.8260881578600084,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +130,Conv2d-GroupNorm130,"GroupNorm(x=1x640x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm130XnormGroupNormX,VPU,1,Memory,500,80,500,0,0,0,0,0,0,0,0,0,80,0,0,655360,"DT_BFLOAT16:[1,640,16,16]","[DT_BFLOAT16:(1,640,16,16)]",1310720,Conv2dGroupNorm130XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,80,655360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,76,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.62144,1220.703125,0.16,0.16495988175675674,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+131,Conv2d130Conv2d,"Conv2D(a=1x640x16x16,b=640x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d130Conv2dconv2d,MXU,1,Memory,1980,1872,1980,0,0,0,0,0,0,0,0,1872,230,0,0,15728640,"DT_BFLOAT16:[1,640,16,16],DT_BFLOAT16:[640,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",3774873600,Conv2d130Conv2dconv2d,Conv2D,14745600,[],Conv2D,bf01;io01->bf01,"[[1, 640, 16, 16], [640, 1280, 3, 3], [1, 1280, 16, 16]]",1,15815680,115,1,1280,256,5760,0,1872,15728640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,691,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1906.5018181818182,7398.200757575758,0.9020436927413671,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +132,Conv2d-GroupNorm132,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm132XnormGroupNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm132XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +133,Conv2d132Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d132Conv2dconv2d,MXU,1,Memory,3877,3632,3877,0,0,0,0,0,0,0,0,3632,450,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d132Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 
16]]",1,30976000,225,1,1280,256,11520,0,3632,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1345,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1947.3167913335053,7399.154871679133,0.9213549195913887,0.9998857934701532,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +134,SkipConnection-Einsum129,"XlaEinsum(a=1x16x16x640,b=640x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum129einsum,MXU,1,Memory,500,272,500,0,0,0,0,0,0,0,0,272,30,0,0,2621440,"DT_BFLOAT16:[1,16,16,640],DT_BFLOAT16:[640,1280]","[DT_BFLOAT16:(1,16,16,1280)]",419430400,SkipConnectionEinsum129einsum,Einsum,1638400,[],Einsum,"BHWC,CO->BHWO","[[1, 16, 16, 640], [640, 1280], [1, 16, 16, 1280]]",1,2621440,15,1,256,1280,640,0,272,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,124,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,838.8608,4882.8125,0.39689922480620154,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +135,SpatialTransformer-Input_GroupNorm135,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm135XnormGroupNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm135XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+136,SpatialTransformer-Proj_in136,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin136einsum,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin136einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +137,BasicTransformerBlock-Input_layernorm137,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm137XnormLayerNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm137XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +138,SelfAttention138-Q-138,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention138Q138MatMulQ,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention138Q138MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +138,SelfAttention138-K-138,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention138K138MatMulK,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention138K138MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +138,SelfAttention138-V-138,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention138V138MatMulV,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention138V138MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +139,SelfAttention138-FlashAttention-139,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",SelfAttention138FlashAttention139FlashAttention,MXU,1,Memory,21445,320,21445,0,0,0,0,0,0,0,0,320,160,0,0,170393600,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention138FlashAttention139FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 256]",,170393600,16,8,256,256,160,128,320,170393600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2501,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.8801290743763115,7399.925681977151,0.0004164249269358608,0.9999899570239393,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +140,SelfAttention138-Attention_output-140,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention138Attentionoutput140MatMulattnOutputattnAvgWo,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention138Attentionoutput140MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+141,SelfAttention138-Attention_layernorm-141,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention138Attentionlayernorm141YnormLayerNormy,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention138Attentionlayernorm141YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +142,GatedSelfAttention-Linear142,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear142XLinearcontext,MXU,1,Memory,500,272,500,0,0,0,0,0,0,0,0,272,30,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear142XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 1280]]",1,1998848,15,1,8,1280,768,0,272,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,124,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.45728,3723.14453125,0.014883720930232559,0.5031276393581081,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +143,GatedSelfAttention-Attn142-Q-143,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142Q143MatMulQ,MXU,1,Compute,832,832,583,0,0,0,0,0,0,0,0,832,100,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn142Q143MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 
160]]",1,4628480,50,1,264,1280,1280,0,832,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,273,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1039.753846153846,5181.019122783954,0.49194991055456166,0.7001377192951289,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +143,GatedSelfAttention-Attn142-K-143,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142K143MatMulK,MXU,1,Compute,832,832,583,0,0,0,0,0,0,0,0,832,100,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn142K143MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,4628480,50,1,264,1280,1280,0,832,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,273,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1039.753846153846,5181.019122783954,0.49194991055456166,0.7001377192951289,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +143,GatedSelfAttention-Attn142-V-143,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142V143MatMulV,MXU,1,Compute,832,832,583,0,0,0,0,0,0,0,0,832,100,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn142V143MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,4628480,50,1,264,1280,1280,0,832,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,273,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1039.753846153846,5181.019122783954,0.49194991055456166,0.7001377192951289,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +144,GatedSelfAttention-Attn142-FlashAttention-144,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",GatedSelfAttentionAttn142FlashAttention144FlashAttention,MXU,1,Memory,22796,1088,22796,0,0,0,0,0,0,0,0,1088,264,0,0,181125120,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn142FlashAttention144FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 264]",,181125120,64,8,264,264,160,136,1088,181125120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2846,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.8805250043867345,7399.80317099217,0.00041661225755640525,0.9999734014854283,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +145,GatedSelfAttention-Attn142-Attention_output-145,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142Attentionoutput145MatMulattnOutputattnAvgWo,MXU,1,Compute,832,832,583,0,0,0,0,0,0,0,0,832,100,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn142Attentionoutput145MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,4628480,50,1,264,1280,1280,0,832,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,273,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1039.753846153846,5181.019122783954,0.49194991055456166,0.7001377192951289,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+146,GatedSelfAttention-Attn142-Attention_layernorm-146,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn142Attentionlayernorm146YnormLayerNormy,VPU,1,Memory,500,165,500,0,0,0,0,0,0,0,0,0,165,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn142Attentionlayernorm146YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,165,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,97,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.40672,2517.7001953125,0.33,0.3402297561233108,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +147,GatedSelfAttention-FFN142Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN142FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,2062,1632,2062,0,0,0,0,0,0,0,0,1632,200,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN142FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,16384000,100,1,256,5120,1280,0,1632,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,640,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1627.2760426770126,7399.994695683802,0.7699306009819622,0.9999992832005138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +148,GatedSelfAttention-FFN142Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN142FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,2062,1632,2062,0,0,0,0,0,0,0,0,1632,200,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN142FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 
1280]]",1,6946816,100,1,256,1280,5120,0,1632,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,640,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1627.2760426770126,7399.994695683802,0.7699306009819622,0.9999992832005138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +149,GatedSelfAttention-FFN142Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN142FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,20,500,0,0,0,0,0,0,0,0,0,20,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN142FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,20,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.65536,2441.40625,0.04,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +150,BasicTransformerBlock-Fuser_output_layernorm150,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm150XnormLayerNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm150XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+151,CrossAttention151-Q-151,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention151Q151MatMulQ,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention151Q151MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +151,CrossAttention151-K-151,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention151K151MatMulK,MXU,1,Compute,512,512,512,0,0,0,0,0,0,0,0,512,60,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention151K151MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,30,1,512,1280,768,0,512,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1966.08,7390.9759521484375,0.9302325581395349,0.9987805340741132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +151,CrossAttention151-V-151,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention151V151MatMulV,MXU,1,Compute,512,512,512,0,0,0,0,0,0,0,0,512,60,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention151V151MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 
160]]",1,4063232,30,1,512,1280,768,0,512,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1966.08,7390.9759521484375,0.9302325581395349,0.9987805340741132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +152,CrossAttention151-FlashAttention-152,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention151FlashAttention152FlashAttention,MXU,1,Compute,576,576,500,0,0,0,0,0,0,0,0,576,320,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention151FlashAttention152FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 256]",,933888,32,8,512,256,160,256,576,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,65.536,6357.828776041667,0.031007751937984496,0.8591660508164415,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +153,CrossAttention151-Attention_output-153,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention151Attentionoutput153MatMulattnOutputattnAvgWo,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention151Attentionoutput153MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+154,CrossAttention151-Attention_layernorm-154,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention151Attentionlayernorm154YnormLayerNormy,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention151Attentionlayernorm154YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +155,BasicTransformerBlock-Attn_output_layernorm155,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm155XnormLayerNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm155XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +156,BasicTransformerBlock-FFN156Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN156FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,2062,1632,2062,0,0,0,0,0,0,0,0,1632,200,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN156FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 
5120]]",1,16384000,100,1,256,5120,1280,0,1632,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,640,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1627.2760426770126,7399.994695683802,0.7699306009819622,0.9999992832005138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +157,BasicTransformerBlock-FFN156Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN156FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,2062,1632,2062,0,0,0,0,0,0,0,0,1632,200,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN156FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,6946816,100,1,256,1280,5120,0,1632,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,640,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1627.2760426770126,7399.994695683802,0.7699306009819622,0.9999992832005138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +158,BasicTransformerBlock-FFN156Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN156FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,20,500,0,0,0,0,0,0,0,0,0,20,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN156FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,20,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.65536,2441.40625,0.04,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+159,SpatialTransformer-Proj_out159,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout159einsum,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout159einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +160,Time-Embed-MLP-Einsum160,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum160einsum,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum160einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,3281920,25,1,1,1280,1280,0,432,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,6.5536,6113.0523681640625,0.0031007751937984496,0.8260881578600084,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +161,Conv2d-GroupNorm161,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm161XnormGroupNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm161XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+162,Conv2d161Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d161Conv2dconv2d,MXU,1,Memory,3877,3632,3877,0,0,0,0,0,0,0,0,3632,450,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d161Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,30976000,225,1,1280,256,11520,0,3632,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1345,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1947.3167913335053,7399.154871679133,0.9213549195913887,0.9998857934701532,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +163,Conv2d-GroupNorm163,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm163XnormGroupNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm163XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +164,Conv2d163Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d163Conv2dconv2d,MXU,1,Memory,3877,3632,3877,0,0,0,0,0,0,0,0,3632,450,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d163Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 
16]]",1,30976000,225,1,1280,256,11520,0,3632,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1345,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1947.3167913335053,7399.154871679133,0.9213549195913887,0.9998857934701532,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +165,SpatialTransformer-Input_GroupNorm165,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm165XnormGroupNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm165XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +166,SpatialTransformer-Proj_in166,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin166einsum,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin166einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+167,BasicTransformerBlock-Input_layernorm167,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm167XnormLayerNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm167XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +168,SelfAttention168-Q-168,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention168Q168MatMulQ,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention168Q168MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +168,SelfAttention168-K-168,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention168K168MatMulK,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention168K168MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +168,SelfAttention168-V-168,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention168V168MatMulV,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention168V168MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +169,SelfAttention168-FlashAttention-169,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention168FlashAttention169FlashAttention,MXU,1,Memory,21445,320,21445,0,0,0,0,0,0,0,0,320,160,0,0,170393600,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention168FlashAttention169FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 256]",,170393600,16,8,256,256,160,128,320,170393600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2501,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.8801290743763115,7399.925681977151,0.0004164249269358608,0.9999899570239393,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+170,SelfAttention168-Attention_output-170,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention168Attentionoutput170MatMulattnOutputattnAvgWo,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention168Attentionoutput170MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +171,SelfAttention168-Attention_layernorm-171,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention168Attentionlayernorm171YnormLayerNormy,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention168Attentionlayernorm171YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +172,GatedSelfAttention-Linear172,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear172XLinearcontext,MXU,1,Memory,500,272,500,0,0,0,0,0,0,0,0,272,30,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear172XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 
1280]]",1,1998848,15,1,8,1280,768,0,272,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,124,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.45728,3723.14453125,0.014883720930232559,0.5031276393581081,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +173,GatedSelfAttention-Attn172-Q-173,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172Q173MatMulQ,MXU,1,Compute,832,832,583,0,0,0,0,0,0,0,0,832,100,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn172Q173MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,4628480,50,1,264,1280,1280,0,832,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,273,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1039.753846153846,5181.019122783954,0.49194991055456166,0.7001377192951289,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +173,GatedSelfAttention-Attn172-K-173,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172K173MatMulK,MXU,1,Compute,832,832,583,0,0,0,0,0,0,0,0,832,100,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn172K173MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,4628480,50,1,264,1280,1280,0,832,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,273,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1039.753846153846,5181.019122783954,0.49194991055456166,0.7001377192951289,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+173,GatedSelfAttention-Attn172-V-173,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172V173MatMulV,MXU,1,Compute,832,832,583,0,0,0,0,0,0,0,0,832,100,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn172V173MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,4628480,50,1,264,1280,1280,0,832,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,273,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1039.753846153846,5181.019122783954,0.49194991055456166,0.7001377192951289,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +174,GatedSelfAttention-Attn172-FlashAttention-174,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn172FlashAttention174FlashAttention,MXU,1,Memory,22796,1088,22796,0,0,0,0,0,0,0,0,1088,264,0,0,181125120,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn172FlashAttention174FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 264]",,181125120,64,8,264,264,160,136,1088,181125120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2846,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.8805250043867345,7399.80317099217,0.00041661225755640525,0.9999734014854283,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+175,GatedSelfAttention-Attn172-Attention_output-175,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172Attentionoutput175MatMulattnOutputattnAvgWo,MXU,1,Compute,832,832,583,0,0,0,0,0,0,0,0,832,100,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn172Attentionoutput175MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,4628480,50,1,264,1280,1280,0,832,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,273,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1039.753846153846,5181.019122783954,0.49194991055456166,0.7001377192951289,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +176,GatedSelfAttention-Attn172-Attention_layernorm-176,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn172Attentionlayernorm176YnormLayerNormy,VPU,1,Memory,500,165,500,0,0,0,0,0,0,0,0,0,165,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn172Attentionlayernorm176YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,165,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,97,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.40672,2517.7001953125,0.33,0.3402297561233108,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +177,GatedSelfAttention-FFN172Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN172FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,2062,1632,2062,0,0,0,0,0,0,0,0,1632,200,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN172FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 
256, 5120]]",1,16384000,100,1,256,5120,1280,0,1632,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,640,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1627.2760426770126,7399.994695683802,0.7699306009819622,0.9999992832005138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +178,GatedSelfAttention-FFN172Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN172FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,2062,1632,2062,0,0,0,0,0,0,0,0,1632,200,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN172FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,6946816,100,1,256,1280,5120,0,1632,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,640,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1627.2760426770126,7399.994695683802,0.7699306009819622,0.9999992832005138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +179,GatedSelfAttention-FFN172Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN172FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,20,500,0,0,0,0,0,0,0,0,0,20,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN172FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,20,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.65536,2441.40625,0.04,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+180,BasicTransformerBlock-Fuser_output_layernorm180,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm180XnormLayerNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm180XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +181,CrossAttention181-Q-181,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention181Q181MatMulQ,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention181Q181MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +181,CrossAttention181-K-181,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention181K181MatMulK,MXU,1,Compute,512,512,512,0,0,0,0,0,0,0,0,512,60,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention181K181MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 
160]]",1,4063232,30,1,512,1280,768,0,512,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1966.08,7390.9759521484375,0.9302325581395349,0.9987805340741132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +181,CrossAttention181-V-181,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention181V181MatMulV,MXU,1,Compute,512,512,512,0,0,0,0,0,0,0,0,512,60,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention181V181MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,30,1,512,1280,768,0,512,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1966.08,7390.9759521484375,0.9302325581395349,0.9987805340741132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +182,CrossAttention181-FlashAttention-182,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention181FlashAttention182FlashAttention,MXU,1,Compute,576,576,500,0,0,0,0,0,0,0,0,576,320,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention181FlashAttention182FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 256]",,933888,32,8,512,256,160,256,576,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,65.536,6357.828776041667,0.031007751937984496,0.8591660508164415,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+183,CrossAttention181-Attention_output-183,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention181Attentionoutput183MatMulattnOutputattnAvgWo,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention181Attentionoutput183MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +184,CrossAttention181-Attention_layernorm-184,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention181Attentionlayernorm184YnormLayerNormy,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention181Attentionlayernorm184YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+185,BasicTransformerBlock-Attn_output_layernorm185,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm185XnormLayerNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm185XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +186,BasicTransformerBlock-FFN186Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN186FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,2062,1632,2062,0,0,0,0,0,0,0,0,1632,200,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN186FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,16384000,100,1,256,5120,1280,0,1632,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,640,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1627.2760426770126,7399.994695683802,0.7699306009819622,0.9999992832005138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +187,BasicTransformerBlock-FFN186Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN186FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,2062,1632,2062,0,0,0,0,0,0,0,0,1632,200,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN186FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 
1280]]",1,6946816,100,1,256,1280,5120,0,1632,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,640,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1627.2760426770126,7399.994695683802,0.7699306009819622,0.9999992832005138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +188,BasicTransformerBlock-FFN186Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN186FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,20,500,0,0,0,0,0,0,0,0,0,20,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN186FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,20,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.65536,2441.40625,0.04,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +189,SpatialTransformer-Proj_out189,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout189einsum,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout189einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +190,Downsample-Conv2d190Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=2x2 
pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",DownsampleConv2d190Conv2dconv2d,MXU,1,Memory,3815,3632,3815,0,0,0,0,0,0,0,0,3632,450,0,0,30310400,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,DownsampleConv2d190Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,30484480,225,1,1280,64,11520,0,3632,30310400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1338,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,494.74096985583225,7399.412782601573,0.2340821116157152,0.9999206462975099,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +191,Time-Embed-MLP-Einsum191,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum191einsum,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum191einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,3281920,25,1,1,1280,1280,0,432,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,6.5536,6113.0523681640625,0.0031007751937984496,0.8260881578600084,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +192,Conv2d-GroupNorm192,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm192XnormGroupNormX,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm192XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,40,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.08,0.08247994087837837,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+193,Conv2d192Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d192Conv2dconv2d,MXU,1,Memory,3753,3632,3753,0,0,0,0,0,0,0,0,3632,450,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d192Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,29911040,225,1,1280,64,11520,0,3632,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,502.9141486810551,7399.679214961364,0.23794917554328626,0.9999566506704546,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +194,Conv2d-GroupNorm194,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm194XnormGroupNormX,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm194XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,40,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.08,0.08247994087837837,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +195,Conv2d194Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d194Conv2dconv2d,MXU,1,Memory,3753,3632,3753,0,0,0,0,0,0,0,0,3632,450,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d194Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 
8]]",1,29911040,225,1,1280,64,11520,0,3632,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,502.9141486810551,7399.679214961364,0.23794917554328626,0.9999566506704546,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +196,Time-Embed-MLP-Einsum196,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum196einsum,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum196einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,3281920,25,1,1,1280,1280,0,432,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,6.5536,6113.0523681640625,0.0031007751937984496,0.8260881578600084,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +197,Conv2d-GroupNorm197,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm197XnormGroupNormX,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm197XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,40,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.08,0.08247994087837837,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +198,Conv2d197Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d197Conv2dconv2d,MXU,1,Memory,3753,3632,3753,0,0,0,0,0,0,0,0,3632,450,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d197Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 
1280, 3, 3], [1, 1280, 8, 8]]",1,29911040,225,1,1280,64,11520,0,3632,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,502.9141486810551,7399.679214961364,0.23794917554328626,0.9999566506704546,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +199,Conv2d-GroupNorm199,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm199XnormGroupNormX,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm199XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,40,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.08,0.08247994087837837,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +200,Conv2d199Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d199Conv2dconv2d,MXU,1,Memory,3753,3632,3753,0,0,0,0,0,0,0,0,3632,450,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d199Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,29911040,225,1,1280,64,11520,0,3632,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,502.9141486810551,7399.679214961364,0.23794917554328626,0.9999566506704546,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+201,Time-Embed-MLP-Einsum201,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum201einsum,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum201einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,3281920,25,1,1,1280,1280,0,432,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,6.5536,6113.0523681640625,0.0031007751937984496,0.8260881578600084,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +202,Conv2d-GroupNorm202,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm202XnormGroupNormX,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm202XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,40,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.08,0.08247994087837837,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +203,Conv2d202Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d202Conv2dconv2d,MXU,1,Memory,3753,3632,3753,0,0,0,0,0,0,0,0,3632,450,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d202Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 
8]]",1,29911040,225,1,1280,64,11520,0,3632,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,502.9141486810551,7399.679214961364,0.23794917554328626,0.9999566506704546,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +204,Conv2d-GroupNorm204,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm204XnormGroupNormX,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm204XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,40,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.08,0.08247994087837837,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +205,Conv2d204Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d204Conv2dconv2d,MXU,1,Memory,3753,3632,3753,0,0,0,0,0,0,0,0,3632,450,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d204Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,29911040,225,1,1280,64,11520,0,3632,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,502.9141486810551,7399.679214961364,0.23794917554328626,0.9999566506704546,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+206,SpatialTransformer-Input_GroupNorm206,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm206XnormGroupNormX,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,SpatialTransformerInputGroupNorm206XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,40,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.08,0.08247994087837837,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +207,SpatialTransformer-Proj_in207,"XlaEinsum(a=1x64x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin207einsum,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,64,1280)]",209715200,SpatialTransformerProjin207einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 64, 1280], [1280, 1280], [1, 64, 1280]]",1,3604480,25,1,64,1280,1280,0,432,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,419.4304,6713.8671875,0.19844961240310077,0.9072793496621622,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +208,BasicTransformerBlock-Input_layernorm208,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm208XnormLayerNormX,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,BasicTransformerBlockInputlayernorm208XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,40,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.08,0.08247994087837837,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+209,SelfAttention209-Q-209,"XlaEinsum(a=1x64x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention209Q209MatMulQ,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,64,8,160)]",209715200,SelfAttention209Q209MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 64, 1280], [1280, 8, 160], [1, 64, 8, 160]]",1,3604480,25,1,64,1280,1280,0,432,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,419.4304,6713.8671875,0.19844961240310077,0.9072793496621622,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +209,SelfAttention209-K-209,"XlaEinsum(a=1x64x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention209K209MatMulK,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,64,8,160)]",209715200,SelfAttention209K209MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 64, 1280], [1280, 8, 160], [1, 64, 8, 160]]",1,3604480,25,1,64,1280,1280,0,432,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,419.4304,6713.8671875,0.19844961240310077,0.9072793496621622,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +209,SelfAttention209-V-209,"XlaEinsum(a=1x64x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention209V209MatMulV,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,64,8,160)]",209715200,SelfAttention209V209MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 64, 1280], [1280, 8, 160], [1, 64, 8, 
160]]",1,3604480,25,1,64,1280,1280,0,432,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,419.4304,6713.8671875,0.19844961240310077,0.9072793496621622,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +210,SelfAttention209-FlashAttention-210,"FlashAttention(q=1x64x8x160,k=1x64x8x160,v=1x64x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention209FlashAttention210FlashAttention,MXU,1,Memory,1403,320,1403,0,0,0,0,0,0,0,0,320,40,0,0,11141120,"DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[1,64,8,160]","[DT_BFLOAT16:(1,64,8,8)]",1179648,SelfAttention209FlashAttention210FlashAttention,FlashAttention,0,[],FlashAttention,,"[64, 64]",,11141120,16,8,64,64,160,8,320,11141120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,238,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.8408039914468995,7395.56419280114,0.0003978186278572494,0.9994005665947486,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +211,SelfAttention209-Attention_output-211,"XlaEinsum(a=1x64x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention209Attentionoutput211MatMulattnOutputattnAvgWo,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3604480,"DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,64,1280)]",209715200,SelfAttention209Attentionoutput211MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 64, 8, 160], [8, 160, 1280], [1, 64, 1280]]",1,3604480,25,1,64,1280,1280,0,432,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,419.4304,6713.8671875,0.19844961240310077,0.9072793496621622,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+212,SelfAttention209-Attention_layernorm-212,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention209Attentionlayernorm212YnormLayerNormy,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,SelfAttention209Attentionlayernorm212YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,40,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.08,0.08247994087837837,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +213,GatedSelfAttention-Linear213,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear213XLinearcontext,MXU,1,Memory,500,272,500,0,0,0,0,0,0,0,0,272,30,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear213XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 1280]]",1,1998848,15,1,8,1280,768,0,272,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,124,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.45728,3723.14453125,0.014883720930232559,0.5031276393581081,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +214,GatedSelfAttention-Attn213-Q-214,"XlaEinsum(a=1x72x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213Q214MatMulQ,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3645440,"DT_BFLOAT16:[1,72,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,72,8,160)]",235929600,GatedSelfAttentionAttn213Q214MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 72, 1280], [1280, 8, 160], [1, 72, 8, 
160]]",1,3645440,25,1,72,1280,1280,0,432,3645440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,471.8592,6790.1611328125,0.22325581395348837,0.9175893422719594,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +214,GatedSelfAttention-Attn213-K-214,"XlaEinsum(a=1x72x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213K214MatMulK,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3645440,"DT_BFLOAT16:[1,72,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,72,8,160)]",235929600,GatedSelfAttentionAttn213K214MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 72, 1280], [1280, 8, 160], [1, 72, 8, 160]]",1,3645440,25,1,72,1280,1280,0,432,3645440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,471.8592,6790.1611328125,0.22325581395348837,0.9175893422719594,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +214,GatedSelfAttention-Attn213-V-214,"XlaEinsum(a=1x72x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213V214MatMulV,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3645440,"DT_BFLOAT16:[1,72,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,72,8,160)]",235929600,GatedSelfAttentionAttn213V214MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 72, 1280], [1280, 8, 160], [1, 72, 8, 160]]",1,3645440,25,1,72,1280,1280,0,432,3645440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,471.8592,6790.1611328125,0.22325581395348837,0.9175893422719594,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +215,GatedSelfAttention-Attn213-FlashAttention-215,"FlashAttention(q=1x72x8x160,k=1x72x8x160,v=1x72x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",GatedSelfAttentionAttn213FlashAttention215FlashAttention,MXU,1,Memory,1764,320,1764,0,0,0,0,0,0,0,0,320,42,0,0,14008320,"DT_BFLOAT16:[1,72,8,160],DT_BFLOAT16:[1,72,8,160],DT_BFLOAT16:[1,72,8,160]","[DT_BFLOAT16:(1,72,8,8)]",1492992,GatedSelfAttentionAttn213FlashAttention215FlashAttention,FlashAttention,0,[],FlashAttention,,"[72, 72]",,14008320,16,8,72,72,160,10,320,14008320,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,279,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.8463673469387755,7395.841637436224,0.0004004508780256288,0.9994380591130032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +216,GatedSelfAttention-Attn213-Attention_output-216,"XlaEinsum(a=1x72x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213Attentionoutput216MatMulattnOutputattnAvgWo,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3645440,"DT_BFLOAT16:[1,72,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,72,1280)]",235929600,GatedSelfAttentionAttn213Attentionoutput216MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 72, 8, 160], [8, 160, 1280], [1, 72, 1280]]",1,3645440,25,1,72,1280,1280,0,432,3645440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,471.8592,6790.1611328125,0.22325581395348837,0.9175893422719594,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+217,GatedSelfAttention-Attn213-Attention_layernorm-217,"LayerNorm(x=1x72x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn213Attentionlayernorm217YnormLayerNormy,VPU,1,Memory,500,45,500,0,0,0,0,0,0,0,0,0,45,0,0,368640,"DT_BFLOAT16:[1,72,1280]","[DT_BFLOAT16:(1,72,1280)]",737280,GatedSelfAttentionAttn213Attentionlayernorm217YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,45,368640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,67,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.47456,686.6455078125,0.09000000000000001,0.09278993348817567,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +218,GatedSelfAttention-FFN213Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x64x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN213FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,1753,1632,1753,0,0,0,0,0,0,0,0,1632,200,0,0,13926400,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,64,5120)]",838860800,GatedSelfAttentionFFN213FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 64, 1280], [1280, 5120], [1, 64, 5120]]",1,13926400,100,1,64,5120,1280,0,1632,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,605,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,478.5286936679977,7398.728296135197,0.22641142316383428,0.9998281481263779,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +219,GatedSelfAttention-FFN213Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x64x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN213FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,1753,1632,1753,0,0,0,0,0,0,0,0,1632,200,0,0,13926400,"DT_BFLOAT16:[1,64,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,64,1280)]",838860800,GatedSelfAttentionFFN213FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 64, 5120], [5120, 1280], [1, 64, 
1280]]",1,5668864,100,1,64,1280,5120,0,1632,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,605,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,478.5286936679977,7398.728296135197,0.22641142316383428,0.9998281481263779,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +220,GatedSelfAttention-FFN213Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x64x1280,b=1x64x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN213FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",81920,GatedSelfAttentionFFN213FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,5,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,57,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16384,610.3515625,0.01,0.08247994087837837,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +221,BasicTransformerBlock-Fuser_output_layernorm221,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm221XnormLayerNormX,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,BasicTransformerBlockFuseroutputlayernorm221XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,40,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.08,0.08247994087837837,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +222,CrossAttention222-Q-222,"XlaEinsum(a=1x64x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention222Q222MatMulQ,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,64,8,160)]",209715200,CrossAttention222Q222MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 64, 1280], 
[1280, 8, 160], [1, 64, 8, 160]]",1,3604480,25,1,64,1280,1280,0,432,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,419.4304,6713.8671875,0.19844961240310077,0.9072793496621622,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +222,CrossAttention222-K-222,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention222K222MatMulK,MXU,1,Compute,512,512,512,0,0,0,0,0,0,0,0,512,60,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention222K222MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,30,1,512,1280,768,0,512,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1966.08,7390.9759521484375,0.9302325581395349,0.9987805340741132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +222,CrossAttention222-V-222,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention222V222MatMulV,MXU,1,Compute,512,512,512,0,0,0,0,0,0,0,0,512,60,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention222V222MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,30,1,512,1280,768,0,512,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1966.08,7390.9759521484375,0.9302325581395349,0.9987805340741132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +223,CrossAttention222-FlashAttention-223,"FlashAttention(q=1x64x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",CrossAttention222FlashAttention223FlashAttention,MXU,1,Memory,10929,576,10929,0,0,0,0,0,0,0,0,576,128,0,0,86835200,"DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,64,8,8)]",9437184,CrossAttention222FlashAttention223FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 64]",,86835200,32,8,512,64,160,64,576,86835200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1378,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.8634993137524019,7399.723856825876,0.00040855670958640013,0.9999626833548482,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +224,CrossAttention222-Attention_output-224,"XlaEinsum(a=1x64x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention222Attentionoutput224MatMulattnOutputattnAvgWo,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3604480,"DT_BFLOAT16:[1,64,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,64,1280)]",209715200,CrossAttention222Attentionoutput224MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 64, 8, 160], [8, 160, 1280], [1, 64, 1280]]",1,3604480,25,1,64,1280,1280,0,432,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,419.4304,6713.8671875,0.19844961240310077,0.9072793496621622,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+225,CrossAttention222-Attention_layernorm-225,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention222Attentionlayernorm225YnormLayerNormy,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,CrossAttention222Attentionlayernorm225YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,40,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.08,0.08247994087837837,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +226,BasicTransformerBlock-Attn_output_layernorm226,"LayerNorm(x=1x64x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm226XnormLayerNormX,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",655360,BasicTransformerBlockAttnoutputlayernorm226XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,40,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.08,0.08247994087837837,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +227,BasicTransformerBlock-FFN227Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x64x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN227FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,1753,1632,1753,0,0,0,0,0,0,0,0,1632,200,0,0,13926400,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,64,5120)]",838860800,BasicTransformerBlockFFN227FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 64, 1280], [1280, 5120], [1, 64, 
5120]]",1,13926400,100,1,64,5120,1280,0,1632,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,605,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,478.5286936679977,7398.728296135197,0.22641142316383428,0.9998281481263779,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +228,BasicTransformerBlock-FFN227Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x64x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN227FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,1753,1632,1753,0,0,0,0,0,0,0,0,1632,200,0,0,13926400,"DT_BFLOAT16:[1,64,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,64,1280)]",838860800,BasicTransformerBlockFFN227FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 64, 5120], [5120, 1280], [1, 64, 1280]]",1,5668864,100,1,64,1280,5120,0,1632,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,605,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,478.5286936679977,7398.728296135197,0.22641142316383428,0.9998281481263779,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +229,BasicTransformerBlock-FFN227Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x64x1280,b=1x64x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN227FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,327680,"DT_BFLOAT16:[1,64,1280]","[DT_BFLOAT16:(1,64,1280)]",81920,BasicTransformerBlockFFN227FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,5,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,57,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16384,610.3515625,0.01,0.08247994087837837,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+230,SpatialTransformer-Proj_out230,"XlaEinsum(a=1x64x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout230einsum,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3604480,"DT_BFLOAT16:[1,64,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,64,1280)]",209715200,SpatialTransformerProjout230einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 64, 1280], [1280, 1280], [1, 64, 1280]]",1,3604480,25,1,64,1280,1280,0,432,3604480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,419.4304,6713.8671875,0.19844961240310077,0.9072793496621622,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +231,Time-Embed-MLP-Einsum231,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum231einsum,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum231einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,3281920,25,1,1,1280,1280,0,432,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,6.5536,6113.0523681640625,0.0031007751937984496,0.8260881578600084,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +232,Conv2d-GroupNorm232,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm232XnormGroupNormX,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm232XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,40,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.08,0.08247994087837837,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+233,Conv2d232Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d232Conv2dconv2d,MXU,1,Memory,3753,3632,3753,0,0,0,0,0,0,0,0,3632,450,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d232Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,29911040,225,1,1280,64,11520,0,3632,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,502.9141486810551,7399.679214961364,0.23794917554328626,0.9999566506704546,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +234,Conv2d-GroupNorm234,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm234XnormGroupNormX,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm234XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,40,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.08,0.08247994087837837,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +235,Conv2d234Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d234Conv2dconv2d,MXU,1,Memory,3753,3632,3753,0,0,0,0,0,0,0,0,3632,450,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d234Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 
8]]",1,29911040,225,1,1280,64,11520,0,3632,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,502.9141486810551,7399.679214961364,0.23794917554328626,0.9999566506704546,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +236,Time-Embed-MLP-Einsum236,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum236einsum,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum236einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,3281920,25,1,1,1280,1280,0,432,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,6.5536,6113.0523681640625,0.0031007751937984496,0.8260881578600084,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +237,Conv2d-GroupNorm237,"GroupNorm(x=1x2560x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm237XnormGroupNormX,VPU,1,Memory,500,80,500,0,0,0,0,0,0,0,0,0,80,0,0,655360,"DT_BFLOAT16:[1,2560,8,8]","[DT_BFLOAT16:(1,2560,8,8)]",1310720,Conv2dGroupNorm237XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,80,655360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,76,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.62144,1220.703125,0.16,0.16495988175675674,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +238,Conv2d237Conv2d,"Conv2D(a=1x2560x8x8,b=2560x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d237Conv2dconv2d,MXU,1,Memory,7486,7232,7486,0,0,0,0,0,0,0,0,7232,900,0,0,59473920,"DT_BFLOAT16:[1,2560,8,8],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",3774873600,Conv2d237Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 2560, 8, 8], [2560, 
1280, 3, 3], [1, 1280, 8, 8]]",1,47759360,450,1,1280,64,23040,0,7232,59473920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2653,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,504.2577611541544,7399.065495174325,0.23858489335131003,0.999873715564098,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +239,Conv2d-GroupNorm239,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm239XnormGroupNormX,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm239XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,40,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.08,0.08247994087837837,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +240,Conv2d239Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d239Conv2dconv2d,MXU,1,Memory,3753,3632,3753,0,0,0,0,0,0,0,0,3632,450,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d239Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,29911040,225,1,1280,64,11520,0,3632,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,502.9141486810551,7399.679214961364,0.23794917554328626,0.9999566506704546,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+241,SkipConnection-Einsum236,"XlaEinsum(a=1x8x8x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum236einsum,MXU,1,Memory,887,832,887,0,0,0,0,0,0,0,0,832,100,0,0,7045120,"DT_BFLOAT16:[1,8,8,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,8,8,1280)]",419430400,SkipConnectionEinsum236einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 8, 8, 2560], [2560, 1280], [1, 8, 8, 1280]]",1,5668864,50,1,64,1280,2560,0,832,7045120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,308,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,472.86403607666296,7397.158170095829,0.22373124284453302,0.9996159689318688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +242,Time-Embed-MLP-Einsum242,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum242einsum,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum242einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,3281920,25,1,1,1280,1280,0,432,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,6.5536,6113.0523681640625,0.0031007751937984496,0.8260881578600084,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +243,Conv2d-GroupNorm243,"GroupNorm(x=1x2560x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm243XnormGroupNormX,VPU,1,Memory,500,80,500,0,0,0,0,0,0,0,0,0,80,0,0,655360,"DT_BFLOAT16:[1,2560,8,8]","[DT_BFLOAT16:(1,2560,8,8)]",1310720,Conv2dGroupNorm243XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,80,655360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,76,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.62144,1220.703125,0.16,0.16495988175675674,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+244,Conv2d243Conv2d,"Conv2D(a=1x2560x8x8,b=2560x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d243Conv2dconv2d,MXU,1,Memory,7486,7232,7486,0,0,0,0,0,0,0,0,7232,900,0,0,59473920,"DT_BFLOAT16:[1,2560,8,8],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",3774873600,Conv2d243Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 2560, 8, 8], [2560, 1280, 3, 3], [1, 1280, 8, 8]]",1,47759360,450,1,1280,64,23040,0,7232,59473920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2653,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,504.2577611541544,7399.065495174325,0.23858489335131003,0.999873715564098,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +245,Conv2d-GroupNorm245,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm245XnormGroupNormX,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm245XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,40,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.08,0.08247994087837837,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +246,Conv2d245Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d245Conv2dconv2d,MXU,1,Memory,3753,3632,3753,0,0,0,0,0,0,0,0,3632,450,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d245Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 
8]]",1,29911040,225,1,1280,64,11520,0,3632,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,502.9141486810551,7399.679214961364,0.23794917554328626,0.9999566506704546,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +247,SkipConnection-Einsum242,"XlaEinsum(a=1x8x8x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum242einsum,MXU,1,Memory,887,832,887,0,0,0,0,0,0,0,0,832,100,0,0,7045120,"DT_BFLOAT16:[1,8,8,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,8,8,1280)]",419430400,SkipConnectionEinsum242einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 8, 8, 2560], [2560, 1280], [1, 8, 8, 1280]]",1,5668864,50,1,64,1280,2560,0,832,7045120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,308,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,472.86403607666296,7397.158170095829,0.22373124284453302,0.9996159689318688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +248,Time-Embed-MLP-Einsum248,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum248einsum,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum248einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,3281920,25,1,1,1280,1280,0,432,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,6.5536,6113.0523681640625,0.0031007751937984496,0.8260881578600084,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+249,Conv2d-GroupNorm249,"GroupNorm(x=1x2560x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm249XnormGroupNormX,VPU,1,Memory,500,80,500,0,0,0,0,0,0,0,0,0,80,0,0,655360,"DT_BFLOAT16:[1,2560,8,8]","[DT_BFLOAT16:(1,2560,8,8)]",1310720,Conv2dGroupNorm249XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,80,655360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,76,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.62144,1220.703125,0.16,0.16495988175675674,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +250,Conv2d249Conv2d,"Conv2D(a=1x2560x8x8,b=2560x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d249Conv2dconv2d,MXU,1,Memory,7486,7232,7486,0,0,0,0,0,0,0,0,7232,900,0,0,59473920,"DT_BFLOAT16:[1,2560,8,8],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",3774873600,Conv2d249Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 2560, 8, 8], [2560, 1280, 3, 3], [1, 1280, 8, 8]]",1,47759360,450,1,1280,64,23040,0,7232,59473920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2653,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,504.2577611541544,7399.065495174325,0.23858489335131003,0.999873715564098,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +251,Conv2d-GroupNorm251,"GroupNorm(x=1x1280x8x8,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm251XnormGroupNormX,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,327680,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,8,8)]",655360,Conv2dGroupNorm251XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,40,327680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,610.3515625,0.08,0.08247994087837837,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+252,Conv2d251Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d251Conv2dconv2d,MXU,1,Memory,3753,3632,3753,0,0,0,0,0,0,0,0,3632,450,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,Conv2d251Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,29911040,225,1,1280,64,11520,0,3632,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,502.9141486810551,7399.679214961364,0.23794917554328626,0.9999566506704546,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +253,SkipConnection-Einsum248,"XlaEinsum(a=1x8x8x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum248einsum,MXU,1,Memory,887,832,887,0,0,0,0,0,0,0,0,832,100,0,0,7045120,"DT_BFLOAT16:[1,8,8,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,8,8,1280)]",419430400,SkipConnectionEinsum248einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 8, 8, 2560], [2560, 1280], [1, 8, 8, 1280]]",1,5668864,50,1,64,1280,2560,0,832,7045120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,308,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,472.86403607666296,7397.158170095829,0.22373124284453302,0.9996159689318688,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+254,Upsample254,"Upsample(a=1x1280x8x8,scale_factor=2,memory_placements=0_0_0,type=DT_BFLOAT16)",Upsample254Upsample,VPU,1,Memory,500,0,500,0,0,0,0,0,0,0,0,0,0,0,0,819200,"DT_BFLOAT16:[1,1280,8,8]","[DT_BFLOAT16:(1,1280,16,16)]",0,Upsample254Upsample,Upsample,0,[],Upsample,,,,,0,,,,,0,0,819200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,1525.87890625,0.0,0.20619985219594594,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +254,Upsample-Conv2d254Conv2d,"Conv2D(a=1x1280x8x8,b=1280x1280x3x3,c=1x1280x8x8,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",UpsampleConv2d254Conv2dconv2d,MXU,1,Memory,3753,3632,3753,0,0,0,0,0,0,0,0,3632,450,0,0,29818880,"DT_BFLOAT16:[1,1280,8,8],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,8,8)]",1887436800,UpsampleConv2d254Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 8, 8], [1280, 1280, 3, 3], [1, 1280, 8, 8]]",1,29911040,225,1,1280,64,11520,0,3632,29818880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1331,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,502.9141486810551,7399.679214961364,0.23794917554328626,0.9999566506704546,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +255,Time-Embed-MLP-Einsum255,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum255einsum,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum255einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 
1280]]",1,3281920,25,1,1,1280,1280,0,432,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,6.5536,6113.0523681640625,0.0031007751937984496,0.8260881578600084,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +256,Conv2d-GroupNorm256,"GroupNorm(x=1x2560x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm256XnormGroupNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,2560,16,16]","[DT_BFLOAT16:(1,2560,16,16)]",5242880,Conv2dGroupNorm256XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +257,Conv2d256Conv2d,"Conv2D(a=1x2560x16x16,b=2560x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d256Conv2dconv2d,MXU,1,Memory,7671,7232,7671,0,0,0,0,0,0,0,0,7232,900,0,0,60948480,"DT_BFLOAT16:[1,2560,16,16],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",15099494400,Conv2d256Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 2560, 16, 16], [2560, 1280, 3, 3], [1, 1280, 16, 16]]",1,49168384,450,1,1280,256,23040,0,7232,60948480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2674,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1968.3867031677748,7399.647413961674,0.9313239533974225,0.9999523532380641,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+258,Conv2d-GroupNorm258,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm258XnormGroupNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm258XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +259,Conv2d258Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d258Conv2dconv2d,MXU,1,Memory,3877,3632,3877,0,0,0,0,0,0,0,0,3632,450,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d258Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,30976000,225,1,1280,256,11520,0,3632,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1345,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1947.3167913335053,7399.154871679133,0.9213549195913887,0.9998857934701532,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +260,SkipConnection-Einsum255,"XlaEinsum(a=1x16x16x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum255einsum,MXU,1,Memory,1073,832,1073,0,0,0,0,0,0,0,0,832,100,0,0,8519680,"DT_BFLOAT16:[1,16,16,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,16,16,1280)]",1677721600,SkipConnectionEinsum255einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 16, 16, 2560], [2560, 1280], [1, 16, 16, 
1280]]",1,6946816,50,1,256,1280,2560,0,832,8519680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,329,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1563.5802423112768,7394.753320130475,0.7397935224719507,0.9992909892068209,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +261,SpatialTransformer-Input_GroupNorm261,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm261XnormGroupNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm261XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +262,SpatialTransformer-Proj_in262,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin262einsum,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin262einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+263,BasicTransformerBlock-Input_layernorm263,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm263XnormLayerNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm263XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +264,SelfAttention264-Q-264,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention264Q264MatMulQ,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention264Q264MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +264,SelfAttention264-K-264,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention264K264MatMulK,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention264K264MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 
160]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +264,SelfAttention264-V-264,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention264V264MatMulV,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention264V264MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +265,SelfAttention264-FlashAttention-265,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention264FlashAttention265FlashAttention,MXU,1,Memory,21445,320,21445,0,0,0,0,0,0,0,0,320,160,0,0,170393600,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention264FlashAttention265FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 256]",,170393600,16,8,256,256,160,128,320,170393600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2501,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.8801290743763115,7399.925681977151,0.0004164249269358608,0.9999899570239393,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+266,SelfAttention264-Attention_output-266,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention264Attentionoutput266MatMulattnOutputattnAvgWo,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention264Attentionoutput266MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +267,SelfAttention264-Attention_layernorm-267,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention264Attentionlayernorm267YnormLayerNormy,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention264Attentionlayernorm267YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +268,GatedSelfAttention-Linear268,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear268XLinearcontext,MXU,1,Memory,500,272,500,0,0,0,0,0,0,0,0,272,30,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear268XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 
1280]]",1,1998848,15,1,8,1280,768,0,272,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,124,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.45728,3723.14453125,0.014883720930232559,0.5031276393581081,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +269,GatedSelfAttention-Attn268-Q-269,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268Q269MatMulQ,MXU,1,Compute,832,832,583,0,0,0,0,0,0,0,0,832,100,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn268Q269MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,4628480,50,1,264,1280,1280,0,832,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,273,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1039.753846153846,5181.019122783954,0.49194991055456166,0.7001377192951289,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +269,GatedSelfAttention-Attn268-K-269,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268K269MatMulK,MXU,1,Compute,832,832,583,0,0,0,0,0,0,0,0,832,100,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn268K269MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,4628480,50,1,264,1280,1280,0,832,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,273,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1039.753846153846,5181.019122783954,0.49194991055456166,0.7001377192951289,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+269,GatedSelfAttention-Attn268-V-269,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268V269MatMulV,MXU,1,Compute,832,832,583,0,0,0,0,0,0,0,0,832,100,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn268V269MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,4628480,50,1,264,1280,1280,0,832,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,273,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1039.753846153846,5181.019122783954,0.49194991055456166,0.7001377192951289,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +270,GatedSelfAttention-Attn268-FlashAttention-270,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn268FlashAttention270FlashAttention,MXU,1,Memory,22796,1088,22796,0,0,0,0,0,0,0,0,1088,264,0,0,181125120,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn268FlashAttention270FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 264]",,181125120,64,8,264,264,160,136,1088,181125120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2846,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.8805250043867345,7399.80317099217,0.00041661225755640525,0.9999734014854283,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+271,GatedSelfAttention-Attn268-Attention_output-271,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268Attentionoutput271MatMulattnOutputattnAvgWo,MXU,1,Compute,832,832,583,0,0,0,0,0,0,0,0,832,100,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn268Attentionoutput271MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,4628480,50,1,264,1280,1280,0,832,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,273,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1039.753846153846,5181.019122783954,0.49194991055456166,0.7001377192951289,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +272,GatedSelfAttention-Attn268-Attention_layernorm-272,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn268Attentionlayernorm272YnormLayerNormy,VPU,1,Memory,500,165,500,0,0,0,0,0,0,0,0,0,165,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn268Attentionlayernorm272YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,165,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,97,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.40672,2517.7001953125,0.33,0.3402297561233108,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +273,GatedSelfAttention-FFN268Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN268FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,2062,1632,2062,0,0,0,0,0,0,0,0,1632,200,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN268FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 
256, 5120]]",1,16384000,100,1,256,5120,1280,0,1632,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,640,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1627.2760426770126,7399.994695683802,0.7699306009819622,0.9999992832005138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +274,GatedSelfAttention-FFN268Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN268FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,2062,1632,2062,0,0,0,0,0,0,0,0,1632,200,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN268FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,6946816,100,1,256,1280,5120,0,1632,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,640,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1627.2760426770126,7399.994695683802,0.7699306009819622,0.9999992832005138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +275,GatedSelfAttention-FFN268Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN268FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,20,500,0,0,0,0,0,0,0,0,0,20,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN268FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,20,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.65536,2441.40625,0.04,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+276,BasicTransformerBlock-Fuser_output_layernorm276,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm276XnormLayerNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm276XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +277,CrossAttention277-Q-277,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention277Q277MatMulQ,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention277Q277MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +277,CrossAttention277-K-277,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention277K277MatMulK,MXU,1,Compute,512,512,512,0,0,0,0,0,0,0,0,512,60,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention277K277MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 
160]]",1,4063232,30,1,512,1280,768,0,512,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1966.08,7390.9759521484375,0.9302325581395349,0.9987805340741132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +277,CrossAttention277-V-277,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention277V277MatMulV,MXU,1,Compute,512,512,512,0,0,0,0,0,0,0,0,512,60,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention277V277MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,30,1,512,1280,768,0,512,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1966.08,7390.9759521484375,0.9302325581395349,0.9987805340741132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +278,CrossAttention277-FlashAttention-278,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention277FlashAttention278FlashAttention,MXU,1,Compute,576,576,500,0,0,0,0,0,0,0,0,576,320,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention277FlashAttention278FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 256]",,933888,32,8,512,256,160,256,576,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,65.536,6357.828776041667,0.031007751937984496,0.8591660508164415,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+279,CrossAttention277-Attention_output-279,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention277Attentionoutput279MatMulattnOutputattnAvgWo,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention277Attentionoutput279MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +280,CrossAttention277-Attention_layernorm-280,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention277Attentionlayernorm280YnormLayerNormy,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention277Attentionlayernorm280YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+281,BasicTransformerBlock-Attn_output_layernorm281,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm281XnormLayerNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm281XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +282,BasicTransformerBlock-FFN282Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN282FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,2062,1632,2062,0,0,0,0,0,0,0,0,1632,200,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN282FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,16384000,100,1,256,5120,1280,0,1632,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,640,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1627.2760426770126,7399.994695683802,0.7699306009819622,0.9999992832005138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +283,BasicTransformerBlock-FFN282Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN282FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,2062,1632,2062,0,0,0,0,0,0,0,0,1632,200,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN282FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 
1280]]",1,6946816,100,1,256,1280,5120,0,1632,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,640,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1627.2760426770126,7399.994695683802,0.7699306009819622,0.9999992832005138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +284,BasicTransformerBlock-FFN282Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN282FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,20,500,0,0,0,0,0,0,0,0,0,20,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN282FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,20,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.65536,2441.40625,0.04,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +285,SpatialTransformer-Proj_out285,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout285einsum,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout285einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+286,Time-Embed-MLP-Einsum286,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum286einsum,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum286einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,3281920,25,1,1,1280,1280,0,432,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,6.5536,6113.0523681640625,0.0031007751937984496,0.8260881578600084,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +287,Conv2d-GroupNorm287,"GroupNorm(x=1x2560x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm287XnormGroupNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,2560,16,16]","[DT_BFLOAT16:(1,2560,16,16)]",5242880,Conv2dGroupNorm287XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +288,Conv2d287Conv2d,"Conv2D(a=1x2560x16x16,b=2560x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d287Conv2dconv2d,MXU,1,Memory,7671,7232,7671,0,0,0,0,0,0,0,0,7232,900,0,0,60948480,"DT_BFLOAT16:[1,2560,16,16],DT_BFLOAT16:[2560,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",15099494400,Conv2d287Conv2dconv2d,Conv2D,58982400,[],Conv2D,bf01;io01->bf01,"[[1, 2560, 16, 16], [2560, 1280, 3, 3], [1, 1280, 16, 
16]]",1,49168384,450,1,1280,256,23040,0,7232,60948480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2674,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1968.3867031677748,7399.647413961674,0.9313239533974225,0.9999523532380641,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +289,Conv2d-GroupNorm289,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm289XnormGroupNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm289XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +290,Conv2d289Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d289Conv2dconv2d,MXU,1,Memory,3877,3632,3877,0,0,0,0,0,0,0,0,3632,450,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d289Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,30976000,225,1,1280,256,11520,0,3632,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1345,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1947.3167913335053,7399.154871679133,0.9213549195913887,0.9998857934701532,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+291,SkipConnection-Einsum286,"XlaEinsum(a=1x16x16x2560,b=2560x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum286einsum,MXU,1,Memory,1073,832,1073,0,0,0,0,0,0,0,0,832,100,0,0,8519680,"DT_BFLOAT16:[1,16,16,2560],DT_BFLOAT16:[2560,1280]","[DT_BFLOAT16:(1,16,16,1280)]",1677721600,SkipConnectionEinsum286einsum,Einsum,6553600,[],Einsum,"BHWC,CO->BHWO","[[1, 16, 16, 2560], [2560, 1280], [1, 16, 16, 1280]]",1,6946816,50,1,256,1280,2560,0,832,8519680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,329,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1563.5802423112768,7394.753320130475,0.7397935224719507,0.9992909892068209,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +292,SpatialTransformer-Input_GroupNorm292,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm292XnormGroupNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm292XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +293,SpatialTransformer-Proj_in293,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin293einsum,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin293einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 
1280]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +294,BasicTransformerBlock-Input_layernorm294,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm294XnormLayerNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm294XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +295,SelfAttention295-Q-295,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention295Q295MatMulQ,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention295Q295MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+295,SelfAttention295-K-295,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention295K295MatMulK,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention295K295MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +295,SelfAttention295-V-295,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention295V295MatMulV,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention295V295MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +296,SelfAttention295-FlashAttention-296,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention295FlashAttention296FlashAttention,MXU,1,Memory,21445,320,21445,0,0,0,0,0,0,0,0,320,160,0,0,170393600,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention295FlashAttention296FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 
256]",,170393600,16,8,256,256,160,128,320,170393600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2501,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.8801290743763115,7399.925681977151,0.0004164249269358608,0.9999899570239393,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +297,SelfAttention295-Attention_output-297,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention295Attentionoutput297MatMulattnOutputattnAvgWo,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention295Attentionoutput297MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +298,SelfAttention295-Attention_layernorm-298,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention295Attentionlayernorm298YnormLayerNormy,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention295Attentionlayernorm298YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+299,GatedSelfAttention-Linear299,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear299XLinearcontext,MXU,1,Memory,500,272,500,0,0,0,0,0,0,0,0,272,30,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear299XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 1280]]",1,1998848,15,1,8,1280,768,0,272,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,124,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.45728,3723.14453125,0.014883720930232559,0.5031276393581081,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +300,GatedSelfAttention-Attn299-Q-300,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299Q300MatMulQ,MXU,1,Compute,832,832,583,0,0,0,0,0,0,0,0,832,100,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn299Q300MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,4628480,50,1,264,1280,1280,0,832,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,273,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1039.753846153846,5181.019122783954,0.49194991055456166,0.7001377192951289,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +300,GatedSelfAttention-Attn299-K-300,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299K300MatMulK,MXU,1,Compute,832,832,583,0,0,0,0,0,0,0,0,832,100,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn299K300MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 
160]]",1,4628480,50,1,264,1280,1280,0,832,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,273,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1039.753846153846,5181.019122783954,0.49194991055456166,0.7001377192951289,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +300,GatedSelfAttention-Attn299-V-300,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299V300MatMulV,MXU,1,Compute,832,832,583,0,0,0,0,0,0,0,0,832,100,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn299V300MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,4628480,50,1,264,1280,1280,0,832,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,273,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1039.753846153846,5181.019122783954,0.49194991055456166,0.7001377192951289,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +301,GatedSelfAttention-Attn299-FlashAttention-301,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn299FlashAttention301FlashAttention,MXU,1,Memory,22796,1088,22796,0,0,0,0,0,0,0,0,1088,264,0,0,181125120,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn299FlashAttention301FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 264]",,181125120,64,8,264,264,160,136,1088,181125120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2846,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.8805250043867345,7399.80317099217,0.00041661225755640525,0.9999734014854283,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+302,GatedSelfAttention-Attn299-Attention_output-302,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299Attentionoutput302MatMulattnOutputattnAvgWo,MXU,1,Compute,832,832,583,0,0,0,0,0,0,0,0,832,100,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn299Attentionoutput302MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,4628480,50,1,264,1280,1280,0,832,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,273,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1039.753846153846,5181.019122783954,0.49194991055456166,0.7001377192951289,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +303,GatedSelfAttention-Attn299-Attention_layernorm-303,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn299Attentionlayernorm303YnormLayerNormy,VPU,1,Memory,500,165,500,0,0,0,0,0,0,0,0,0,165,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn299Attentionlayernorm303YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,165,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,97,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.40672,2517.7001953125,0.33,0.3402297561233108,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +304,GatedSelfAttention-FFN299Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN299FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,2062,1632,2062,0,0,0,0,0,0,0,0,1632,200,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN299FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 
256, 5120]]",1,16384000,100,1,256,5120,1280,0,1632,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,640,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1627.2760426770126,7399.994695683802,0.7699306009819622,0.9999992832005138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +305,GatedSelfAttention-FFN299Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN299FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,2062,1632,2062,0,0,0,0,0,0,0,0,1632,200,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN299FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,6946816,100,1,256,1280,5120,0,1632,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,640,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1627.2760426770126,7399.994695683802,0.7699306009819622,0.9999992832005138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +306,GatedSelfAttention-FFN299Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN299FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,20,500,0,0,0,0,0,0,0,0,0,20,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN299FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,20,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.65536,2441.40625,0.04,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+307,BasicTransformerBlock-Fuser_output_layernorm307,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm307XnormLayerNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm307XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +308,CrossAttention308-Q-308,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention308Q308MatMulQ,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention308Q308MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +308,CrossAttention308-K-308,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention308K308MatMulK,MXU,1,Compute,512,512,512,0,0,0,0,0,0,0,0,512,60,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention308K308MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 
160]]",1,4063232,30,1,512,1280,768,0,512,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1966.08,7390.9759521484375,0.9302325581395349,0.9987805340741132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +308,CrossAttention308-V-308,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention308V308MatMulV,MXU,1,Compute,512,512,512,0,0,0,0,0,0,0,0,512,60,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention308V308MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,30,1,512,1280,768,0,512,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1966.08,7390.9759521484375,0.9302325581395349,0.9987805340741132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +309,CrossAttention308-FlashAttention-309,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention308FlashAttention309FlashAttention,MXU,1,Compute,576,576,500,0,0,0,0,0,0,0,0,576,320,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention308FlashAttention309FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 256]",,933888,32,8,512,256,160,256,576,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,65.536,6357.828776041667,0.031007751937984496,0.8591660508164415,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+310,CrossAttention308-Attention_output-310,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention308Attentionoutput310MatMulattnOutputattnAvgWo,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention308Attentionoutput310MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +311,CrossAttention308-Attention_layernorm-311,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention308Attentionlayernorm311YnormLayerNormy,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention308Attentionlayernorm311YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+312,BasicTransformerBlock-Attn_output_layernorm312,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm312XnormLayerNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm312XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +313,BasicTransformerBlock-FFN313Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN313FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,2062,1632,2062,0,0,0,0,0,0,0,0,1632,200,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN313FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,16384000,100,1,256,5120,1280,0,1632,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,640,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1627.2760426770126,7399.994695683802,0.7699306009819622,0.9999992832005138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +314,BasicTransformerBlock-FFN313Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN313FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,2062,1632,2062,0,0,0,0,0,0,0,0,1632,200,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN313FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 
1280]]",1,6946816,100,1,256,1280,5120,0,1632,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,640,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1627.2760426770126,7399.994695683802,0.7699306009819622,0.9999992832005138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +315,BasicTransformerBlock-FFN313Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN313FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,20,500,0,0,0,0,0,0,0,0,0,20,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN313FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,20,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.65536,2441.40625,0.04,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +316,SpatialTransformer-Proj_out316,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout316einsum,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout316einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+317,Time-Embed-MLP-Einsum317,"XlaEinsum(a=1x1280,b=1280x1280,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum317einsum,MXU,1,Memory,500,432,500,0,0,0,0,0,0,0,0,432,50,0,0,3281920,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,1280)]",3276800,TimeEmbedMLPEinsum317einsum,Einsum,3276800,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 1280], [1, 1280]]",1,3281920,25,1,1,1280,1280,0,432,3281920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,6.5536,6113.0523681640625,0.0031007751937984496,0.8260881578600084,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +318,Conv2d-GroupNorm318,"GroupNorm(x=1x1920x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm318XnormGroupNormX,VPU,1,Memory,500,240,500,0,0,0,0,0,0,0,0,0,240,0,0,1966080,"DT_BFLOAT16:[1,1920,16,16]","[DT_BFLOAT16:(1,1920,16,16)]",3932160,Conv2dGroupNorm318XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,240,1966080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,116,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.86432,3662.109375,0.48,0.4948796452702703,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +319,Conv2d318Conv2d,"Conv2D(a=1x1920x16x16,b=1920x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d318Conv2dconv2d,MXU,1,Memory,5774,5472,5774,0,0,0,0,0,0,0,0,5472,680,0,0,45875200,"DT_BFLOAT16:[1,1920,16,16],DT_BFLOAT16:[1920,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",11324620800,Conv2d318Conv2dconv2d,Conv2D,44236800,[],Conv2D,bf01;io01->bf01,"[[1, 1920, 16, 16], [1920, 1280, 3, 3], [1, 1280, 16, 
16]]",1,46136320,340,1,1280,256,17280,0,5472,45875200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2019,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1961.3129199861448,7399.48205316938,0.9279770583449465,0.9999300071850514,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +320,Conv2d-GroupNorm320,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm320XnormGroupNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,Conv2dGroupNorm320XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +321,Conv2d320Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d320Conv2dconv2d,MXU,1,Memory,3877,3632,3877,0,0,0,0,0,0,0,0,3632,450,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,Conv2d320Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,30976000,225,1,1280,256,11520,0,3632,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1345,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1947.3167913335053,7399.154871679133,0.9213549195913887,0.9998857934701532,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+322,SkipConnection-Einsum317,"XlaEinsum(a=1x16x16x1920,b=1920x1280,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum317einsum,MXU,1,Memory,825,672,825,0,0,0,0,0,0,0,0,672,80,0,0,6553600,"DT_BFLOAT16:[1,16,16,1920],DT_BFLOAT16:[1920,1280]","[DT_BFLOAT16:(1,16,16,1280)]",1258291200,SkipConnectionEinsum317einsum,Einsum,4915200,[],Einsum,"BHWC,CO->BHWO","[[1, 16, 16, 1920], [1920, 1280], [1, 16, 16, 1280]]",1,6553600,40,1,256,1280,1920,0,672,6553600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,261,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1525.2014545454545,7398.200757575758,0.7216349541930936,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +323,SpatialTransformer-Input_GroupNorm323,"GroupNorm(x=1x1280x16x16,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm323XnormGroupNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,16,16)]",2621440,SpatialTransformerInputGroupNorm323XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +324,SpatialTransformer-Proj_in324,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin324einsum,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjin324einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 
1280]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +325,BasicTransformerBlock-Input_layernorm325,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm325XnormLayerNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockInputlayernorm325XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +326,SelfAttention326-Q-326,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention326Q326MatMulQ,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention326Q326MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+326,SelfAttention326-K-326,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention326K326MatMulK,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention326K326MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +326,SelfAttention326-V-326,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention326V326MatMulV,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,SelfAttention326V326MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +327,SelfAttention326-FlashAttention-327,"FlashAttention(q=1x256x8x160,k=1x256x8x160,v=1x256x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention326FlashAttention327FlashAttention,MXU,1,Memory,21445,320,21445,0,0,0,0,0,0,0,0,320,160,0,0,170393600,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,256,8,160]","[DT_BFLOAT16:(1,256,8,8)]",18874368,SelfAttention326FlashAttention327FlashAttention,FlashAttention,0,[],FlashAttention,,"[256, 
256]",,170393600,16,8,256,256,160,128,320,170393600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2501,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.8801290743763115,7399.925681977151,0.0004164249269358608,0.9999899570239393,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +328,SelfAttention326-Attention_output-328,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention326Attentionoutput328MatMulattnOutputattnAvgWo,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SelfAttention326Attentionoutput328MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +329,SelfAttention326-Attention_layernorm-329,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention326Attentionlayernorm329YnormLayerNormy,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,SelfAttention326Attentionlayernorm329YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+330,GatedSelfAttention-Linear330,"XlaEinsum(a=1x8x768,b=768x1280,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear330XLinearcontext,MXU,1,Memory,500,272,500,0,0,0,0,0,0,0,0,272,30,0,0,1998848,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,1280]","[DT_BFLOAT16:(1,8,1280)]",15728640,GatedSelfAttentionLinear330XLinearcontext,Einsum,1966080,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 1280], [1, 8, 1280]]",1,1998848,15,1,8,1280,768,0,272,1998848,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,124,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.45728,3723.14453125,0.014883720930232559,0.5031276393581081,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +331,GatedSelfAttention-Attn330-Q-331,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330Q331MatMulQ,MXU,1,Compute,832,832,583,0,0,0,0,0,0,0,0,832,100,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn330Q331MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,4628480,50,1,264,1280,1280,0,832,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,273,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1039.753846153846,5181.019122783954,0.49194991055456166,0.7001377192951289,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +331,GatedSelfAttention-Attn330-K-331,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330K331MatMulK,MXU,1,Compute,832,832,583,0,0,0,0,0,0,0,0,832,100,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn330K331MatMulK,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 
160]]",1,4628480,50,1,264,1280,1280,0,832,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,273,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1039.753846153846,5181.019122783954,0.49194991055456166,0.7001377192951289,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +331,GatedSelfAttention-Attn330-V-331,"XlaEinsum(a=1x264x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330V331MatMulV,MXU,1,Compute,832,832,583,0,0,0,0,0,0,0,0,832,100,0,0,4628480,"DT_BFLOAT16:[1,264,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,264,8,160)]",865075200,GatedSelfAttentionAttn330V331MatMulV,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 264, 1280], [1280, 8, 160], [1, 264, 8, 160]]",1,4628480,50,1,264,1280,1280,0,832,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,273,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1039.753846153846,5181.019122783954,0.49194991055456166,0.7001377192951289,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +332,GatedSelfAttention-Attn330-FlashAttention-332,"FlashAttention(q=1x264x8x160,k=1x264x8x160,v=1x264x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn330FlashAttention332FlashAttention,MXU,1,Memory,22796,1088,22796,0,0,0,0,0,0,0,0,1088,264,0,0,181125120,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[1,264,8,160]","[DT_BFLOAT16:(1,264,8,8)]",20072448,GatedSelfAttentionAttn330FlashAttention332FlashAttention,FlashAttention,0,[],FlashAttention,,"[264, 264]",,181125120,64,8,264,264,160,136,1088,181125120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2846,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.8805250043867345,7399.80317099217,0.00041661225755640525,0.9999734014854283,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+333,GatedSelfAttention-Attn330-Attention_output-333,"XlaEinsum(a=1x264x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330Attentionoutput333MatMulattnOutputattnAvgWo,MXU,1,Compute,832,832,583,0,0,0,0,0,0,0,0,832,100,0,0,4628480,"DT_BFLOAT16:[1,264,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,264,1280)]",865075200,GatedSelfAttentionAttn330Attentionoutput333MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 264, 8, 160], [8, 160, 1280], [1, 264, 1280]]",1,4628480,50,1,264,1280,1280,0,832,4628480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,273,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1039.753846153846,5181.019122783954,0.49194991055456166,0.7001377192951289,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +334,GatedSelfAttention-Attn330-Attention_layernorm-334,"LayerNorm(x=1x264x1280,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn330Attentionlayernorm334YnormLayerNormy,VPU,1,Memory,500,165,500,0,0,0,0,0,0,0,0,0,165,0,0,1351680,"DT_BFLOAT16:[1,264,1280]","[DT_BFLOAT16:(1,264,1280)]",2703360,GatedSelfAttentionAttn330Attentionlayernorm334YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,165,1351680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,97,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.40672,2517.7001953125,0.33,0.3402297561233108,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +335,GatedSelfAttention-FFN330Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN330FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,2062,1632,2062,0,0,0,0,0,0,0,0,1632,200,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,GatedSelfAttentionFFN330FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 
256, 5120]]",1,16384000,100,1,256,5120,1280,0,1632,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,640,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1627.2760426770126,7399.994695683802,0.7699306009819622,0.9999992832005138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +336,GatedSelfAttention-FFN330Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN330FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,2062,1632,2062,0,0,0,0,0,0,0,0,1632,200,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,GatedSelfAttentionFFN330FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 1280]]",1,6946816,100,1,256,1280,5120,0,1632,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,640,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1627.2760426770126,7399.994695683802,0.7699306009819622,0.9999992832005138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +337,GatedSelfAttention-FFN330Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN330FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,20,500,0,0,0,0,0,0,0,0,0,20,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,GatedSelfAttentionFFN330FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,20,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.65536,2441.40625,0.04,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+338,BasicTransformerBlock-Fuser_output_layernorm338,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm338XnormLayerNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockFuseroutputlayernorm338XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +339,CrossAttention339-Q-339,"XlaEinsum(a=1x256x1280,b=1280x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention339Q339MatMulQ,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,8,160]","[DT_BFLOAT16:(1,256,8,160)]",838860800,CrossAttention339Q339MatMulQ,Einsum,3276800,[],Einsum,"BLM,MND->BLND","[[1, 256, 1280], [1280, 8, 160], [1, 256, 8, 160]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +339,CrossAttention339-K-339,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention339K339MatMulK,MXU,1,Compute,512,512,512,0,0,0,0,0,0,0,0,512,60,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention339K339MatMulK,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 
160]]",1,4063232,30,1,512,1280,768,0,512,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1966.08,7390.9759521484375,0.9302325581395349,0.9987805340741132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +339,CrossAttention339-V-339,"XlaEinsum(a=1x512x768,b=768x8x160,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention339V339MatMulV,MXU,1,Compute,512,512,512,0,0,0,0,0,0,0,0,512,60,0,0,4063232,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,160]","[DT_BFLOAT16:(1,512,8,160)]",1006632960,CrossAttention339V339MatMulV,Einsum,1966080,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 160], [1, 512, 8, 160]]",1,4063232,30,1,512,1280,768,0,512,4063232,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1966.08,7390.9759521484375,0.9302325581395349,0.9987805340741132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +340,CrossAttention339-FlashAttention-340,"FlashAttention(q=1x256x8x160,k=1x512x8x160,v=1x512x8x160,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention339FlashAttention340FlashAttention,MXU,1,Compute,576,576,500,0,0,0,0,0,0,0,0,576,320,0,0,3932160,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[1,512,8,160],DT_BFLOAT16:[1,512,8,160]","[DT_BFLOAT16:(1,256,8,8)]",37748736,CrossAttention339FlashAttention340FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 256]",,933888,32,8,512,256,160,256,576,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,65.536,6357.828776041667,0.031007751937984496,0.8591660508164415,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+341,CrossAttention339-Attention_output-341,"XlaEinsum(a=1x256x8x160,b=8x160x1280,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention339Attentionoutput341MatMulattnOutputattnAvgWo,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,8,160],DT_BFLOAT16:[8,160,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,CrossAttention339Attentionoutput341MatMulattnOutputattnAvgWo,Einsum,3276800,[],Einsum,"BLND,NDM->BLM","[[1, 256, 8, 160], [8, 160, 1280], [1, 256, 1280]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +342,CrossAttention339-Attention_layernorm-342,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention339Attentionlayernorm342YnormLayerNormy,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,CrossAttention339Attentionlayernorm342YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+343,BasicTransformerBlock-Attn_output_layernorm343,"LayerNorm(x=1x256x1280,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm343XnormLayerNormX,VPU,1,Memory,500,160,500,0,0,0,0,0,0,0,0,0,160,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",2621440,BasicTransformerBlockAttnoutputlayernorm343XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,160,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.24288,2441.40625,0.32,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +344,BasicTransformerBlock-FFN344Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x1280,b=1280x5120,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN344FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Memory,2062,1632,2062,0,0,0,0,0,0,0,0,1632,200,0,0,16384000,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,5120]","[DT_BFLOAT16:(1,256,5120)]",3355443200,BasicTransformerBlockFFN344FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,13107200,[],Einsum,"BLM,MH->BLH","[[1, 256, 1280], [1280, 5120], [1, 256, 5120]]",1,16384000,100,1,256,5120,1280,0,1632,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,640,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1627.2760426770126,7399.994695683802,0.7699306009819622,0.9999992832005138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +345,BasicTransformerBlock-FFN344Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x256x5120,b=5120x1280,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN344FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Memory,2062,1632,2062,0,0,0,0,0,0,0,0,1632,200,0,0,16384000,"DT_BFLOAT16:[1,256,5120],DT_BFLOAT16:[5120,1280]","[DT_BFLOAT16:(1,256,1280)]",3355443200,BasicTransformerBlockFFN344FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,13107200,[],Einsum,"BLH,HM->BLM","[[1, 256, 5120], [5120, 1280], [1, 256, 
1280]]",1,6946816,100,1,256,1280,5120,0,1632,16384000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,640,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1627.2760426770126,7399.994695683802,0.7699306009819622,0.9999992832005138,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +346,BasicTransformerBlock-FFN344Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x256x1280,b=1x256x1280,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN344FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,20,500,0,0,0,0,0,0,0,0,0,20,0,0,1310720,"DT_BFLOAT16:[1,256,1280]","[DT_BFLOAT16:(1,256,1280)]",327680,BasicTransformerBlockFFN344FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,20,1310720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.65536,2441.40625,0.04,0.3299197635135135,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +347,SpatialTransformer-Proj_out347,"XlaEinsum(a=1x256x1280,b=1280x1280,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout347einsum,MXU,1,Memory,578,432,578,0,0,0,0,0,0,0,0,432,50,0,0,4587520,"DT_BFLOAT16:[1,256,1280],DT_BFLOAT16:[1280,1280]","[DT_BFLOAT16:(1,256,1280)]",838860800,SpatialTransformerProjout347einsum,Einsum,3276800,[],Einsum,"BSN,NC->BSC","[[1, 256, 1280], [1280, 1280], [1, 256, 1280]]",1,4587520,25,1,256,1280,1280,0,432,4587520,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,173,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1451.3162629757785,7391.800929930796,0.6866768595263002,0.9988920175582157,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+348,Upsample348,"Upsample(a=1x1280x16x16,scale_factor=2,memory_placements=0_0_0,type=DT_BFLOAT16)",Upsample348Upsample,VPU,1,Memory,500,0,500,0,0,0,0,0,0,0,0,0,0,0,0,3276800,"DT_BFLOAT16:[1,1280,16,16]","[DT_BFLOAT16:(1,1280,32,32)]",0,Upsample348Upsample,Upsample,0,[],Upsample,,,,,0,,,,,0,0,3276800,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,6103.515625,0.0,0.8247994087837838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +348,Upsample-Conv2d348Conv2d,"Conv2D(a=1x1280x16x16,b=1280x1280x3x3,c=1x1280x16x16,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",UpsampleConv2d348Conv2dconv2d,MXU,1,Memory,3877,3632,3877,0,0,0,0,0,0,0,0,3632,450,0,0,30801920,"DT_BFLOAT16:[1,1280,16,16],DT_BFLOAT16:[1280,1280,3,3]","[DT_BFLOAT16:(1,1280,16,16)]",7549747200,UpsampleConv2d348Conv2dconv2d,Conv2D,29491200,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 16, 16], [1280, 1280, 3, 3], [1, 1280, 16, 16]]",1,30976000,225,1,1280,256,11520,0,3632,30801920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1345,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1947.3167913335053,7399.154871679133,0.9213549195913887,0.9998857934701532,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +349,Time-Embed-MLP-Einsum349,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum349einsum,MXU,1,Memory,500,272,500,0,0,0,0,0,0,0,0,272,30,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum349einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 
640]]",1,1642240,15,1,1,640,1280,0,272,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,124,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.2768,3058.910369873047,0.0015503875968992248,0.41336626619906036,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +350,Conv2d-GroupNorm350,"GroupNorm(x=1x1920x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm350XnormGroupNormX,VPU,1,Memory,990,960,990,0,0,0,0,0,0,0,0,0,960,0,0,7864320,"DT_BFLOAT16:[1,1920,32,32]","[DT_BFLOAT16:(1,1920,32,32)]",15728640,Conv2dGroupNorm350XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,960,7864320,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,351,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +351,Conv2d350Conv2d,"Conv2D(a=1x1920x32x32,b=1920x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d350Conv2dconv2d,MXU,1,Compute,13088,13088,3444,0,0,0,0,0,0,0,0,13088,1632,0,0,27361280,"DT_BFLOAT16:[1,1920,32,32],DT_BFLOAT16:[1920,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",22649241600,Conv2d350Conv2dconv2d,Conv2D,22118400,[],Conv2D,bf01;io01->bf01,"[[1, 1920, 32, 32], [1920, 640, 3, 3], [1, 640, 32, 32]]",1,27868160,816,1,640,1024,17280,0,13088,27361280,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3660,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1730.5349633251833,1946.987907577552,0.8187866037413999,0.2631064739969665,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+352,Conv2d-GroupNorm352,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm352XnormGroupNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm352XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +353,Conv2d352Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d352Conv2dconv2d,MXU,1,Compute,4448,4448,1258,0,0,0,0,0,0,0,0,4448,552,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d352Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,276,1,640,1024,5760,0,4448,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1254,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1697.3352517985613,2092.5947230496854,0.8030784674585913,0.2827830706823899,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +354,SkipConnection-Einsum349,"XlaEinsum(a=1x32x32x1920,b=1920x640,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum349einsum,MXU,1,Compute,1568,1568,970,0,0,0,0,0,0,0,0,1568,192,0,0,7700480,"DT_BFLOAT16:[1,32,32,1920],DT_BFLOAT16:[1920,640]","[DT_BFLOAT16:(1,32,32,640)]",2516582400,SkipConnectionEinsum349einsum,Einsum,2457600,[],Einsum,"BHWC,CO->BHWO","[[1, 32, 32, 1920], [1920, 640], [1, 32, 32, 
640]]",1,7700480,96,1,1024,640,1920,0,1568,7700480,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,501,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1604.9632653061226,4573.744170519771,0.7593735168486,0.6180735365567258,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +355,SpatialTransformer-Input_GroupNorm355,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm355XnormGroupNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm355XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +356,SpatialTransformer-Proj_in356,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin356einsum,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin356einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+357,BasicTransformerBlock-Input_layernorm357,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm357XnormLayerNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm357XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +358,SelfAttention358-Q-358,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention358Q358MatMulQ,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention358Q358MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +358,SelfAttention358-K-358,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention358K358MatMulK,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention358K358MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 
80]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +358,SelfAttention358-V-358,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention358V358MatMulV,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention358V358MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +359,SelfAttention358-FlashAttention-359,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention358FlashAttention359FlashAttention,MXU,1,Compute,4160,4160,660,0,0,0,0,0,0,0,0,4160,2560,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention358FlashAttention359FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 256]",,1417216,256,8,1024,1024,80,2048,4160,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1114,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,72.59372307692308,1173.7530048076924,0.03434704830053668,0.15861527091995842,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+360,SelfAttention358-Attention_output-360,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention358Attentionoutput360MatMulattnOutputattnAvgWo,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention358Attentionoutput360MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +361,SelfAttention358-Attention_layernorm-361,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention358Attentionlayernorm361YnormLayerNormy,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention358Attentionlayernorm361YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +362,GatedSelfAttention-Linear362,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear362XLinearcontext,MXU,1,Memory,500,176,500,0,0,0,0,0,0,0,0,176,18,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear362XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 
640]]",1,1005568,9,1,8,640,768,0,176,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.72864,1873.016357421875,0.0074418604651162795,0.25311031857052363,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +363,GatedSelfAttention-Attn362-Q-363,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362Q363MatMulQ,MXU,1,Compute,752,752,500,0,0,0,0,0,0,0,0,752,90,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn362Q363MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,45,1,1032,640,640,0,752,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1124.2212765957447,4286.4616881025595,0.5319148936170213,0.5792515794733188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +363,GatedSelfAttention-Attn362-K-363,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362K363MatMulK,MXU,1,Compute,752,752,500,0,0,0,0,0,0,0,0,752,90,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn362K363MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,45,1,1032,640,640,0,752,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1124.2212765957447,4286.4616881025595,0.5319148936170213,0.5792515794733188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+363,GatedSelfAttention-Attn362-V-363,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362V363MatMulV,MXU,1,Compute,752,752,500,0,0,0,0,0,0,0,0,752,90,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn362V363MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,45,1,1032,640,640,0,752,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1124.2212765957447,4286.4616881025595,0.5319148936170213,0.5792515794733188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +364,GatedSelfAttention-Attn362-FlashAttention-364,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn362FlashAttention364FlashAttention,MXU,1,Compute,6464,6464,665,0,0,0,0,0,0,0,0,6464,2880,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn362FlashAttention364FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 256]",,1427968,400,8,1032,1032,80,2080,6464,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1691,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,47.45156435643564,761.2870471312268,0.022451268564356433,0.10287662799070632,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+365,GatedSelfAttention-Attn362-Attention_output-365,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362Attentionoutput365MatMulattnOutputattnAvgWo,MXU,1,Compute,752,752,500,0,0,0,0,0,0,0,0,752,90,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn362Attentionoutput365MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,45,1,1032,640,640,0,752,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1124.2212765957447,4286.4616881025595,0.5319148936170213,0.5792515794733188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +366,GatedSelfAttention-Attn362-Attention_layernorm-366,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn362Attentionlayernorm366YnormLayerNormy,VPU,1,Memory,500,323,500,0,0,0,0,0,0,0,0,0,323,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn362Attentionlayernorm366YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,323,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,137,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.56768,4920.95947265625,0.6449999999999999,0.6649945233319257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +367,GatedSelfAttention-FFN362Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN362FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,1952,1952,1238,0,0,0,0,0,0,0,0,1952,240,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN362FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 
2560], [1, 1024, 2560]]",1,9830400,120,1,1024,2560,640,0,1952,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,627,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1718.9770491803276,4690.201556096312,0.8133180836192654,0.6338110210940961,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +368,GatedSelfAttention-FFN362Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN362FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,1952,1952,1238,0,0,0,0,0,0,0,0,1952,240,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN362FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,8126464,120,1,1024,640,2560,0,1952,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,627,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1718.9770491803276,4690.201556096312,0.8133180836192654,0.6338110210940961,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +369,GatedSelfAttention-FFN362Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN362FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN362FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,40,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,4882.8125,0.08,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+370,BasicTransformerBlock-Fuser_output_layernorm370,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm370XnormLayerNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm370XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +371,CrossAttention371-Q-371,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention371Q371MatMulQ,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention371Q371MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +371,CrossAttention371-K-371,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention371K371MatMulK,MXU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,320,36,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention371K371MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 
80]]",1,2424832,18,1,512,640,768,0,320,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1006.63296,4516.6015625,0.4762790697674419,0.6103515625,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +371,CrossAttention371-V-371,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention371V371MatMulV,MXU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,320,36,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention371V371MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,18,1,512,640,768,0,320,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1006.63296,4516.6015625,0.4762790697674419,0.6103515625,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +372,CrossAttention371-FlashAttention-372,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention371FlashAttention372FlashAttention,MXU,1,Compute,2112,2112,500,0,0,0,0,0,0,0,0,2112,1280,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention371FlashAttention372FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 256]",,729088,128,8,512,1024,80,1024,2112,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,584,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,71.49381818181818,1733.9533025568182,0.03382663847780127,0.2343180138590295,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+373,CrossAttention371-Attention_output-373,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention371Attentionoutput373MatMulattnOutputattnAvgWo,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention371Attentionoutput373MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +374,CrossAttention371-Attention_layernorm-374,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention371Attentionlayernorm374YnormLayerNormy,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention371Attentionlayernorm374YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+375,BasicTransformerBlock-Attn_output_layernorm375,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm375XnormLayerNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm375XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +376,BasicTransformerBlock-FFN376Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN376FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,1952,1952,1238,0,0,0,0,0,0,0,0,1952,240,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN376FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,120,1,1024,2560,640,0,1952,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,627,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1718.9770491803276,4690.201556096312,0.8133180836192654,0.6338110210940961,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +377,BasicTransformerBlock-FFN376Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN376FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,1952,1952,1238,0,0,0,0,0,0,0,0,1952,240,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN376FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 
640]]",1,8126464,120,1,1024,640,2560,0,1952,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,627,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1718.9770491803276,4690.201556096312,0.8133180836192654,0.6338110210940961,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +378,BasicTransformerBlock-FFN376Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN376FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN376FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,40,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,4882.8125,0.08,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +379,SpatialTransformer-Proj_out379,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout379einsum,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout379einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+380,Time-Embed-MLP-Einsum380,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum380einsum,MXU,1,Memory,500,272,500,0,0,0,0,0,0,0,0,272,30,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum380einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1642240,15,1,1,640,1280,0,272,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,124,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.2768,3058.910369873047,0.0015503875968992248,0.41336626619906036,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +381,Conv2d-GroupNorm381,"GroupNorm(x=1x1280x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm381XnormGroupNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,1280,32,32]","[DT_BFLOAT16:(1,1280,32,32)]",10485760,Conv2dGroupNorm381XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +382,Conv2d381Conv2d,"Conv2D(a=1x1280x32x32,b=1280x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d381Conv2dconv2d,MXU,1,Compute,8672,8672,2351,0,0,0,0,0,0,0,0,8672,1080,0,0,18677760,"DT_BFLOAT16:[1,1280,32,32],DT_BFLOAT16:[1280,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",15099494400,Conv2d381Conv2dconv2d,Conv2D,14745600,[],Conv2D,bf01;io01->bf01,"[[1, 1280, 32, 32], [1280, 640, 3, 3], [1, 640, 32, 
32]]",1,19015680,540,1,640,1024,11520,0,8672,18677760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1741.1778597785978,2005.883248529751,0.8238221917102893,0.27106530385537175,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +383,Conv2d-GroupNorm383,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm383XnormGroupNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm383XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +384,Conv2d383Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d383Conv2dconv2d,MXU,1,Compute,4448,4448,1258,0,0,0,0,0,0,0,0,4448,552,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d383Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,276,1,640,1024,5760,0,4448,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1254,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1697.3352517985613,2092.5947230496854,0.8030784674585913,0.2827830706823899,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+385,SkipConnection-Einsum380,"XlaEinsum(a=1x32x32x1280,b=1280x640,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum380einsum,MXU,1,Compute,992,992,702,0,0,0,0,0,0,0,0,992,120,0,0,5570560,"DT_BFLOAT16:[1,32,32,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,32,32,640)]",1677721600,SkipConnectionEinsum380einsum,Einsum,1638400,[],Einsum,"BHWC,CO->BHWO","[[1, 32, 32, 1280], [1280, 640], [1, 32, 32, 640]]",1,5570560,60,1,1024,640,1280,0,992,5570560,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,327,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1691.2516129032258,5229.826896421371,0.800200050012503,0.7067333643812663,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +386,SpatialTransformer-Input_GroupNorm386,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm386XnormGroupNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm386XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +387,SpatialTransformer-Proj_in387,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin387einsum,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin387einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 
640]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +388,BasicTransformerBlock-Input_layernorm388,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm388XnormLayerNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm388XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +389,SelfAttention389-Q-389,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention389Q389MatMulQ,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention389Q389MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+389,SelfAttention389-K-389,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention389K389MatMulK,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention389K389MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +389,SelfAttention389-V-389,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention389V389MatMulV,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention389V389MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +390,SelfAttention389-FlashAttention-390,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention389FlashAttention390FlashAttention,MXU,1,Compute,4160,4160,660,0,0,0,0,0,0,0,0,4160,2560,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention389FlashAttention390FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 
256]",,1417216,256,8,1024,1024,80,2048,4160,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1114,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,72.59372307692308,1173.7530048076924,0.03434704830053668,0.15861527091995842,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +391,SelfAttention389-Attention_output-391,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention389Attentionoutput391MatMulattnOutputattnAvgWo,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention389Attentionoutput391MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +392,SelfAttention389-Attention_layernorm-392,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention389Attentionlayernorm392YnormLayerNormy,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention389Attentionlayernorm392YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+393,GatedSelfAttention-Linear393,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear393XLinearcontext,MXU,1,Memory,500,176,500,0,0,0,0,0,0,0,0,176,18,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear393XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 640]]",1,1005568,9,1,8,640,768,0,176,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.72864,1873.016357421875,0.0074418604651162795,0.25311031857052363,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +394,GatedSelfAttention-Attn393-Q-394,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393Q394MatMulQ,MXU,1,Compute,752,752,500,0,0,0,0,0,0,0,0,752,90,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn393Q394MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,45,1,1032,640,640,0,752,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1124.2212765957447,4286.4616881025595,0.5319148936170213,0.5792515794733188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +394,GatedSelfAttention-Attn393-K-394,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393K394MatMulK,MXU,1,Compute,752,752,500,0,0,0,0,0,0,0,0,752,90,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn393K394MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 
80]]",1,3461120,45,1,1032,640,640,0,752,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1124.2212765957447,4286.4616881025595,0.5319148936170213,0.5792515794733188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +394,GatedSelfAttention-Attn393-V-394,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393V394MatMulV,MXU,1,Compute,752,752,500,0,0,0,0,0,0,0,0,752,90,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn393V394MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,45,1,1032,640,640,0,752,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1124.2212765957447,4286.4616881025595,0.5319148936170213,0.5792515794733188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +395,GatedSelfAttention-Attn393-FlashAttention-395,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn393FlashAttention395FlashAttention,MXU,1,Compute,6464,6464,665,0,0,0,0,0,0,0,0,6464,2880,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn393FlashAttention395FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 256]",,1427968,400,8,1032,1032,80,2080,6464,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1691,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,47.45156435643564,761.2870471312268,0.022451268564356433,0.10287662799070632,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+396,GatedSelfAttention-Attn393-Attention_output-396,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393Attentionoutput396MatMulattnOutputattnAvgWo,MXU,1,Compute,752,752,500,0,0,0,0,0,0,0,0,752,90,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn393Attentionoutput396MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,45,1,1032,640,640,0,752,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1124.2212765957447,4286.4616881025595,0.5319148936170213,0.5792515794733188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +397,GatedSelfAttention-Attn393-Attention_layernorm-397,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn393Attentionlayernorm397YnormLayerNormy,VPU,1,Memory,500,323,500,0,0,0,0,0,0,0,0,0,323,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn393Attentionlayernorm397YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,323,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,137,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.56768,4920.95947265625,0.6449999999999999,0.6649945233319257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +398,GatedSelfAttention-FFN393Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN393FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,1952,1952,1238,0,0,0,0,0,0,0,0,1952,240,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN393FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 
2560], [1, 1024, 2560]]",1,9830400,120,1,1024,2560,640,0,1952,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,627,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1718.9770491803276,4690.201556096312,0.8133180836192654,0.6338110210940961,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +399,GatedSelfAttention-FFN393Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN393FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,1952,1952,1238,0,0,0,0,0,0,0,0,1952,240,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN393FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,8126464,120,1,1024,640,2560,0,1952,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,627,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1718.9770491803276,4690.201556096312,0.8133180836192654,0.6338110210940961,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +400,GatedSelfAttention-FFN393Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN393FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN393FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,40,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,4882.8125,0.08,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+401,BasicTransformerBlock-Fuser_output_layernorm401,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm401XnormLayerNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm401XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +402,CrossAttention402-Q-402,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention402Q402MatMulQ,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention402Q402MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +402,CrossAttention402-K-402,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention402K402MatMulK,MXU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,320,36,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention402K402MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 
80]]",1,2424832,18,1,512,640,768,0,320,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1006.63296,4516.6015625,0.4762790697674419,0.6103515625,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +402,CrossAttention402-V-402,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention402V402MatMulV,MXU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,320,36,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention402V402MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,18,1,512,640,768,0,320,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1006.63296,4516.6015625,0.4762790697674419,0.6103515625,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +403,CrossAttention402-FlashAttention-403,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention402FlashAttention403FlashAttention,MXU,1,Compute,2112,2112,500,0,0,0,0,0,0,0,0,2112,1280,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention402FlashAttention403FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 256]",,729088,128,8,512,1024,80,1024,2112,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,584,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,71.49381818181818,1733.9533025568182,0.03382663847780127,0.2343180138590295,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+404,CrossAttention402-Attention_output-404,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention402Attentionoutput404MatMulattnOutputattnAvgWo,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention402Attentionoutput404MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +405,CrossAttention402-Attention_layernorm-405,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention402Attentionlayernorm405YnormLayerNormy,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention402Attentionlayernorm405YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+406,BasicTransformerBlock-Attn_output_layernorm406,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm406XnormLayerNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm406XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +407,BasicTransformerBlock-FFN407Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN407FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,1952,1952,1238,0,0,0,0,0,0,0,0,1952,240,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN407FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,120,1,1024,2560,640,0,1952,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,627,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1718.9770491803276,4690.201556096312,0.8133180836192654,0.6338110210940961,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +408,BasicTransformerBlock-FFN407Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN407FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,1952,1952,1238,0,0,0,0,0,0,0,0,1952,240,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN407FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 
640]]",1,8126464,120,1,1024,640,2560,0,1952,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,627,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1718.9770491803276,4690.201556096312,0.8133180836192654,0.6338110210940961,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +409,BasicTransformerBlock-FFN407Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN407FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN407FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,40,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,4882.8125,0.08,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +410,SpatialTransformer-Proj_out410,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout410einsum,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout410einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+411,Time-Embed-MLP-Einsum411,"XlaEinsum(a=1x1280,b=1280x640,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum411einsum,MXU,1,Memory,500,272,500,0,0,0,0,0,0,0,0,272,30,0,0,1642240,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,640]","[DT_BFLOAT16:(1,640)]",1638400,TimeEmbedMLPEinsum411einsum,Einsum,1638400,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 640], [1, 640]]",1,1642240,15,1,1,640,1280,0,272,1642240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,124,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,3.2768,3058.910369873047,0.0015503875968992248,0.41336626619906036,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +412,Conv2d-GroupNorm412,"GroupNorm(x=1x960x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm412XnormGroupNormX,VPU,1,Memory,500,480,500,0,0,0,0,0,0,0,0,0,480,0,0,3932160,"DT_BFLOAT16:[1,960,32,32]","[DT_BFLOAT16:(1,960,32,32)]",7864320,Conv2dGroupNorm412XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,480,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,176,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.72864,7324.21875,0.96,0.9897592905405406,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +413,Conv2d412Conv2d,"Conv2D(a=1x960x32x32,b=960x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d412Conv2dconv2d,MXU,1,Compute,6560,6560,1805,0,0,0,0,0,0,0,0,6560,816,0,0,14336000,"DT_BFLOAT16:[1,960,32,32],DT_BFLOAT16:[960,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",11324620800,Conv2d412Conv2dconv2d,Conv2D,11059200,[],Conv2D,bf01;io01->bf01,"[[1, 960, 32, 32], [960, 640, 3, 3], [1, 640, 32, 
32]]",1,14589440,408,1,640,1024,8640,0,6560,14336000,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1843,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1726.3141463414636,2035.2805533060214,0.8167895632444697,0.2750379126089218,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +414,Conv2d-GroupNorm414,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm414XnormGroupNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,Conv2dGroupNorm414XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +415,Conv2d414Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d414Conv2dconv2d,MXU,1,Compute,4448,4448,1258,0,0,0,0,0,0,0,0,4448,552,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,Conv2d414Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,276,1,640,1024,5760,0,4448,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1254,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1697.3352517985613,2092.5947230496854,0.8030784674585913,0.2827830706823899,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+416,SkipConnection-Einsum411,"XlaEinsum(a=1x32x32x960,b=960x640,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum411einsum,MXU,1,Compute,800,800,568,0,0,0,0,0,0,0,0,800,96,0,0,4505600,"DT_BFLOAT16:[1,32,32,960],DT_BFLOAT16:[960,640]","[DT_BFLOAT16:(1,32,32,640)]",1258291200,SkipConnectionEinsum411einsum,Einsum,1228800,[],Einsum,"BHWC,CO->BHWO","[[1, 32, 32, 960], [960, 640], [1, 32, 32, 640]]",1,4505600,48,1,1024,640,960,0,800,4505600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,264,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1572.864,5245.208740234375,0.7441860465116279,0.7088119919235641,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +417,SpatialTransformer-Input_GroupNorm417,"GroupNorm(x=1x640x32x32,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm417XnormGroupNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,32,32)]",5242880,SpatialTransformerInputGroupNorm417XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +418,SpatialTransformer-Proj_in418,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin418einsum,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjin418einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 
640]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +419,BasicTransformerBlock-Input_layernorm419,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm419XnormLayerNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockInputlayernorm419XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +420,SelfAttention420-Q-420,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention420Q420MatMulQ,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention420Q420MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+420,SelfAttention420-K-420,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention420K420MatMulK,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention420K420MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +420,SelfAttention420-V-420,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention420V420MatMulV,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,SelfAttention420V420MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +421,SelfAttention420-FlashAttention-421,"FlashAttention(q=1x1024x8x80,k=1x1024x8x80,v=1x1024x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention420FlashAttention421FlashAttention,MXU,1,Compute,4160,4160,660,0,0,0,0,0,0,0,0,4160,2560,0,0,5242880,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,1024,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",301989888,SelfAttention420FlashAttention421FlashAttention,FlashAttention,0,[],FlashAttention,,"[1024, 
256]",,1417216,256,8,1024,1024,80,2048,4160,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1114,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,72.59372307692308,1173.7530048076924,0.03434704830053668,0.15861527091995842,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +422,SelfAttention420-Attention_output-422,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention420Attentionoutput422MatMulattnOutputattnAvgWo,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SelfAttention420Attentionoutput422MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +423,SelfAttention420-Attention_layernorm-423,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention420Attentionlayernorm423YnormLayerNormy,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,SelfAttention420Attentionlayernorm423YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+424,GatedSelfAttention-Linear424,"XlaEinsum(a=1x8x768,b=768x640,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear424XLinearcontext,MXU,1,Memory,500,176,500,0,0,0,0,0,0,0,0,176,18,0,0,1005568,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,640]","[DT_BFLOAT16:(1,8,640)]",7864320,GatedSelfAttentionLinear424XLinearcontext,Einsum,983040,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 640], [1, 8, 640]]",1,1005568,9,1,8,640,768,0,176,1005568,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.72864,1873.016357421875,0.0074418604651162795,0.25311031857052363,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +425,GatedSelfAttention-Attn424-Q-425,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424Q425MatMulQ,MXU,1,Compute,752,752,500,0,0,0,0,0,0,0,0,752,90,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn424Q425MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,45,1,1032,640,640,0,752,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1124.2212765957447,4286.4616881025595,0.5319148936170213,0.5792515794733188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +425,GatedSelfAttention-Attn424-K-425,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424K425MatMulK,MXU,1,Compute,752,752,500,0,0,0,0,0,0,0,0,752,90,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn424K425MatMulK,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 
80]]",1,3461120,45,1,1032,640,640,0,752,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1124.2212765957447,4286.4616881025595,0.5319148936170213,0.5792515794733188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +425,GatedSelfAttention-Attn424-V-425,"XlaEinsum(a=1x1032x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424V425MatMulV,MXU,1,Compute,752,752,500,0,0,0,0,0,0,0,0,752,90,0,0,3461120,"DT_BFLOAT16:[1,1032,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1032,8,80)]",845414400,GatedSelfAttentionAttn424V425MatMulV,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1032, 640], [640, 8, 80], [1, 1032, 8, 80]]",1,3461120,45,1,1032,640,640,0,752,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1124.2212765957447,4286.4616881025595,0.5319148936170213,0.5792515794733188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +426,GatedSelfAttention-Attn424-FlashAttention-426,"FlashAttention(q=1x1032x8x80,k=1x1032x8x80,v=1x1032x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn424FlashAttention426FlashAttention,MXU,1,Compute,6464,6464,665,0,0,0,0,0,0,0,0,6464,2880,0,0,5283840,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[1,1032,8,80]","[DT_BFLOAT16:(1,1032,8,8)]",306726912,GatedSelfAttentionAttn424FlashAttention426FlashAttention,FlashAttention,0,[],FlashAttention,,"[1032, 256]",,1427968,400,8,1032,1032,80,2080,6464,5283840,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1691,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,47.45156435643564,761.2870471312268,0.022451268564356433,0.10287662799070632,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+427,GatedSelfAttention-Attn424-Attention_output-427,"XlaEinsum(a=1x1032x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424Attentionoutput427MatMulattnOutputattnAvgWo,MXU,1,Compute,752,752,500,0,0,0,0,0,0,0,0,752,90,0,0,3461120,"DT_BFLOAT16:[1,1032,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1032,640)]",845414400,GatedSelfAttentionAttn424Attentionoutput427MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1032, 8, 80], [8, 80, 640], [1, 1032, 640]]",1,3461120,45,1,1032,640,640,0,752,3461120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1124.2212765957447,4286.4616881025595,0.5319148936170213,0.5792515794733188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +428,GatedSelfAttention-Attn424-Attention_layernorm-428,"LayerNorm(x=1x1032x640,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn424Attentionlayernorm428YnormLayerNormy,VPU,1,Memory,500,323,500,0,0,0,0,0,0,0,0,0,323,0,0,2641920,"DT_BFLOAT16:[1,1032,640]","[DT_BFLOAT16:(1,1032,640)]",5283840,GatedSelfAttentionAttn424Attentionlayernorm428YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,323,2641920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,137,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.56768,4920.95947265625,0.6449999999999999,0.6649945233319257,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +429,GatedSelfAttention-FFN424Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN424FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,1952,1952,1238,0,0,0,0,0,0,0,0,1952,240,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,GatedSelfAttentionFFN424FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 
2560], [1, 1024, 2560]]",1,9830400,120,1,1024,2560,640,0,1952,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,627,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1718.9770491803276,4690.201556096312,0.8133180836192654,0.6338110210940961,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +430,GatedSelfAttention-FFN424Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN424FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,1952,1952,1238,0,0,0,0,0,0,0,0,1952,240,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,GatedSelfAttentionFFN424FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 640]]",1,8126464,120,1,1024,640,2560,0,1952,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,627,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1718.9770491803276,4690.201556096312,0.8133180836192654,0.6338110210940961,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +431,GatedSelfAttention-FFN424Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN424FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,GatedSelfAttentionFFN424FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,40,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,4882.8125,0.08,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+432,BasicTransformerBlock-Fuser_output_layernorm432,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm432XnormLayerNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockFuseroutputlayernorm432XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +433,CrossAttention433-Q-433,"XlaEinsum(a=1x1024x640,b=640x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention433Q433MatMulQ,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,8,80]","[DT_BFLOAT16:(1,1024,8,80)]",838860800,CrossAttention433Q433MatMulQ,Einsum,819200,[],Einsum,"BLM,MND->BLND","[[1, 1024, 640], [640, 8, 80], [1, 1024, 8, 80]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +433,CrossAttention433-K-433,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention433K433MatMulK,MXU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,320,36,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention433K433MatMulK,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 
80]]",1,2424832,18,1,512,640,768,0,320,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1006.63296,4516.6015625,0.4762790697674419,0.6103515625,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +433,CrossAttention433-V-433,"XlaEinsum(a=1x512x768,b=768x8x80,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention433V433MatMulV,MXU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,320,36,0,0,2424832,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,80]","[DT_BFLOAT16:(1,512,8,80)]",503316480,CrossAttention433V433MatMulV,Einsum,983040,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 80], [1, 512, 8, 80]]",1,2424832,18,1,512,640,768,0,320,2424832,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1006.63296,4516.6015625,0.4762790697674419,0.6103515625,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +434,CrossAttention433-FlashAttention-434,"FlashAttention(q=1x1024x8x80,k=1x512x8x80,v=1x512x8x80,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention433FlashAttention434FlashAttention,MXU,1,Compute,2112,2112,500,0,0,0,0,0,0,0,0,2112,1280,0,0,3932160,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[1,512,8,80],DT_BFLOAT16:[1,512,8,80]","[DT_BFLOAT16:(1,1024,8,8)]",150994944,CrossAttention433FlashAttention434FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 256]",,729088,128,8,512,1024,80,1024,2112,3932160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,584,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,71.49381818181818,1733.9533025568182,0.03382663847780127,0.2343180138590295,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+435,CrossAttention433-Attention_output-435,"XlaEinsum(a=1x1024x8x80,b=8x80x640,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention433Attentionoutput435MatMulattnOutputattnAvgWo,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,8,80],DT_BFLOAT16:[8,80,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,CrossAttention433Attentionoutput435MatMulattnOutputattnAvgWo,Einsum,819200,[],Einsum,"BLND,NDM->BLM","[[1, 1024, 8, 80], [8, 80, 640], [1, 1024, 640]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +436,CrossAttention433-Attention_layernorm-436,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention433Attentionlayernorm436YnormLayerNormy,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,CrossAttention433Attentionlayernorm436YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+437,BasicTransformerBlock-Attn_output_layernorm437,"LayerNorm(x=1x1024x640,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm437XnormLayerNormX,VPU,1,Memory,500,320,500,0,0,0,0,0,0,0,0,0,320,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",5242880,BasicTransformerBlockAttnoutputlayernorm437XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,320,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,136,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,10.48576,4882.8125,0.64,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +438,BasicTransformerBlock-FFN438Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x640,b=640x2560,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN438FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,1952,1952,1238,0,0,0,0,0,0,0,0,1952,240,0,0,9830400,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,2560]","[DT_BFLOAT16:(1,1024,2560)]",3355443200,BasicTransformerBlockFFN438FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,3276800,[],Einsum,"BLM,MH->BLH","[[1, 1024, 640], [640, 2560], [1, 1024, 2560]]",1,9830400,120,1,1024,2560,640,0,1952,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,627,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1718.9770491803276,4690.201556096312,0.8133180836192654,0.6338110210940961,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +439,BasicTransformerBlock-FFN438Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x1024x2560,b=2560x640,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN438FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,1952,1952,1238,0,0,0,0,0,0,0,0,1952,240,0,0,9830400,"DT_BFLOAT16:[1,1024,2560],DT_BFLOAT16:[2560,640]","[DT_BFLOAT16:(1,1024,640)]",3355443200,BasicTransformerBlockFFN438FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,3276800,[],Einsum,"BLH,HM->BLM","[[1, 1024, 2560], [2560, 640], [1, 1024, 
640]]",1,8126464,120,1,1024,640,2560,0,1952,9830400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,627,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1718.9770491803276,4690.201556096312,0.8133180836192654,0.6338110210940961,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +440,BasicTransformerBlock-FFN438Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x1024x640,b=1x1024x640,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN438FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,500,40,500,0,0,0,0,0,0,0,0,0,40,0,0,2621440,"DT_BFLOAT16:[1,1024,640]","[DT_BFLOAT16:(1,1024,640)]",655360,BasicTransformerBlockFFN438FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,40,2621440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.31072,4882.8125,0.08,0.659839527027027,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +441,SpatialTransformer-Proj_out441,"XlaEinsum(a=1x1024x640,b=640x640,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout441einsum,MXU,1,Compute,608,608,500,0,0,0,0,0,0,0,0,608,72,0,0,3440640,"DT_BFLOAT16:[1,1024,640],DT_BFLOAT16:[640,640]","[DT_BFLOAT16:(1,1024,640)]",838860800,SpatialTransformerProjout441einsum,Einsum,819200,[],Einsum,"BSN,NC->BSC","[[1, 1024, 640], [640, 640], [1, 1024, 640]]",1,3440640,36,1,1024,640,640,0,608,3440640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1379.7052631578947,5270.305432771382,0.6527947776417788,0.7122034368609975,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+442,Upsample442,"Upsample(a=1x640x32x32,scale_factor=2,memory_placements=0_0_0,type=DT_BFLOAT16)",Upsample442Upsample,VPU,1,Memory,825,0,825,0,0,0,0,0,0,0,0,0,0,0,0,6553600,"DT_BFLOAT16:[1,640,32,32]","[DT_BFLOAT16:(1,640,64,64)]",0,Upsample442Upsample,Upsample,0,[],Upsample,,,,,0,,,,,0,0,6553600,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,7398.200757575758,0.0,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +442,Upsample-Conv2d442Conv2d,"Conv2D(a=1x640x32x32,b=640x640x3x3,c=1x640x32x32,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",UpsampleConv2d442Conv2dconv2d,MXU,1,Compute,4448,4448,1258,0,0,0,0,0,0,0,0,4448,552,0,0,9994240,"DT_BFLOAT16:[1,640,32,32],DT_BFLOAT16:[640,640,3,3]","[DT_BFLOAT16:(1,640,32,32)]",7549747200,UpsampleConv2d442Conv2dconv2d,Conv2D,7372800,[],Conv2D,bf01;io01->bf01,"[[1, 640, 32, 32], [640, 640, 3, 3], [1, 640, 32, 32]]",1,10163200,276,1,640,1024,5760,0,4448,9994240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1254,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1697.3352517985613,2092.5947230496854,0.8030784674585913,0.2827830706823899,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +443,Time-Embed-MLP-Einsum443,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum443einsum,MXU,1,Memory,500,192,500,0,0,0,0,0,0,0,0,192,20,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum443einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 
320]]",1,822400,10,1,1,320,1280,0,192,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.6384,1531.839370727539,0.0007751937984496124,0.20700532036858635,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +444,Conv2d-GroupNorm444,"GroupNorm(x=1x960x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm444XnormGroupNormX,VPU,1,Memory,1980,1920,1980,0,0,0,0,0,0,0,0,0,1920,0,0,15728640,"DT_BFLOAT16:[1,960,64,64]","[DT_BFLOAT16:(1,960,64,64)]",31457280,Conv2dGroupNorm444XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1920,15728640,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,703,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +445,Conv2d444Conv2d,"Conv2D(a=1x960x64x64,b=960x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d444Conv2dconv2d,MXU,1,Compute,17440,17440,2016,0,0,0,0,0,0,0,0,17440,2176,0,0,16015360,"DT_BFLOAT16:[1,960,64,64],DT_BFLOAT16:[960,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",22649241600,Conv2d444Conv2dconv2d,Conv2D,5529600,[],Conv2D,bf01;io01->bf01,"[[1, 960, 64, 64], [960, 320, 3, 3], [1, 320, 64, 64]]",1,16514560,1088,1,320,4096,8640,0,17440,16015360,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4587,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1298.6950458715598,855.2446277863389,0.6144655429912524,0.11557359834950526,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+446,Conv2d-GroupNorm446,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm446XnormGroupNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm446XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +447,Conv2d446Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d446Conv2dconv2d,MXU,1,Compute,6176,6176,892,0,0,0,0,0,0,0,0,6176,768,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d446Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,384,1,320,4096,2880,0,6176,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1644,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1222.4331606217618,1068.5599529681429,0.5783829377033378,0.14439999364434364,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +448,SkipConnection-Einsum443,"XlaEinsum(a=1x64x64x960,b=960x320,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum443einsum,MXU,1,Compute,2080,2080,1398,0,0,0,0,0,0,0,0,2080,256,0,0,11100160,"DT_BFLOAT16:[1,64,64,960],DT_BFLOAT16:[960,320]","[DT_BFLOAT16:(1,64,64,320)]",2516582400,SkipConnectionEinsum443einsum,Einsum,614400,[],Einsum,"BHWC,CO->BHWO","[[1, 64, 64, 960], [960, 320], [1, 64, 64, 
320]]",1,11100160,128,1,4096,320,960,0,2080,11100160,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,677,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1209.8953846153845,4970.110379732572,0.5724508050089445,0.671636537801699,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +449,SpatialTransformer-Input_GroupNorm449,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm449XnormGroupNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm449XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +450,SpatialTransformer-Proj_in450,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin450einsum,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin450einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+451,BasicTransformerBlock-Input_layernorm451,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm451XnormLayerNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm451XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +452,SelfAttention452-Q-452,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention452Q452MatMulQ,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention452Q452MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +452,SelfAttention452-K-452,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention452K452MatMulK,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention452K452MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 
40]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +452,SelfAttention452-V-452,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention452V452MatMulV,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention452V452MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +453,SelfAttention452-FlashAttention-453,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention452FlashAttention453FlashAttention,MXU,1,Compute,65600,65600,1320,0,0,0,0,0,0,0,0,65600,40960,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention452FlashAttention453FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 256]",,4870144,4096,8,4096,4096,40,32768,65600,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16549,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,73.65607024390243,148.86623475609755,0.03484968803176403,0.020117058750823993,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+454,SelfAttention452-Attention_output-454,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention452Attentionoutput454MatMulattnOutputattnAvgWo,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention452Attentionoutput454MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +455,SelfAttention452-Attention_layernorm-455,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention452Attentionlayernorm455YnormLayerNormy,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention452Attentionlayernorm455YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +456,GatedSelfAttention-Linear456,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear456XLinearcontext,MXU,1,Memory,500,30,500,0,0,0,0,0,0,0,0,0,30,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear456XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 
320]]",1,508928,6,1,8,320,768,0,30,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.86432,947.9522705078125,0.0037209302325581397,0.12810165817673141,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +457,GatedSelfAttention-Attn456-Q-457,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456Q457MatMulQ,MXU,1,Compute,1120,1120,687,0,0,0,0,0,0,0,0,1120,136,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn456Q457MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,68,1,4104,320,320,0,1120,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,750.4457142857143,4538.467952183315,0.35506644518272423,0.6133064800247723,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +457,GatedSelfAttention-Attn456-K-457,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456K457MatMulK,MXU,1,Compute,1120,1120,687,0,0,0,0,0,0,0,0,1120,136,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn456K457MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,68,1,4104,320,320,0,1120,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,750.4457142857143,4538.467952183315,0.35506644518272423,0.6133064800247723,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+457,GatedSelfAttention-Attn456-V-457,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456V457MatMulV,MXU,1,Compute,1120,1120,687,0,0,0,0,0,0,0,0,1120,136,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn456V457MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,68,1,4104,320,320,0,1120,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,750.4457142857143,4538.467952183315,0.35506644518272423,0.6133064800247723,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +458,GatedSelfAttention-Attn456-FlashAttention-458,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn456FlashAttention458FlashAttention,MXU,1,Compute,74048,74048,1323,0,0,0,0,0,0,0,0,74048,42144,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn456FlashAttention458FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 256]",,4879616,4624,8,4104,4104,40,32896,74048,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,18661,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,65.50792739844425,132.1399428253042,0.030994469646338767,0.01785674903044651,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+459,GatedSelfAttention-Attn456-Attention_output-459,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456Attentionoutput459MatMulattnOutputattnAvgWo,MXU,1,Compute,1120,1120,687,0,0,0,0,0,0,0,0,1120,136,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn456Attentionoutput459MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,68,1,4104,320,320,0,1120,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,750.4457142857143,4538.467952183315,0.35506644518272423,0.6133064800247723,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +460,GatedSelfAttention-Attn456-Attention_layernorm-460,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn456Attentionlayernorm460YnormLayerNormy,VPU,1,Memory,662,642,662,0,0,0,0,0,0,0,0,0,642,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn456Attentionlayernorm460YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,642,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,235,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.870453172205437,7390.255654326378,0.968655589123867,0.9986831965305917,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +461,GatedSelfAttention-FFN456Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN456FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,2592,2592,1753,0,0,0,0,0,0,0,0,2592,320,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN456FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 
320], [320, 1280], [1, 4096, 1280]]",1,13926400,160,1,4096,1280,320,0,2592,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1294.5382716049382,5003.846721884645,0.6124988037132739,0.6761955029573845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +462,GatedSelfAttention-FFN456Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN456FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,2592,2592,1753,0,0,0,0,0,0,0,0,2592,320,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN456FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,13926400,160,1,4096,320,1280,0,2592,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1294.5382716049382,5003.846721884645,0.6124988037132739,0.6761955029573845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +463,GatedSelfAttention-FFN456Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN456FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,660,80,660,0,0,0,0,0,0,0,0,0,80,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN456FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,80,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,94,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9859393939393941,7398.200757575758,0.12121212121212122,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+464,BasicTransformerBlock-Fuser_output_layernorm464,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm464XnormLayerNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm464XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +465,CrossAttention465-Q-465,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention465Q465MatMulQ,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention465Q465MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +465,CrossAttention465-K-465,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention465K465MatMulK,MXU,1,Memory,500,224,500,0,0,0,0,0,0,0,0,224,24,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention465K465MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 
40]]",1,1605632,12,1,512,320,768,0,224,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,112,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,503.31648,2990.72265625,0.23813953488372094,0.40415171030405406,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +465,CrossAttention465-V-465,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention465V465MatMulV,MXU,1,Memory,500,224,500,0,0,0,0,0,0,0,0,224,24,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention465V465MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,12,1,512,320,768,0,224,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,112,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,503.31648,2990.72265625,0.23813953488372094,0.40415171030405406,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +466,CrossAttention465-FlashAttention-466,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention465FlashAttention466FlashAttention,MXU,1,Compute,8256,8256,743,0,0,0,0,0,0,0,0,8256,5120,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention465FlashAttention466FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 256]",,626688,512,8,512,4096,40,4096,8256,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2147,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,73.15646511627907,665.3541742369187,0.03461330448891292,0.08991272624823225,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+467,CrossAttention465-Attention_output-467,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention465Attentionoutput467MatMulattnOutputattnAvgWo,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention465Attentionoutput467MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +468,CrossAttention465-Attention_layernorm-468,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention465Attentionlayernorm468YnormLayerNormy,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention465Attentionlayernorm468YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+469,BasicTransformerBlock-Attn_output_layernorm469,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm469XnormLayerNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm469XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +470,BasicTransformerBlock-FFN470Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN470FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,2592,2592,1753,0,0,0,0,0,0,0,0,2592,320,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN470FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,160,1,4096,1280,320,0,2592,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1294.5382716049382,5003.846721884645,0.6124988037132739,0.6761955029573845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +471,BasicTransformerBlock-FFN470Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN470FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,2592,2592,1753,0,0,0,0,0,0,0,0,2592,320,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN470FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 
1280], [1280, 320], [1, 4096, 320]]",1,13926400,160,1,4096,320,1280,0,2592,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1294.5382716049382,5003.846721884645,0.6124988037132739,0.6761955029573845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +472,BasicTransformerBlock-FFN470Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN470FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,660,80,660,0,0,0,0,0,0,0,0,0,80,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN470FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,80,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,94,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9859393939393941,7398.200757575758,0.12121212121212122,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +473,SpatialTransformer-Proj_out473,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout473einsum,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout473einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+474,Time-Embed-MLP-Einsum474,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum474einsum,MXU,1,Memory,500,192,500,0,0,0,0,0,0,0,0,192,20,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum474einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 320]]",1,822400,10,1,1,320,1280,0,192,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.6384,1531.839370727539,0.0007751937984496124,0.20700532036858635,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +475,Conv2d-GroupNorm475,"GroupNorm(x=1x640x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm475XnormGroupNormX,VPU,1,Memory,1320,1280,1320,0,0,0,0,0,0,0,0,0,1280,0,0,10485760,"DT_BFLOAT16:[1,640,64,64]","[DT_BFLOAT16:(1,640,64,64)]",20971520,Conv2dGroupNorm475XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1280,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,469,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +476,Conv2d475Conv2d,"Conv2D(a=1x640x64x64,b=640x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d475Conv2dconv2d,MXU,1,Compute,11808,11808,1454,0,0,0,0,0,0,0,0,11808,1472,0,0,11550720,"DT_BFLOAT16:[1,640,64,64],DT_BFLOAT16:[640,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",15099494400,Conv2d475Conv2dconv2d,Conv2D,3686400,[],Conv2D,bf01;io01->bf01,"[[1, 640, 64, 64], [640, 320, 3, 3], [1, 320, 64, 
64]]",1,11883520,736,1,320,4096,5760,0,11808,11550720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3116,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1278.751219512195,911.0303429084096,0.6050293061070144,0.12311220850113642,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +477,Conv2d-GroupNorm477,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm477XnormGroupNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm477XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +478,Conv2d477Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d477Conv2dconv2d,MXU,1,Compute,6176,6176,892,0,0,0,0,0,0,0,0,6176,768,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d477Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,384,1,320,4096,2880,0,6176,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1644,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1222.4331606217618,1068.5599529681429,0.5783829377033378,0.14439999364434364,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+479,SkipConnection-Einsum474,"XlaEinsum(a=1x64x64x640,b=640x320,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum474einsum,MXU,1,Compute,1568,1568,1042,0,0,0,0,0,0,0,0,1568,192,0,0,8273920,"DT_BFLOAT16:[1,64,64,640],DT_BFLOAT16:[640,320]","[DT_BFLOAT16:(1,64,64,320)]",1677721600,SkipConnectionEinsum474einsum,Einsum,409600,[],Einsum,"BHWC,CO->BHWO","[[1, 64, 64, 640], [640, 320], [1, 64, 64, 320]]",1,8273920,96,1,4096,320,640,0,1568,8273920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,509,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1069.9755102040817,4914.3421406648595,0.5062490112324,0.6641002892790351,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +480,SpatialTransformer-Input_GroupNorm480,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm480XnormGroupNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm480XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +481,SpatialTransformer-Proj_in481,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin481einsum,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin481einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 
320]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +482,BasicTransformerBlock-Input_layernorm482,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm482XnormLayerNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm482XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +483,SelfAttention483-Q-483,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention483Q483MatMulQ,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention483Q483MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+483,SelfAttention483-K-483,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention483K483MatMulK,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention483K483MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +483,SelfAttention483-V-483,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention483V483MatMulV,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention483V483MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +484,SelfAttention483-FlashAttention-484,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention483FlashAttention484FlashAttention,MXU,1,Compute,65600,65600,1320,0,0,0,0,0,0,0,0,65600,40960,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention483FlashAttention484FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 
256]",,4870144,4096,8,4096,4096,40,32768,65600,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16549,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,73.65607024390243,148.86623475609755,0.03484968803176403,0.020117058750823993,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +485,SelfAttention483-Attention_output-485,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention483Attentionoutput485MatMulattnOutputattnAvgWo,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention483Attentionoutput485MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +486,SelfAttention483-Attention_layernorm-486,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention483Attentionlayernorm486YnormLayerNormy,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention483Attentionlayernorm486YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+487,GatedSelfAttention-Linear487,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear487XLinearcontext,MXU,1,Memory,500,30,500,0,0,0,0,0,0,0,0,0,30,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear487XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 320]]",1,508928,6,1,8,320,768,0,30,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.86432,947.9522705078125,0.0037209302325581397,0.12810165817673141,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +488,GatedSelfAttention-Attn487-Q-488,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487Q488MatMulQ,MXU,1,Compute,1120,1120,687,0,0,0,0,0,0,0,0,1120,136,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn487Q488MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,68,1,4104,320,320,0,1120,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,750.4457142857143,4538.467952183315,0.35506644518272423,0.6133064800247723,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +488,GatedSelfAttention-Attn487-K-488,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487K488MatMulK,MXU,1,Compute,1120,1120,687,0,0,0,0,0,0,0,0,1120,136,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn487K488MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 
40]]",1,5457920,68,1,4104,320,320,0,1120,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,750.4457142857143,4538.467952183315,0.35506644518272423,0.6133064800247723,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +488,GatedSelfAttention-Attn487-V-488,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487V488MatMulV,MXU,1,Compute,1120,1120,687,0,0,0,0,0,0,0,0,1120,136,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn487V488MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,68,1,4104,320,320,0,1120,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,750.4457142857143,4538.467952183315,0.35506644518272423,0.6133064800247723,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +489,GatedSelfAttention-Attn487-FlashAttention-489,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn487FlashAttention489FlashAttention,MXU,1,Compute,74048,74048,1323,0,0,0,0,0,0,0,0,74048,42144,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn487FlashAttention489FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 256]",,4879616,4624,8,4104,4104,40,32896,74048,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,18661,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,65.50792739844425,132.1399428253042,0.030994469646338767,0.01785674903044651,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+490,GatedSelfAttention-Attn487-Attention_output-490,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487Attentionoutput490MatMulattnOutputattnAvgWo,MXU,1,Compute,1120,1120,687,0,0,0,0,0,0,0,0,1120,136,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn487Attentionoutput490MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,68,1,4104,320,320,0,1120,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,750.4457142857143,4538.467952183315,0.35506644518272423,0.6133064800247723,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +491,GatedSelfAttention-Attn487-Attention_layernorm-491,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn487Attentionlayernorm491YnormLayerNormy,VPU,1,Memory,662,642,662,0,0,0,0,0,0,0,0,0,642,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn487Attentionlayernorm491YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,642,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,235,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.870453172205437,7390.255654326378,0.968655589123867,0.9986831965305917,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +492,GatedSelfAttention-FFN487Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN487FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,2592,2592,1753,0,0,0,0,0,0,0,0,2592,320,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN487FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 
320], [320, 1280], [1, 4096, 1280]]",1,13926400,160,1,4096,1280,320,0,2592,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1294.5382716049382,5003.846721884645,0.6124988037132739,0.6761955029573845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +493,GatedSelfAttention-FFN487Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN487FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,2592,2592,1753,0,0,0,0,0,0,0,0,2592,320,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN487FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,13926400,160,1,4096,320,1280,0,2592,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1294.5382716049382,5003.846721884645,0.6124988037132739,0.6761955029573845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +494,GatedSelfAttention-FFN487Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN487FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,660,80,660,0,0,0,0,0,0,0,0,0,80,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN487FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,80,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,94,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9859393939393941,7398.200757575758,0.12121212121212122,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+495,BasicTransformerBlock-Fuser_output_layernorm495,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm495XnormLayerNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm495XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +496,CrossAttention496-Q-496,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention496Q496MatMulQ,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention496Q496MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +496,CrossAttention496-K-496,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention496K496MatMulK,MXU,1,Memory,500,224,500,0,0,0,0,0,0,0,0,224,24,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention496K496MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 
40]]",1,1605632,12,1,512,320,768,0,224,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,112,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,503.31648,2990.72265625,0.23813953488372094,0.40415171030405406,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +496,CrossAttention496-V-496,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention496V496MatMulV,MXU,1,Memory,500,224,500,0,0,0,0,0,0,0,0,224,24,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention496V496MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,12,1,512,320,768,0,224,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,112,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,503.31648,2990.72265625,0.23813953488372094,0.40415171030405406,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +497,CrossAttention496-FlashAttention-497,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention496FlashAttention497FlashAttention,MXU,1,Compute,8256,8256,743,0,0,0,0,0,0,0,0,8256,5120,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention496FlashAttention497FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 256]",,626688,512,8,512,4096,40,4096,8256,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2147,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,73.15646511627907,665.3541742369187,0.03461330448891292,0.08991272624823225,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+498,CrossAttention496-Attention_output-498,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention496Attentionoutput498MatMulattnOutputattnAvgWo,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention496Attentionoutput498MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +499,CrossAttention496-Attention_layernorm-499,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention496Attentionlayernorm499YnormLayerNormy,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention496Attentionlayernorm499YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+500,BasicTransformerBlock-Attn_output_layernorm500,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm500XnormLayerNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm500XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +501,BasicTransformerBlock-FFN501Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN501FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,2592,2592,1753,0,0,0,0,0,0,0,0,2592,320,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN501FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,160,1,4096,1280,320,0,2592,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1294.5382716049382,5003.846721884645,0.6124988037132739,0.6761955029573845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +502,BasicTransformerBlock-FFN501Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN501FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,2592,2592,1753,0,0,0,0,0,0,0,0,2592,320,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN501FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 
1280], [1280, 320], [1, 4096, 320]]",1,13926400,160,1,4096,320,1280,0,2592,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1294.5382716049382,5003.846721884645,0.6124988037132739,0.6761955029573845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +503,BasicTransformerBlock-FFN501Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN501FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,660,80,660,0,0,0,0,0,0,0,0,0,80,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN501FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,80,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,94,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9859393939393941,7398.200757575758,0.12121212121212122,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +504,SpatialTransformer-Proj_out504,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout504einsum,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout504einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+505,Time-Embed-MLP-Einsum505,"XlaEinsum(a=1x1280,b=1280x320,eq=BT;TD->BD,memory_placements=0_0_0,type=DT_BFLOAT16)",TimeEmbedMLPEinsum505einsum,MXU,1,Memory,500,192,500,0,0,0,0,0,0,0,0,192,20,0,0,822400,"DT_BFLOAT16:[1,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,320)]",819200,TimeEmbedMLPEinsum505einsum,Einsum,819200,[],Einsum,"BT,TD->BD","[[1, 1280], [1280, 320], [1, 320]]",1,822400,10,1,1,320,1280,0,192,822400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.6384,1531.839370727539,0.0007751937984496124,0.20700532036858635,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +506,Conv2d-GroupNorm506,"GroupNorm(x=1x640x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm506XnormGroupNormX,VPU,1,Memory,1320,1280,1320,0,0,0,0,0,0,0,0,0,1280,0,0,10485760,"DT_BFLOAT16:[1,640,64,64]","[DT_BFLOAT16:(1,640,64,64)]",20971520,Conv2dGroupNorm506XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,1280,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,469,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +507,Conv2d506Conv2d,"Conv2D(a=1x640x64x64,b=640x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d506Conv2dconv2d,MXU,1,Compute,11808,11808,1454,0,0,0,0,0,0,0,0,11808,1472,0,0,11550720,"DT_BFLOAT16:[1,640,64,64],DT_BFLOAT16:[640,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",15099494400,Conv2d506Conv2dconv2d,Conv2D,3686400,[],Conv2D,bf01;io01->bf01,"[[1, 640, 64, 64], [640, 320, 3, 3], [1, 320, 64, 
64]]",1,11883520,736,1,320,4096,5760,0,11808,11550720,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3116,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1278.751219512195,911.0303429084096,0.6050293061070144,0.12311220850113642,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +508,Conv2d-GroupNorm508,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Conv2dGroupNorm508XnormGroupNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Conv2dGroupNorm508XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +509,Conv2d508Conv2d,"Conv2D(a=1x320x64x64,b=320x320x3x3,c=1x320x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Conv2d508Conv2dconv2d,MXU,1,Compute,6176,6176,892,0,0,0,0,0,0,0,0,6176,768,0,0,7086080,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,320,3,3]","[DT_BFLOAT16:(1,320,64,64)]",7549747200,Conv2d508Conv2dconv2d,Conv2D,1843200,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 320, 3, 3], [1, 320, 64, 64]]",1,7252480,384,1,320,4096,2880,0,6176,7086080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1644,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1222.4331606217618,1068.5599529681429,0.5783829377033378,0.14439999364434364,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+510,SkipConnection-Einsum505,"XlaEinsum(a=1x64x64x640,b=640x320,eq=BHWC;CO->BHWO,memory_placements=0_0_0,type=DT_BFLOAT16)",SkipConnectionEinsum505einsum,MXU,1,Compute,1568,1568,1042,0,0,0,0,0,0,0,0,1568,192,0,0,8273920,"DT_BFLOAT16:[1,64,64,640],DT_BFLOAT16:[640,320]","[DT_BFLOAT16:(1,64,64,320)]",1677721600,SkipConnectionEinsum505einsum,Einsum,409600,[],Einsum,"BHWC,CO->BHWO","[[1, 64, 64, 640], [640, 320], [1, 64, 64, 320]]",1,8273920,96,1,4096,320,640,0,1568,8273920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,509,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1069.9755102040817,4914.3421406648595,0.5062490112324,0.6641002892790351,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +511,SpatialTransformer-Input_GroupNorm511,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",SpatialTransformerInputGroupNorm511XnormGroupNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,SpatialTransformerInputGroupNorm511XnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +512,SpatialTransformer-Proj_in512,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjin512einsum,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjin512einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 
320]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +513,BasicTransformerBlock-Input_layernorm513,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockInputlayernorm513XnormLayerNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockInputlayernorm513XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +514,SelfAttention514-Q-514,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention514Q514MatMulQ,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention514Q514MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+514,SelfAttention514-K-514,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention514K514MatMulK,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention514K514MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +514,SelfAttention514-V-514,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention514V514MatMulV,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,SelfAttention514V514MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +515,SelfAttention514-FlashAttention-515,"FlashAttention(q=1x4096x8x40,k=1x4096x8x40,v=1x4096x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",SelfAttention514FlashAttention515FlashAttention,MXU,1,Compute,65600,65600,1320,0,0,0,0,0,0,0,0,65600,40960,0,0,10485760,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,4096,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",4831838208,SelfAttention514FlashAttention515FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 
256]",,4870144,4096,8,4096,4096,40,32768,65600,10485760,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16549,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,73.65607024390243,148.86623475609755,0.03484968803176403,0.020117058750823993,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +516,SelfAttention514-Attention_output-516,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",SelfAttention514Attentionoutput516MatMulattnOutputattnAvgWo,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SelfAttention514Attentionoutput516MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +517,SelfAttention514-Attention_layernorm-517,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",SelfAttention514Attentionlayernorm517YnormLayerNormy,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,SelfAttention514Attentionlayernorm517YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+518,GatedSelfAttention-Linear518,"XlaEinsum(a=1x8x768,b=768x320,eq=BLM;MD->BLD,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionLinear518XLinearcontext,MXU,1,Memory,500,30,500,0,0,0,0,0,0,0,0,0,30,0,0,508928,"DT_BFLOAT16:[1,8,768],DT_BFLOAT16:[768,320]","[DT_BFLOAT16:(1,8,320)]",3932160,GatedSelfAttentionLinear518XLinearcontext,Einsum,491520,[],Einsum,"BLM,MD->BLD","[[1, 8, 768], [768, 320], [1, 8, 320]]",1,508928,6,1,8,320,768,0,30,508928,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.86432,947.9522705078125,0.0037209302325581397,0.12810165817673141,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +519,GatedSelfAttention-Attn518-Q-519,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518Q519MatMulQ,MXU,1,Compute,1120,1120,687,0,0,0,0,0,0,0,0,1120,136,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn518Q519MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,68,1,4104,320,320,0,1120,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,750.4457142857143,4538.467952183315,0.35506644518272423,0.6133064800247723,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +519,GatedSelfAttention-Attn518-K-519,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518K519MatMulK,MXU,1,Compute,1120,1120,687,0,0,0,0,0,0,0,0,1120,136,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn518K519MatMulK,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 
40]]",1,5457920,68,1,4104,320,320,0,1120,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,750.4457142857143,4538.467952183315,0.35506644518272423,0.6133064800247723,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +519,GatedSelfAttention-Attn518-V-519,"XlaEinsum(a=1x4104x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518V519MatMulV,MXU,1,Compute,1120,1120,687,0,0,0,0,0,0,0,0,1120,136,0,0,5457920,"DT_BFLOAT16:[1,4104,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4104,8,40)]",840499200,GatedSelfAttentionAttn518V519MatMulV,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4104, 320], [320, 8, 40], [1, 4104, 8, 40]]",1,5457920,68,1,4104,320,320,0,1120,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,750.4457142857143,4538.467952183315,0.35506644518272423,0.6133064800247723,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +520,GatedSelfAttention-Attn518-FlashAttention-520,"FlashAttention(q=1x4104x8x40,k=1x4104x8x40,v=1x4104x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",GatedSelfAttentionAttn518FlashAttention520FlashAttention,MXU,1,Compute,74048,74048,1323,0,0,0,0,0,0,0,0,74048,42144,0,0,10506240,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[1,4104,8,40]","[DT_BFLOAT16:(1,4104,8,8)]",4850731008,GatedSelfAttentionAttn518FlashAttention520FlashAttention,FlashAttention,0,[],FlashAttention,,"[4104, 256]",,4879616,4624,8,4104,4104,40,32896,74048,10506240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,18661,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,65.50792739844425,132.1399428253042,0.030994469646338767,0.01785674903044651,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+521,GatedSelfAttention-Attn518-Attention_output-521,"XlaEinsum(a=1x4104x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518Attentionoutput521MatMulattnOutputattnAvgWo,MXU,1,Compute,1120,1120,687,0,0,0,0,0,0,0,0,1120,136,0,0,5457920,"DT_BFLOAT16:[1,4104,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4104,320)]",840499200,GatedSelfAttentionAttn518Attentionoutput521MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4104, 8, 40], [8, 40, 320], [1, 4104, 320]]",1,5457920,68,1,4104,320,320,0,1120,5457920,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,750.4457142857143,4538.467952183315,0.35506644518272423,0.6133064800247723,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +522,GatedSelfAttention-Attn518-Attention_layernorm-522,"LayerNorm(x=1x4104x320,memory_placements=0_0,type=DT_BFLOAT16)",GatedSelfAttentionAttn518Attentionlayernorm522YnormLayerNormy,VPU,1,Memory,662,642,662,0,0,0,0,0,0,0,0,0,642,0,0,5253120,"DT_BFLOAT16:[1,4104,320]","[DT_BFLOAT16:(1,4104,320)]",10506240,GatedSelfAttentionAttn518Attentionlayernorm522YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,642,5253120,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,235,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.870453172205437,7390.255654326378,0.968655589123867,0.9986831965305917,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +523,GatedSelfAttention-FFN518Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN518FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,2592,2592,1753,0,0,0,0,0,0,0,0,2592,320,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,GatedSelfAttentionFFN518FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 
320], [320, 1280], [1, 4096, 1280]]",1,13926400,160,1,4096,1280,320,0,2592,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1294.5382716049382,5003.846721884645,0.6124988037132739,0.6761955029573845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +524,GatedSelfAttention-FFN518Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN518FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,2592,2592,1753,0,0,0,0,0,0,0,0,2592,320,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,GatedSelfAttentionFFN518FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1280], [1280, 320], [1, 4096, 320]]",1,13926400,160,1,4096,320,1280,0,2592,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1294.5382716049382,5003.846721884645,0.6124988037132739,0.6761955029573845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +525,GatedSelfAttention-FFN518Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",GatedSelfAttentionFFN518FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,660,80,660,0,0,0,0,0,0,0,0,0,80,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,GatedSelfAttentionFFN518FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,80,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,94,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9859393939393941,7398.200757575758,0.12121212121212122,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+526,BasicTransformerBlock-Fuser_output_layernorm526,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockFuseroutputlayernorm526XnormLayerNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockFuseroutputlayernorm526XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +527,CrossAttention527-Q-527,"XlaEinsum(a=1x4096x320,b=320x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention527Q527MatMulQ,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,8,40]","[DT_BFLOAT16:(1,4096,8,40)]",838860800,CrossAttention527Q527MatMulQ,Einsum,204800,[],Einsum,"BLM,MND->BLND","[[1, 4096, 320], [320, 8, 40], [1, 4096, 8, 40]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +527,CrossAttention527-K-527,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention527K527MatMulK,MXU,1,Memory,500,224,500,0,0,0,0,0,0,0,0,224,24,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention527K527MatMulK,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 
40]]",1,1605632,12,1,512,320,768,0,224,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,112,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,503.31648,2990.72265625,0.23813953488372094,0.40415171030405406,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +527,CrossAttention527-V-527,"XlaEinsum(a=1x512x768,b=768x8x40,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention527V527MatMulV,MXU,1,Memory,500,224,500,0,0,0,0,0,0,0,0,224,24,0,0,1605632,"DT_BFLOAT16:[1,512,768],DT_BFLOAT16:[768,8,40]","[DT_BFLOAT16:(1,512,8,40)]",251658240,CrossAttention527V527MatMulV,Einsum,491520,[],Einsum,"BLM,MND->BLND","[[1, 512, 768], [768, 8, 40], [1, 512, 8, 40]]",1,1605632,12,1,512,320,768,0,224,1605632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,112,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,503.31648,2990.72265625,0.23813953488372094,0.40415171030405406,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +528,CrossAttention527-FlashAttention-528,"FlashAttention(q=1x4096x8x40,k=1x512x8x40,v=1x512x8x40,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",CrossAttention527FlashAttention528FlashAttention,MXU,1,Compute,8256,8256,743,0,0,0,0,0,0,0,0,8256,5120,0,0,5898240,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[1,512,8,40],DT_BFLOAT16:[1,512,8,40]","[DT_BFLOAT16:(1,4096,8,8)]",603979776,CrossAttention527FlashAttention528FlashAttention,FlashAttention,0,[],FlashAttention,,"[512, 256]",,626688,512,8,512,4096,40,4096,8256,5898240,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2147,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,73.15646511627907,665.3541742369187,0.03461330448891292,0.08991272624823225,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+529,CrossAttention527-Attention_output-529,"XlaEinsum(a=1x4096x8x40,b=8x40x320,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",CrossAttention527Attentionoutput529MatMulattnOutputattnAvgWo,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,8,40],DT_BFLOAT16:[8,40,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,CrossAttention527Attentionoutput529MatMulattnOutputattnAvgWo,Einsum,204800,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 8, 40], [8, 40, 320], [1, 4096, 320]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +530,CrossAttention527-Attention_layernorm-530,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",CrossAttention527Attentionlayernorm530YnormLayerNormy,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,CrossAttention527Attentionlayernorm530YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+531,BasicTransformerBlock-Attn_output_layernorm531,"LayerNorm(x=1x4096x320,memory_placements=0_0,type=DT_BFLOAT16)",BasicTransformerBlockAttnoutputlayernorm531XnormLayerNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",10485760,BasicTransformerBlockAttnoutputlayernorm531XnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +532,BasicTransformerBlock-FFN532Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x320,b=320x1280,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN532FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,MXU,1,Compute,2592,2592,1753,0,0,0,0,0,0,0,0,2592,320,0,0,13926400,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,1280]","[DT_BFLOAT16:(1,4096,1280)]",3355443200,BasicTransformerBlockFFN532FwdFFNencoderFFinputMatMulh2ynorm2WFFi1,Einsum,819200,[],Einsum,"BLM,MH->BLH","[[1, 4096, 320], [320, 1280], [1, 4096, 1280]]",1,13926400,160,1,4096,1280,320,0,2592,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1294.5382716049382,5003.846721884645,0.6124988037132739,0.6761955029573845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +533,BasicTransformerBlock-FFN532Fwd-FFN_encoder-FFinput,"XlaEinsum(a=1x4096x1280,b=1280x320,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN532FwdFFNencoderFFinputMatMulffn2h2WFFo1,MXU,1,Compute,2592,2592,1753,0,0,0,0,0,0,0,0,2592,320,0,0,13926400,"DT_BFLOAT16:[1,4096,1280],DT_BFLOAT16:[1280,320]","[DT_BFLOAT16:(1,4096,320)]",3355443200,BasicTransformerBlockFFN532FwdFFNencoderFFinputMatMulffn2h2WFFo1,Einsum,819200,[],Einsum,"BLH,HM->BLM","[[1, 4096, 
1280], [1280, 320], [1, 4096, 320]]",1,13926400,160,1,4096,320,1280,0,2592,13926400,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,845,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1294.5382716049382,5003.846721884645,0.6124988037132739,0.6761955029573845,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +534,BasicTransformerBlock-FFN532Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x320,b=1x4096x320,memory_placements=0_0_0,type=DT_BFLOAT16)",BasicTransformerBlockFFN532FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,1,Memory,660,80,660,0,0,0,0,0,0,0,0,0,80,0,0,5242880,"DT_BFLOAT16:[1,4096,320]","[DT_BFLOAT16:(1,4096,320)]",1310720,BasicTransformerBlockFFN532FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,80,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,94,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9859393939393941,7398.200757575758,0.12121212121212122,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +535,SpatialTransformer-Proj_out535,"XlaEinsum(a=1x4096x320,b=320x320,eq=BSN;NC->BSC,memory_placements=0_0_0,type=DT_BFLOAT16)",SpatialTransformerProjout535einsum,MXU,1,Compute,1056,1056,686,0,0,0,0,0,0,0,0,1056,128,0,0,5447680,"DT_BFLOAT16:[1,4096,320],DT_BFLOAT16:[320,320]","[DT_BFLOAT16:(1,4096,320)]",838860800,SpatialTransformerProjout535einsum,Einsum,204800,[],Einsum,"BSN,NC->BSC","[[1, 4096, 320], [320, 320], [1, 4096, 320]]",1,5447680,64,1,4096,320,320,0,1056,5447680,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,794.3757575757576,4804.4956091678505,0.3758515386422363,0.6492561634010608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+536,Out536-GroupNorm,"GroupNorm(x=1x320x64x64,memory_placements=0_0,type=DT_BFLOAT16)",Out536GroupNormXnormGroupNormX,VPU,1,Memory,660,640,660,0,0,0,0,0,0,0,0,0,640,0,0,5242880,"DT_BFLOAT16:[1,320,64,64]","[DT_BFLOAT16:(1,320,64,64)]",10485760,Out536GroupNormXnormGroupNormX,GroupNorm,0,[],GroupNorm,,,,,0,,,,,0,640,5242880,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.887515151515153,7398.200757575758,0.9696969696969697,0.9997568591318592,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +537,Out537-Conv2d,"Conv2D(a=1x320x64x64,b=320x3x3x3,c=1x3x64x64,eq=bf01;io01->bf01,window={size=3x3 stride=1x1 pad=0_1x0_1},memory_placements=0_0_0,type=DT_BFLOAT16)",Out537Conv2dconv2d,MXU,1,Compute,3104,3104,500,0,0,0,0,0,0,0,0,3104,384,0,0,2663296,"DT_BFLOAT16:[1,320,64,64],DT_BFLOAT16:[320,3,3,3]","[DT_BFLOAT16:(1,3,64,64)]",70778880,Out537Conv2dconv2d,Conv2D,17280,[],Conv2D,bf01;io01->bf01,"[[1, 320, 64, 64], [320, 3, 3, 3], [1, 3, 64, 64]]",1,2829696,192,1,3,4096,2880,0,3104,2663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,832,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,22.802474226804126,799.0939715473922,0.010788779669144091,0.10798567183072869,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.json b/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.json new file mode 100644 index 0000000..076ada8 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.json @@ -0,0 +1,184 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 1764186, + "overlapped_compute_time_non_pp_ns": 479234, + "compute_only_time_non_pp_ns": 970605, + 
"memory_only_time_non_pp_ns": 314347, + "ici_bound_time_non_pp_ns": 0, + "total_execution_time_chip_ns": 1764186, + "overlapped_compute_time_chip_ns": 479234, + "compute_only_time_chip_ns": 970605, + "memory_only_time_chip_ns": 314347, + "ici_bound_time_chip_ns": 0, + "bounded_by_pp_chip": false, + "throughput_requests_per_sec": 14.170841396542086, + "throughput_step_per_sec_per_request": 566.8336558616835, + "latency_sec": 0.07056744000000001, + "latency_step_sec": 4.410465e-05, + "mem_footprint_GB": 191.99999904632568, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "6p", + "num_sa": 8, + "num_vu": 8, + "num_vu_ports": 8, + "hbm_bw_GBps": 7400.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 256, + "freq_GHz": 2.0, + "sa_dim": 256, + "hbm_size_GB": 192, + "ici_bw_GBps": 300.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 350.0, + "min_power_W": 1.0, + "avg_power_W": 1.0, + "max_power_W": 331.0, + "HBM_GBps_per_W": 123.5, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 1384.0, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 1.777942858, + "static_power_W_per_vu": 0.1554179582, + "static_power_vmem_W": 37.07309859, + "static_power_ici_W": 3.000278571, + "static_power_hbm_mc_W": 7.10422264, + "static_power_hbm_phy_W": 10.65633396, + "static_power_other_W": 41.27610279, + "dynamic_power_W_per_SA": 31.57742933, + "dynamic_power_W_per_VU": 0.7426048, + "dynamic_power_vmem_W": 28.1028608, + "dynamic_power_ici_W_per_GBps": 0.01262060716, + "dynamic_power_hbm_W_per_GBps": 0.008830769231, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "gligen", + "model_type": "gligen", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + 
"num_data_parallel_axes": 3, + "num_tensor_parallel_axes": 0, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/gligen/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv", + "num_diffusion_steps": 1, + "total_num_diffusion_steps": 40, + "image_resolution": [ + 512, + 512 + ], + "image_num_channels": 3, + "use_flash_attention": true, + "fourier_embedder_config": { + "num_freqs": 64 + }, + "text_embedder_config": { + "d_model": 512, + "num_heads": 8, + "d_head": 64, + "d_ff": 2048, + "num_layers": 12, + "ffn_type": "default" + }, + "image_embedder_config": { + "model_type": "vit", + "patch_size": 2, + "d_model": 1024, + "num_heads": 16, + "d_head": 64, + "d_ff": 4096, + "num_layers": 24, + "ffn_type": "default" + }, + "spatial_condition_embedder_config": { + "model_type": "convnext", + "stem": { + "in_channels": 3, + "out_channels": 96, + "kernel_size": 4, + "stride": 4 + }, + "depths": [ + 3, + 3, + 9, + 3 + ], + "dims": [ + 96, + 192, + 384, + 768 + ] + }, + "grounding_input_config": { + "text": { + "input_seqlen": 512, + "feature_dim": 768 + }, + "bbox": { + "input_seqlen": 8, + "feature_dim": 4, + "grounding_token_feature_dim": 768 + }, + "image": { + "resolution": [ + 1024, + 1024 + ], + "image_num_channels": 3 + }, + "keypoint": { + "num_persons": 10, + "num_keypoints": 17, + "feature_dim": 256 + }, + "spatial_condition": { + "resolution": [ + 256, + 256 + ], + "num_channels": 1 + } + }, + "unet_config": { + "noisy_latent_resolution": [ + 64, + 64 + ], + "model_channels": 320, + "attention_resolutions": [ + 4, + 2, + 1 + ], + "num_res_blocks": 2, + "channel_mult": [ + 1, + 2, + 4, + 4 + ], + "num_heads": 8, + "context_dim": 768 + }, + "output_dir": "./llava_ops" + } +} \ No newline at end of file diff --git 
a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv new file mode 100644 index 0000000..3110cb5 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv @@ -0,0 +1,29 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency 
(%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,104167,46812,104167,0,0,0,0,0,0,0,0,0,46812,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,46812,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,22602,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2884860656445898,599.9980800061439,0.44938827624323024,0.9999968000102399,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-Q-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",Q3MatMulQ,MXU,32,Compute,2996115,2996115,208334,0,0,0,0,0,0,0,0,2996115,187245,0,0,134217728,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,Q3MatMulQ,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 32, 64], [1, 4096, 32, 64]]",2,50331648,32768,1,4096,4096,4096,0,2996115,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,396312,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.87238923472563,41.72069496664847,0.9411188048747217,0.06953449161108079,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-K-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",K3MatMulK,MXU,32,Compute,2996115,2996115,208334,0,0,0,0,0,0,0,0,2996115,187245,0,0,134217728,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,K3MatMulK,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 32, 64], [1, 4096, 32, 
64]]",2,50331648,32768,1,4096,4096,4096,0,2996115,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,396312,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.87238923472563,41.72069496664847,0.9411188048747217,0.06953449161108079,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-V-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",V3MatMulV,MXU,32,Compute,2996115,2996115,208334,0,0,0,0,0,0,0,0,2996115,187245,0,0,134217728,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,V3MatMulV,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 32, 64], [1, 4096, 32, 64]]",2,50331648,32768,1,4096,4096,4096,0,2996115,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,396312,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.87238923472563,41.72069496664847,0.9411188048747217,0.06953449161108079,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-FlashAttention-4,"FlashAttention(q=1x4096x32x128,k=1x4096x32x128,v=1x4096x32x128,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",FlashAttention4FlashAttention,MXU,32,Compute,5992230,5992230,208334,0,0,0,0,0,0,0,0,5992230,1123472,0,0,134217728,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,32,32)]",70866960384,FlashAttention4FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,4227072,65536,32,4096,4096,128,748982,5992230,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,770827,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,11.826475349577704,20.860347483324237,0.24263219188176424,0.034767245805540394,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+5,-Attention_output-5,"XlaEinsum(a=1x4096x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",Attentionoutput5MatMulattnOutputattnAvgWo,MXU,32,Compute,2996115,2996115,208334,0,0,0,0,0,0,0,0,2996115,187245,0,0,134217728,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",137438953472,Attentionoutput5MatMulattnOutputattnAvgWo,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 32, 32], [32, 32, 2048], [1, 4096, 2048]]",2,50331648,32768,1,4096,4096,4096,0,2996115,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,396312,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.87238923472563,41.72069496664847,0.9411188048747217,0.06953449161108079,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,-Attention_layernorm-6,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",Attentionlayernorm6YnormLayerNormy,VPU,32,Memory,104167,46812,104167,0,0,0,0,0,0,0,0,0,46812,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,Attentionlayernorm6YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,46812,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,22602,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2884860656445898,599.9980800061439,0.44938827624323024,0.9999968000102399,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,10485943,10485943,572917,0,0,0,0,0,0,0,0,10485943,655360,0,0,369098752,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 
3584]]",4,155189248,114688,1,4096,14336,4096,0,10485943,369098752,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1370688,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.874399388972456,32.78198250743877,0.9411600452372567,0.05463663751239795,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,10485943,10485943,572917,0,0,0,0,0,0,0,0,10485943,655360,0,0,369098752,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 3584]]",4,155189248,114688,1,4096,14336,4096,0,10485943,369098752,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1370688,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.874399388972456,32.78198250743877,0.9411600452372567,0.05463663751239795,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x14336,b=1x4096x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,364584,20480,364584,0,0,0,0,0,0,0,0,0,20480,0,0,234881024,"DT_BFLOAT16:[1,4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",58720256,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,20480,234881024,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,43267,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.1610609790884954,599.9989028591491,0.05617361156825314,0.9999981714319152,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+10,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,10485943,10485943,598959,0,0,0,0,0,0,0,0,10485943,655360,0,0,385875968,"DT_BFLOAT16:[1,4096,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,4096,4096)]",481036337152,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1024], [1024, 2048], [1, 4096, 2048]]",2,50331648,114688,1,4096,4096,14336,0,10485943,385875968,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1373412,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.874399388972456,34.27207262141326,0.9411600452372567,0.05712012103568876,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +11,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x4096,b=1x4096x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,104167,5852,104167,0,0,0,0,0,0,0,0,0,5852,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",16777216,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,5852,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,12362,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16106075820557372,599.9980800061439,0.05617353453040378,0.9999968000102399,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+12,Attention-serving-decode-Input_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeInputlayernormXnormLayerNormX,VPU,16384,Memory,500,12,500,0,0,0,0,0,0,0,0,0,12,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,12,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,55,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.022857142857142857,0.050862630208333336,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,MXU,16384,Memory,52109,11702,52109,0,0,0,0,0,0,0,0,0,11702,0,0,33570816,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,1,32,128)]",33554432,AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 1, 2048], [2048, 32, 128], [1, 1, 32, 128]]",1,8398848,1024,1,1,4096,4096,0,11702,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5452,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6439277667965227,599.9972900854459,0.013210834238702297,0.9999954834757432,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=2x4096x32x128,eq=BLM;TMND->BTLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,MXU,16384,Memory,104205,23405,104205,0,0,0,0,0,0,0,0,0,23405,0,0,67133440,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[2,4096,32,128]","[DT_BFLOAT16:(1,2,1,32,128)]",67108864,AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,Einsum,67108864,[],Einsum,"BLM,TMND->BTLND","[[1, 1, 1024], [2, 1024, 32, 128], [1, 2, 1, 32, 
128]]",1,16795648,2048,1,1,8192,4096,0,23405,67133440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10903,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6440080994194137,599.9989269573797,0.013212482344312422,0.9999982115956328,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x32x128,b=1x4096x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,MXU,16384,Memory,52503,11702,52503,0,0,0,0,0,0,0,0,0,11702,0,0,33824768,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,1,4096,32)]",33554432,AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 1, 1], [1, 4096, 1, 1], [1, 1, 4096, 1]]",32,1057024,1024,32,1,4096,128,0,11702,33824768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5493,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6390955183513323,599.9994289760823,0.013111695738234726,0.9999990482934705,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x32x128,b=1x512x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,MXU,16384,Memory,6574,1462,6574,0,0,0,0,0,0,0,0,0,1462,0,0,4235264,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[1,512,32,128]","[DT_BFLOAT16:(1,1,512,32)]",4194304,AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 32, 128], [1, 512, 32, 128], [1, 1, 512, 32]]",1,132352,128,32,1,512,128,0,1462,4235264,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,687,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.638013994523882,599.9995394974521,0.013089507174941777,0.9999992324957535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+14,Attention-serving-decode-Softmax(Q*K)*V,"Softmax(x=1x1x4608x32,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,16384,Memory,916,206,916,0,0,0,0,0,0,0,0,0,206,0,0,589824,"DT_BFLOAT16:[1,1,4608,32]","[DT_BFLOAT16:(1,1,4608,32)]",589824,AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,206,589824,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,147,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6439126637554585,599.6903998362445,0.22457891453524642,0.9994839997270742,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x4096x32,b=1x4096x32x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,MXU,16384,Memory,52503,11702,52503,0,0,0,0,0,0,0,0,0,11702,0,0,33824768,"DT_BFLOAT16:[1,1,4096,32],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,1,32,128)]",33554432,AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 4096, 1], [1, 4096, 1, 128], [1, 1, 1, 128]]",32,264448,1024,32,1,128,4096,0,11702,33824768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5493,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6390955183513323,599.9994289760823,0.013111695738234726,0.9999990482934705,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x512x32,b=1x512x32x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,MXU,16384,Memory,6574,1462,6574,0,0,0,0,0,0,0,0,0,1462,0,0,4235264,"DT_BFLOAT16:[1,1,512,32],DT_BFLOAT16:[1,512,32,128]","[DT_BFLOAT16:(1,1,32,128)]",4194304,AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 512, 32], [1, 512, 32, 128], [1, 1, 32, 128]]",1,132352,128,32,1,128,512,0,1462,4235264,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,687,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.638013994523882,599.9995394974521,0.013089507174941777,0.9999992324957535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"Add(a=1x1x32x128,b=1x1x32x128,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,16384,"DT_BFLOAT16:[1,1,32,128]","[DT_BFLOAT16:(1,1,32,128)]",4096,AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,Add,0,[],Add,,,,,0,,,,,0,2,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.008192,30.517578125,0.002857142857142857,0.050862630208333336,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Attention-serving-decode-Attention_output,"XlaEinsum(a=1x1x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,16384,Memory,52109,11702,52109,0,0,0,0,0,0,0,0,0,11702,0,0,33570816,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,1,4096)]",33554432,AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 1, 32, 64], [32, 64, 4096], [1, 
1, 4096]]",1,8398848,1024,1,1,4096,4096,0,11702,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5452,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6439277667965227,599.9972900854459,0.013210834238702297,0.9999954834757432,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +16,Attention-serving-decode-Attention_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionlayernormYnormLayerNormy,VPU,16384,Memory,500,12,500,0,0,0,0,0,0,0,0,0,12,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,12,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,55,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.022857142857142857,0.050862630208333336,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,Fwd-FFN-serving-decoder-FFgate,"XlaEinsum(a=1x1x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,16384,Memory,182349,40960,182349,0,0,0,0,0,0,0,0,0,40960,0,0,117477376,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,1,14336)]",117440512,FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 1, 1024], [1024, 14336], [1, 1, 14336]]",1,29390848,3584,1,1,14336,4096,0,40960,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,19079,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6440425338225052,599.9996285989538,0.013213188801177316,0.9999993809982564,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+18,Fwd-FFN-serving-decoder-FFup,"XlaEinsum(a=1x1x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,MXU,16384,Memory,182349,40960,182349,0,0,0,0,0,0,0,0,0,40960,0,0,117477376,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,1,14336)]",117440512,FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 1, 1024], [1024, 14336], [1, 1, 14336]]",1,29390848,3584,1,1,14336,4096,0,40960,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,19079,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6440425338225052,599.9996285989538,0.013213188801177316,0.9999993809982564,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Fwd-FFN-serving-decoder-FFgate_up,"Mul(a=1x1x14336,b=1x1x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,VPU,16384,Memory,500,6,500,0,0,0,0,0,0,0,0,0,6,0,0,57344,"DT_BFLOAT16:[1,1,14336]","[DT_BFLOAT16:(1,1,14336)]",14336,FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,6,57344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.028672,106.8115234375,0.01,0.17801920572916666,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Fwd-FFN-serving-decoder-FFoutput,"XlaEinsum(a=1x1x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,16384,Memory,182349,40960,182349,0,0,0,0,0,0,0,0,0,40960,0,0,117477376,"DT_BFLOAT16:[1,1,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,1,4096)]",117440512,FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 1, 3584], [3584, 4096], [1, 1, 
4096]]",1,8398848,3584,1,1,4096,14336,0,40960,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,19079,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6440425338225052,599.9996285989538,0.013213188801177316,0.9999993809982564,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,FFN-serving-decoder-AttnPlusFFn,"Add(a=1x1x4096,b=1x1x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",4096,FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,Add,0,[],Add,,,,,0,,,,,0,2,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.008192,30.517578125,0.002857142857142857,0.050862630208333336,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_decode.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_decode.csv new file mode 100644 index 0000000..2c3f249 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_decode.csv @@ -0,0 +1,17 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA 
Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +12,Attention-serving-decode-Input_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeInputlayernormXnormLayerNormX,VPU,16384,Memory,500,12,500,0,0,0,0,0,0,0,0,0,12,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,12,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,55,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.022857142857142857,0.050862630208333336,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+13,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,MXU,16384,Memory,52109,11702,52109,0,0,0,0,0,0,0,0,0,11702,0,0,33570816,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,1,32,128)]",33554432,AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 1, 2048], [2048, 32, 128], [1, 1, 32, 128]]",1,8398848,1024,1,1,4096,4096,0,11702,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5452,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6439277667965227,599.9972900854459,0.013210834238702297,0.9999954834757432,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=2x4096x32x128,eq=BLM;TMND->BTLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,MXU,16384,Memory,104205,23405,104205,0,0,0,0,0,0,0,0,0,23405,0,0,67133440,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[2,4096,32,128]","[DT_BFLOAT16:(1,2,1,32,128)]",67108864,AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,Einsum,67108864,[],Einsum,"BLM,TMND->BTLND","[[1, 1, 1024], [2, 1024, 32, 128], [1, 2, 1, 32, 128]]",1,16795648,2048,1,1,8192,4096,0,23405,67133440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10903,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6440080994194137,599.9989269573797,0.013212482344312422,0.9999982115956328,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x32x128,b=1x4096x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,MXU,16384,Memory,52503,11702,52503,0,0,0,0,0,0,0,0,0,11702,0,0,33824768,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,1,4096,32)]",33554432,AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 1, 1], [1, 4096, 1, 1], [1, 1, 4096, 1]]",32,1057024,1024,32,1,4096,128,0,11702,33824768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5493,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6390955183513323,599.9994289760823,0.013111695738234726,0.9999990482934705,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x32x128,b=1x512x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,MXU,16384,Memory,6574,1462,6574,0,0,0,0,0,0,0,0,0,1462,0,0,4235264,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[1,512,32,128]","[DT_BFLOAT16:(1,1,512,32)]",4194304,AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 32, 128], [1, 512, 32, 128], [1, 1, 512, 32]]",1,132352,128,32,1,512,128,0,1462,4235264,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,687,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.638013994523882,599.9995394974521,0.013089507174941777,0.9999992324957535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+14,Attention-serving-decode-Softmax(Q*K)*V,"Softmax(x=1x1x4608x32,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,16384,Memory,916,206,916,0,0,0,0,0,0,0,0,0,206,0,0,589824,"DT_BFLOAT16:[1,1,4608,32]","[DT_BFLOAT16:(1,1,4608,32)]",589824,AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,206,589824,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,147,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6439126637554585,599.6903998362445,0.22457891453524642,0.9994839997270742,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x4096x32,b=1x4096x32x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,MXU,16384,Memory,52503,11702,52503,0,0,0,0,0,0,0,0,0,11702,0,0,33824768,"DT_BFLOAT16:[1,1,4096,32],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,1,32,128)]",33554432,AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 4096, 1], [1, 4096, 1, 128], [1, 1, 1, 128]]",32,264448,1024,32,1,128,4096,0,11702,33824768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5493,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6390955183513323,599.9994289760823,0.013111695738234726,0.9999990482934705,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x512x32,b=1x512x32x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,MXU,16384,Memory,6574,1462,6574,0,0,0,0,0,0,0,0,0,1462,0,0,4235264,"DT_BFLOAT16:[1,1,512,32],DT_BFLOAT16:[1,512,32,128]","[DT_BFLOAT16:(1,1,32,128)]",4194304,AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 512, 32], [1, 512, 32, 128], [1, 1, 32, 128]]",1,132352,128,32,1,128,512,0,1462,4235264,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,687,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.638013994523882,599.9995394974521,0.013089507174941777,0.9999992324957535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"Add(a=1x1x32x128,b=1x1x32x128,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,16384,"DT_BFLOAT16:[1,1,32,128]","[DT_BFLOAT16:(1,1,32,128)]",4096,AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,Add,0,[],Add,,,,,0,,,,,0,2,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.008192,30.517578125,0.002857142857142857,0.050862630208333336,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Attention-serving-decode-Attention_output,"XlaEinsum(a=1x1x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,16384,Memory,52109,11702,52109,0,0,0,0,0,0,0,0,0,11702,0,0,33570816,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,1,4096)]",33554432,AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 1, 32, 64], [32, 64, 4096], [1, 
1, 4096]]",1,8398848,1024,1,1,4096,4096,0,11702,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5452,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6439277667965227,599.9972900854459,0.013210834238702297,0.9999954834757432,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +16,Attention-serving-decode-Attention_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionlayernormYnormLayerNormy,VPU,16384,Memory,500,12,500,0,0,0,0,0,0,0,0,0,12,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,12,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,55,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.022857142857142857,0.050862630208333336,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,Fwd-FFN-serving-decoder-FFgate,"XlaEinsum(a=1x1x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,16384,Memory,182349,40960,182349,0,0,0,0,0,0,0,0,0,40960,0,0,117477376,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,1,14336)]",117440512,FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 1, 1024], [1024, 14336], [1, 1, 14336]]",1,29390848,3584,1,1,14336,4096,0,40960,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,19079,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6440425338225052,599.9996285989538,0.013213188801177316,0.9999993809982564,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+18,Fwd-FFN-serving-decoder-FFup,"XlaEinsum(a=1x1x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,MXU,16384,Memory,182349,40960,182349,0,0,0,0,0,0,0,0,0,40960,0,0,117477376,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,1,14336)]",117440512,FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 1, 1024], [1024, 14336], [1, 1, 14336]]",1,29390848,3584,1,1,14336,4096,0,40960,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,19079,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6440425338225052,599.9996285989538,0.013213188801177316,0.9999993809982564,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Fwd-FFN-serving-decoder-FFgate_up,"Mul(a=1x1x14336,b=1x1x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,VPU,16384,Memory,500,6,500,0,0,0,0,0,0,0,0,0,6,0,0,57344,"DT_BFLOAT16:[1,1,14336]","[DT_BFLOAT16:(1,1,14336)]",14336,FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,6,57344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.028672,106.8115234375,0.01,0.17801920572916666,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Fwd-FFN-serving-decoder-FFoutput,"XlaEinsum(a=1x1x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,16384,Memory,182349,40960,182349,0,0,0,0,0,0,0,0,0,40960,0,0,117477376,"DT_BFLOAT16:[1,1,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,1,4096)]",117440512,FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 1, 3584], [3584, 4096], [1, 1, 
4096]]",1,8398848,3584,1,1,4096,14336,0,40960,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,19079,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6440425338225052,599.9996285989538,0.013213188801177316,0.9999993809982564,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,FFN-serving-decoder-AttnPlusFFn,"Add(a=1x1x4096,b=1x1x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",4096,FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,Add,0,[],Add,,,,,0,,,,,0,2,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.008192,30.517578125,0.002857142857142857,0.050862630208333336,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_decode.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_decode.json new file mode 100644 index 0000000..c11ceb5 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_decode.json @@ -0,0 +1,98 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 14369423360, + "overlapped_compute_time_non_pp_ns": 3215474688, + "compute_only_time_non_pp_ns": 0, + "memory_only_time_non_pp_ns": 11153948672, + "ici_bound_time_non_pp_ns": 0, + "total_execution_time_chip_ns": 14369423360, + "overlapped_compute_time_chip_ns": 3215474688, + "compute_only_time_chip_ns": 0, + "memory_only_time_chip_ns": 11153948672, + "ici_bound_time_chip_ns": 0, + "bounded_by_pp_chip": false, + "TPOT_ms_request": 28.06528, + "throughput_tokens_per_sec": 35.631214083736204, + 
"throughput_tokens_per_sec_request": 35.631214083736204, + "mem_footprint_GB": 22.5625, + "out_of_memory": true, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "2", + "num_sa": 2, + "num_vu": 4, + "num_vu_ports": 2, + "hbm_bw_GBps": 600.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 32, + "freq_GHz": 0.7, + "sa_dim": 128, + "hbm_size_GB": 16, + "ici_bw_GBps": 125.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 280.0, + "min_power_W": 1.0, + "avg_power_W": 229.0, + "max_power_W": 280.0, + "HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 28.869, + "ICI_topology": "TORUS_2D", + "embodied_carbon_kgCO2": 296.2083333, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 2.12, + "static_power_W_per_vu": 0.74127482825, + "static_power_vmem_W": 12.93490069, + "static_power_ici_W": 6.36, + "static_power_hbm_mc_W": 1.908, + "static_power_hbm_phy_W": 2.862, + "static_power_other_W": 21.73, + "dynamic_power_W_per_SA": 22.55530667, + "dynamic_power_W_per_VU": 2.121728, + "dynamic_power_vmem_W": 22.2144, + "dynamic_power_ici_W_per_GBps": 0.0247047779, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 0, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + 
"num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": true, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_prefill.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_prefill.csv new file mode 100644 index 0000000..8b8ed86 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_prefill.csv @@ -0,0 +1,13 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem 
time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,104167,46812,104167,0,0,0,0,0,0,0,0,0,46812,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,46812,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,22602,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2884860656445898,599.9980800061439,0.44938827624323024,0.9999968000102399,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-Q-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",Q3MatMulQ,MXU,32,Compute,2996115,2996115,208334,0,0,0,0,0,0,0,0,2996115,187245,0,0,134217728,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,Q3MatMulQ,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 32, 64], [1, 4096, 32, 
64]]",2,50331648,32768,1,4096,4096,4096,0,2996115,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,396312,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.87238923472563,41.72069496664847,0.9411188048747217,0.06953449161108079,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-K-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",K3MatMulK,MXU,32,Compute,2996115,2996115,208334,0,0,0,0,0,0,0,0,2996115,187245,0,0,134217728,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,K3MatMulK,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 32, 64], [1, 4096, 32, 64]]",2,50331648,32768,1,4096,4096,4096,0,2996115,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,396312,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.87238923472563,41.72069496664847,0.9411188048747217,0.06953449161108079,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-V-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",V3MatMulV,MXU,32,Compute,2996115,2996115,208334,0,0,0,0,0,0,0,0,2996115,187245,0,0,134217728,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,V3MatMulV,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 32, 64], [1, 4096, 32, 64]]",2,50331648,32768,1,4096,4096,4096,0,2996115,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,396312,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.87238923472563,41.72069496664847,0.9411188048747217,0.06953449161108079,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-FlashAttention-4,"FlashAttention(q=1x4096x32x128,k=1x4096x32x128,v=1x4096x32x128,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",FlashAttention4FlashAttention,MXU,32,Compute,5992230,5992230,208334,0,0,0,0,0,0,0,0,5992230,1123472,0,0,134217728,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,32,32)]",70866960384,FlashAttention4FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,4227072,65536,32,4096,4096,128,748982,5992230,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,770827,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,11.826475349577704,20.860347483324237,0.24263219188176424,0.034767245805540394,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,-Attention_output-5,"XlaEinsum(a=1x4096x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",Attentionoutput5MatMulattnOutputattnAvgWo,MXU,32,Compute,2996115,2996115,208334,0,0,0,0,0,0,0,0,2996115,187245,0,0,134217728,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",137438953472,Attentionoutput5MatMulattnOutputattnAvgWo,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 32, 32], [32, 32, 2048], [1, 4096, 2048]]",2,50331648,32768,1,4096,4096,4096,0,2996115,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,396312,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.87238923472563,41.72069496664847,0.9411188048747217,0.06953449161108079,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+6,-Attention_layernorm-6,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",Attentionlayernorm6YnormLayerNormy,VPU,32,Memory,104167,46812,104167,0,0,0,0,0,0,0,0,0,46812,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,Attentionlayernorm6YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,46812,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,22602,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2884860656445898,599.9980800061439,0.44938827624323024,0.9999968000102399,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,10485943,10485943,572917,0,0,0,0,0,0,0,0,10485943,655360,0,0,369098752,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 3584]]",4,155189248,114688,1,4096,14336,4096,0,10485943,369098752,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1370688,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.874399388972456,32.78198250743877,0.9411600452372567,0.05463663751239795,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,10485943,10485943,572917,0,0,0,0,0,0,0,0,10485943,655360,0,0,369098752,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 
3584]]",4,155189248,114688,1,4096,14336,4096,0,10485943,369098752,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1370688,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.874399388972456,32.78198250743877,0.9411600452372567,0.05463663751239795,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x14336,b=1x4096x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,364584,20480,364584,0,0,0,0,0,0,0,0,0,20480,0,0,234881024,"DT_BFLOAT16:[1,4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",58720256,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,20480,234881024,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,43267,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.1610609790884954,599.9989028591491,0.05617361156825314,0.9999981714319152,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,10485943,10485943,598959,0,0,0,0,0,0,0,0,10485943,655360,0,0,385875968,"DT_BFLOAT16:[1,4096,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,4096,4096)]",481036337152,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1024], [1024, 2048], [1, 4096, 2048]]",2,50331648,114688,1,4096,4096,14336,0,10485943,385875968,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1373412,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.874399388972456,34.27207262141326,0.9411600452372567,0.05712012103568876,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+11,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x4096,b=1x4096x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,104167,5852,104167,0,0,0,0,0,0,0,0,0,5852,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",16777216,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,5852,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,12362,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16106075820557372,599.9980800061439,0.05617353453040378,0.9999968000102399,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_prefill.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_prefill.json new file mode 100644 index 0000000..0d59d33 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_prefill.json @@ -0,0 +1,97 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 1603571328, + "overlapped_compute_time_non_pp_ns": 93005408, + "compute_only_time_non_pp_ns": 1492737792, + "memory_only_time_non_pp_ns": 17828128, + "ici_bound_time_non_pp_ns": 0, + "total_execution_time_chip_ns": 1603571328, + "overlapped_compute_time_chip_ns": 93005408, + "compute_only_time_chip_ns": 1492737792, + "memory_only_time_chip_ns": 17828128, + "ici_bound_time_chip_ns": 0, + "bounded_by_pp_chip": false, + "throughput_tokens_per_sec": 2554.298601178282, + "TTFT_sec": 1.603571328, + "mem_footprint_GB": 21.5, + "out_of_memory": true, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "2", + "num_sa": 2, + "num_vu": 4, + "num_vu_ports": 2, + "hbm_bw_GBps": 600.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 32, + "freq_GHz": 0.7, + 
"sa_dim": 128, + "hbm_size_GB": 16, + "ici_bw_GBps": 125.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 280.0, + "min_power_W": 1.0, + "avg_power_W": 229.0, + "max_power_W": 280.0, + "HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 28.869, + "ICI_topology": "TORUS_2D", + "embodied_carbon_kgCO2": 296.2083333, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 2.12, + "static_power_W_per_vu": 0.74127482825, + "static_power_vmem_W": 12.93490069, + "static_power_ici_W": 6.36, + "static_power_hbm_mc_W": 1.908, + "static_power_hbm_phy_W": 2.862, + "static_power_other_W": 21.73, + "dynamic_power_W_per_SA": 22.55530667, + "dynamic_power_W_per_VU": 2.121728, + "dynamic_power_vmem_W": 22.2144, + "dynamic_power_ici_W_per_GBps": 0.0247047779, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 0, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": true, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No 
newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv new file mode 100644 index 0000000..c1e09eb --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv @@ -0,0 +1,29 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling 
Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,69445,34860,69445,0,0,0,0,0,0,0,0,0,34860,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,34860,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16831,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9327198214414285,899.9928000575995,0.5019738565495732,0.9999920000639995,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-Q-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",Q3MatMulQ,MXU,32,Compute,1115575,1115575,138889,0,0,0,0,0,0,0,0,1115575,139438,0,0,134217728,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,Q3MatMulQ,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 32, 64], [1, 4096, 32, 64]]",2,50331648,32768,1,4096,4096,4096,0,1115575,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,295126,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.20010171615534,112.04983976872913,0.969637325372203,0.12449982196525458,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-K-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",K3MatMulK,MXU,32,Compute,1115575,1115575,138889,0,0,0,0,0,0,0,0,1115575,139438,0,0,134217728,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,K3MatMulK,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 32, 64], [1, 4096, 
32, 64]]",2,50331648,32768,1,4096,4096,4096,0,1115575,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,295126,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.20010171615534,112.04983976872913,0.969637325372203,0.12449982196525458,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-V-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",V3MatMulV,MXU,32,Compute,1115575,1115575,138889,0,0,0,0,0,0,0,0,1115575,139438,0,0,134217728,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,V3MatMulV,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 32, 64], [1, 4096, 32, 64]]",2,50331648,32768,1,4096,4096,4096,0,1115575,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,295126,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.20010171615534,112.04983976872913,0.969637325372203,0.12449982196525458,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-FlashAttention-4,"FlashAttention(q=1x4096x32x128,k=1x4096x32x128,v=1x4096x32x128,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",FlashAttention4FlashAttention,MXU,32,Compute,2231150,2231150,138889,0,0,0,0,0,0,0,0,2231150,836629,0,0,134217728,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,32,32)]",70866960384,FlashAttention4FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,4227072,65536,32,4096,4096,128,557753,2231150,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,574020,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.7625262236963,56.024919884364564,0.24998462294752113,0.06224991098262729,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+5,-Attention_output-5,"XlaEinsum(a=1x4096x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",Attentionoutput5MatMulattnOutputattnAvgWo,MXU,32,Compute,1115575,1115575,138889,0,0,0,0,0,0,0,0,1115575,139438,0,0,134217728,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",137438953472,Attentionoutput5MatMulattnOutputattnAvgWo,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 32, 32], [32, 32, 2048], [1, 4096, 2048]]",2,50331648,32768,1,4096,4096,4096,0,1115575,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,295126,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.20010171615534,112.04983976872913,0.969637325372203,0.12449982196525458,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,-Attention_layernorm-6,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",Attentionlayernorm6YnormLayerNormy,VPU,32,Memory,69445,34860,69445,0,0,0,0,0,0,0,0,0,34860,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,Attentionlayernorm6YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,34860,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16831,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9327198214414285,899.9928000575995,0.5019738565495732,0.9999920000639995,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,3904341,3904341,381945,0,0,0,0,0,0,0,0,3904341,488034,0,0,369098752,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 
3584]]",4,155189248,114688,1,4096,14336,4096,0,3904341,369098752,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1020725,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.20551333810239,88.04302697945697,0.9696799171441056,0.09782558553272996,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,3904341,3904341,381945,0,0,0,0,0,0,0,0,3904341,488034,0,0,369098752,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 3584]]",4,155189248,114688,1,4096,14336,4096,0,3904341,369098752,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1020725,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.20551333810239,88.04302697945697,0.9696799171441056,0.09782558553272996,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x14336,b=1x4096x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,243056,15252,243056,0,0,0,0,0,0,0,0,0,15252,0,0,234881024,"DT_BFLOAT16:[1,4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",58720256,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,15252,234881024,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,32220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24159146863274308,899.9983542887236,0.06274711930496361,0.999998171431915,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+10,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,3904341,3904341,399306,0,0,0,0,0,0,0,0,3904341,488034,0,0,385875968,"DT_BFLOAT16:[1,4096,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,4096,4096)]",481036337152,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1024], [1024, 2048], [1, 4096, 2048]]",2,50331648,114688,1,4096,4096,14336,0,3904341,385875968,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1022754,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.20551333810239,92.04498275125047,0.9696799171441056,0.10227220305694497,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +11,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x4096,b=1x4096x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,69445,4358,69445,0,0,0,0,0,0,0,0,0,4358,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",16777216,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,4358,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9205,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24158997768017856,899.9928000575995,0.06274673206869665,0.9999920000639995,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+12,Attention-serving-decode-Input_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeInputlayernormXnormLayerNormX,VPU,16384,Memory,500,9,500,0,0,0,0,0,0,0,0,0,9,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,9,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.01702127659574468,0.03390842013888889,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,MXU,16384,Memory,34740,8714,34740,0,0,0,0,0,0,0,0,0,8714,0,0,33570816,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,1,32,128)]",33554432,AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 1, 2048], [2048, 32, 128], [1, 1, 32, 128]]",1,8398848,1024,1,1,4096,4096,0,8714,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4060,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9658731145653425,899.9786640490069,0.007601833199892951,0.9999762933877854,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=2x4096x32x128,eq=BLM;TMND->BTLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,MXU,16384,Memory,69470,17429,69470,0,0,0,0,0,0,0,0,0,17429,0,0,67133440,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[2,4096,32,128]","[DT_BFLOAT16:(1,2,1,32,128)]",67108864,AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,Einsum,67108864,[],Einsum,"BLM,TMND->BTLND","[[1, 1, 1024], [2, 1024, 32, 128], [1, 2, 1, 32, 
128]]",1,16795648,2048,1,1,8192,4096,0,17429,67133440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8119,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9660121491291205,899.9983904360695,0.00760292746118558,0.9999982115956327,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x32x128,b=1x4096x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,MXU,16384,Memory,35002,8714,35002,0,0,0,0,0,0,0,0,0,8714,0,0,33824768,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,1,4096,32)]",33554432,AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 1, 1], [1, 4096, 1, 1], [1, 1, 4096, 1]]",32,1057024,1024,32,1,4096,128,0,8714,33824768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4090,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9586432775269984,899.9991434641234,0.0075449313000480285,0.9999990482934705,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x32x128,b=1x512x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,MXU,16384,Memory,4383,1089,4383,0,0,0,0,0,0,0,0,0,1089,0,0,4235264,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[1,512,32,128]","[DT_BFLOAT16:(1,1,512,32)]",4194304,AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 32, 128], [1, 512, 32, 128], [1, 1, 512, 32]]",1,132352,128,32,1,512,128,0,1089,4235264,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,512,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9569482089892767,899.9308630290326,0.007531590387984289,0.9999231811433695,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+14,Attention-serving-decode-Softmax(Q*K)*V,"Softmax(x=1x1x4608x32,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,16384,Memory,611,154,611,0,0,0,0,0,0,0,0,0,154,0,0,589824,"DT_BFLOAT16:[1,1,4608,32]","[DT_BFLOAT16:(1,1,4608,32)]",589824,AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,154,589824,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,109,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.965342062193126,899.0448547463175,0.2507225685134241,0.9989387274959083,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x4096x32,b=1x4096x32x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,MXU,16384,Memory,35002,8714,35002,0,0,0,0,0,0,0,0,0,8714,0,0,33824768,"DT_BFLOAT16:[1,1,4096,32],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,1,32,128)]",33554432,AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 4096, 1], [1, 4096, 1, 128], [1, 1, 1, 128]]",32,264448,1024,32,1,128,4096,0,8714,33824768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4090,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9586432775269984,899.9991434641234,0.0075449313000480285,0.9999990482934705,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x512x32,b=1x512x32x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,MXU,16384,Memory,4383,1089,4383,0,0,0,0,0,0,0,0,0,1089,0,0,4235264,"DT_BFLOAT16:[1,1,512,32],DT_BFLOAT16:[1,512,32,128]","[DT_BFLOAT16:(1,1,32,128)]",4194304,AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 
1, 512, 32], [1, 512, 32, 128], [1, 1, 32, 128]]",1,132352,128,32,1,128,512,0,1089,4235264,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,512,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9569482089892767,899.9308630290326,0.007531590387984289,0.9999231811433695,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"Add(a=1x1x32x128,b=1x1x32x128,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,16384,"DT_BFLOAT16:[1,1,32,128]","[DT_BFLOAT16:(1,1,32,128)]",4096,AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,Add,0,[],Add,,,,,0,,,,,0,2,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,58,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.008192,30.517578125,0.002127659574468085,0.03390842013888889,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Attention-serving-decode-Attention_output,"XlaEinsum(a=1x1x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,16384,Memory,34740,8714,34740,0,0,0,0,0,0,0,0,0,8714,0,0,33570816,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,1,4096)]",33554432,AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 1, 32, 64], [32, 64, 4096], [1, 1, 4096]]",1,8398848,1024,1,1,4096,4096,0,8714,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4060,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9658731145653425,899.9786640490069,0.007601833199892951,0.9999762933877854,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+16,Attention-serving-decode-Attention_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionlayernormYnormLayerNormy,VPU,16384,Memory,500,9,500,0,0,0,0,0,0,0,0,0,9,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,9,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.01702127659574468,0.03390842013888889,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,Fwd-FFN-serving-decoder-FFgate,"XlaEinsum(a=1x1x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,16384,Memory,121566,30502,121566,0,0,0,0,0,0,0,0,0,30502,0,0,117477376,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,1,14336)]",117440512,FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 1, 1024], [1024, 14336], [1, 1, 14336]]",1,29390848,3584,1,1,14336,4096,0,30502,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,14208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9660638007337579,899.9994428984307,0.00760333398133511,0.9999993809982564,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Fwd-FFN-serving-decoder-FFup,"XlaEinsum(a=1x1x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,MXU,16384,Memory,121566,30502,121566,0,0,0,0,0,0,0,0,0,30502,0,0,117477376,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,1,14336)]",117440512,FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 1, 1024], [1024, 14336], [1, 1, 
14336]]",1,29390848,3584,1,1,14336,4096,0,30502,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,14208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9660638007337579,899.9994428984307,0.00760333398133511,0.9999993809982564,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Fwd-FFN-serving-decoder-FFgate_up,"Mul(a=1x1x14336,b=1x1x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,VPU,16384,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,57344,"DT_BFLOAT16:[1,1,14336]","[DT_BFLOAT16:(1,1,14336)]",14336,FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,5,57344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.028672,106.8115234375,0.007446808510638298,0.1186794704861111,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Fwd-FFN-serving-decoder-FFoutput,"XlaEinsum(a=1x1x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,16384,Memory,121566,30502,121566,0,0,0,0,0,0,0,0,0,30502,0,0,117477376,"DT_BFLOAT16:[1,1,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,1,4096)]",117440512,FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 1, 3584], [3584, 4096], [1, 1, 4096]]",1,8398848,3584,1,1,4096,14336,0,30502,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,14208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9660638007337579,899.9994428984307,0.00760333398133511,0.9999993809982564,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+21,FFN-serving-decoder-AttnPlusFFn,"Add(a=1x1x4096,b=1x1x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",4096,FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,Add,0,[],Add,,,,,0,,,,,0,2,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,58,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.008192,30.517578125,0.002127659574468085,0.03390842013888889,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_decode.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_decode.csv new file mode 100644 index 0000000..ec1ba81 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_decode.csv @@ -0,0 +1,17 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem 
time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +12,Attention-serving-decode-Input_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeInputlayernormXnormLayerNormX,VPU,16384,Memory,500,9,500,0,0,0,0,0,0,0,0,0,9,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,9,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.01702127659574468,0.03390842013888889,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,MXU,16384,Memory,34740,8714,34740,0,0,0,0,0,0,0,0,0,8714,0,0,33570816,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,1,32,128)]",33554432,AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 1, 2048], [2048, 32, 128], [1, 1, 32, 
128]]",1,8398848,1024,1,1,4096,4096,0,8714,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4060,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9658731145653425,899.9786640490069,0.007601833199892951,0.9999762933877854,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=2x4096x32x128,eq=BLM;TMND->BTLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,MXU,16384,Memory,69470,17429,69470,0,0,0,0,0,0,0,0,0,17429,0,0,67133440,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[2,4096,32,128]","[DT_BFLOAT16:(1,2,1,32,128)]",67108864,AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,Einsum,67108864,[],Einsum,"BLM,TMND->BTLND","[[1, 1, 1024], [2, 1024, 32, 128], [1, 2, 1, 32, 128]]",1,16795648,2048,1,1,8192,4096,0,17429,67133440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8119,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9660121491291205,899.9983904360695,0.00760292746118558,0.9999982115956327,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x32x128,b=1x4096x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,MXU,16384,Memory,35002,8714,35002,0,0,0,0,0,0,0,0,0,8714,0,0,33824768,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,1,4096,32)]",33554432,AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 1, 1], [1, 4096, 1, 1], [1, 1, 4096, 1]]",32,1057024,1024,32,1,4096,128,0,8714,33824768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4090,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9586432775269984,899.9991434641234,0.0075449313000480285,0.9999990482934705,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x32x128,b=1x512x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,MXU,16384,Memory,4383,1089,4383,0,0,0,0,0,0,0,0,0,1089,0,0,4235264,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[1,512,32,128]","[DT_BFLOAT16:(1,1,512,32)]",4194304,AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 32, 128], [1, 512, 32, 128], [1, 1, 512, 32]]",1,132352,128,32,1,512,128,0,1089,4235264,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,512,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9569482089892767,899.9308630290326,0.007531590387984289,0.9999231811433695,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"Softmax(x=1x1x4608x32,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,16384,Memory,611,154,611,0,0,0,0,0,0,0,0,0,154,0,0,589824,"DT_BFLOAT16:[1,1,4608,32]","[DT_BFLOAT16:(1,1,4608,32)]",589824,AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,154,589824,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,109,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.965342062193126,899.0448547463175,0.2507225685134241,0.9989387274959083,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x4096x32,b=1x4096x32x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,MXU,16384,Memory,35002,8714,35002,0,0,0,0,0,0,0,0,0,8714,0,0,33824768,"DT_BFLOAT16:[1,1,4096,32],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,1,32,128)]",33554432,AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 4096, 1], [1, 4096, 1, 
128], [1, 1, 1, 128]]",32,264448,1024,32,1,128,4096,0,8714,33824768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4090,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9586432775269984,899.9991434641234,0.0075449313000480285,0.9999990482934705,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x512x32,b=1x512x32x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,MXU,16384,Memory,4383,1089,4383,0,0,0,0,0,0,0,0,0,1089,0,0,4235264,"DT_BFLOAT16:[1,1,512,32],DT_BFLOAT16:[1,512,32,128]","[DT_BFLOAT16:(1,1,32,128)]",4194304,AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 512, 32], [1, 512, 32, 128], [1, 1, 32, 128]]",1,132352,128,32,1,128,512,0,1089,4235264,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,512,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9569482089892767,899.9308630290326,0.007531590387984289,0.9999231811433695,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"Add(a=1x1x32x128,b=1x1x32x128,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,16384,"DT_BFLOAT16:[1,1,32,128]","[DT_BFLOAT16:(1,1,32,128)]",4096,AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,Add,0,[],Add,,,,,0,,,,,0,2,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,58,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.008192,30.517578125,0.002127659574468085,0.03390842013888889,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+15,Attention-serving-decode-Attention_output,"XlaEinsum(a=1x1x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,16384,Memory,34740,8714,34740,0,0,0,0,0,0,0,0,0,8714,0,0,33570816,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,1,4096)]",33554432,AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 1, 32, 64], [32, 64, 4096], [1, 1, 4096]]",1,8398848,1024,1,1,4096,4096,0,8714,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4060,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9658731145653425,899.9786640490069,0.007601833199892951,0.9999762933877854,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +16,Attention-serving-decode-Attention_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionlayernormYnormLayerNormy,VPU,16384,Memory,500,9,500,0,0,0,0,0,0,0,0,0,9,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,9,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.01702127659574468,0.03390842013888889,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,Fwd-FFN-serving-decoder-FFgate,"XlaEinsum(a=1x1x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,16384,Memory,121566,30502,121566,0,0,0,0,0,0,0,0,0,30502,0,0,117477376,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,1,14336)]",117440512,FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 1, 1024], [1024, 14336], [1, 1, 
14336]]",1,29390848,3584,1,1,14336,4096,0,30502,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,14208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9660638007337579,899.9994428984307,0.00760333398133511,0.9999993809982564,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Fwd-FFN-serving-decoder-FFup,"XlaEinsum(a=1x1x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,MXU,16384,Memory,121566,30502,121566,0,0,0,0,0,0,0,0,0,30502,0,0,117477376,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,1,14336)]",117440512,FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 1, 1024], [1024, 14336], [1, 1, 14336]]",1,29390848,3584,1,1,14336,4096,0,30502,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,14208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9660638007337579,899.9994428984307,0.00760333398133511,0.9999993809982564,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Fwd-FFN-serving-decoder-FFgate_up,"Mul(a=1x1x14336,b=1x1x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,VPU,16384,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,57344,"DT_BFLOAT16:[1,1,14336]","[DT_BFLOAT16:(1,1,14336)]",14336,FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,5,57344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.028672,106.8115234375,0.007446808510638298,0.1186794704861111,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+20,Fwd-FFN-serving-decoder-FFoutput,"XlaEinsum(a=1x1x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,16384,Memory,121566,30502,121566,0,0,0,0,0,0,0,0,0,30502,0,0,117477376,"DT_BFLOAT16:[1,1,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,1,4096)]",117440512,FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 1, 3584], [3584, 4096], [1, 1, 4096]]",1,8398848,3584,1,1,4096,14336,0,30502,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,14208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9660638007337579,899.9994428984307,0.00760333398133511,0.9999993809982564,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,FFN-serving-decoder-AttnPlusFFn,"Add(a=1x1x4096,b=1x1x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",4096,FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,Add,0,[],Add,,,,,0,,,,,0,2,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,58,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.008192,30.517578125,0.002127659574468085,0.03390842013888889,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_decode.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_decode.json new file mode 100644 index 0000000..e455858 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_decode.json @@ -0,0 +1,98 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 
9593307136, + "overlapped_compute_time_non_pp_ns": 2394521600, + "compute_only_time_non_pp_ns": 0, + "memory_only_time_non_pp_ns": 7198785536, + "ici_bound_time_non_pp_ns": 0, + "total_execution_time_chip_ns": 9593307136, + "overlapped_compute_time_chip_ns": 2394521600, + "compute_only_time_chip_ns": 0, + "memory_only_time_chip_ns": 7198785536, + "ici_bound_time_chip_ns": 0, + "bounded_by_pp_chip": false, + "TPOT_ms_request": 18.736928, + "throughput_tokens_per_sec": 53.370541851898025, + "throughput_tokens_per_sec_request": 53.37054185189803, + "mem_footprint_GB": 22.5625, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "3", + "num_sa": 4, + "num_vu": 4, + "num_vu_ports": 2, + "hbm_bw_GBps": 900.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 32, + "freq_GHz": 0.94, + "sa_dim": 128, + "hbm_size_GB": 32, + "ici_bw_GBps": 164.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 450.0, + "min_power_W": 175.0, + "avg_power_W": 220.0, + "max_power_W": 262.0, + "HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 40.478, + "ICI_topology": "TORUS_2D", + "embodied_carbon_kgCO2": 311.8333333, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 2.9866666675, + "static_power_W_per_vu": 0.74127482825, + "static_power_vmem_W": 12.93490069, + "static_power_ici_W": 8.96, + "static_power_hbm_mc_W": 4.032, + "static_power_hbm_phy_W": 6.048, + "static_power_other_W": 37.11333333, + "dynamic_power_W_per_SA": 30.28855467, + "dynamic_power_W_per_VU": 2.8491776, + "dynamic_power_vmem_W": 29.830784, + "dynamic_power_ici_W_per_GBps": 0.0247047779, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + 
"pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 0, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": true, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_prefill.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_prefill.csv new file mode 100644 index 0000000..2c65e9f --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_prefill.csv @@ -0,0 +1,13 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU 
Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,69445,34860,69445,0,0,0,0,0,0,0,0,0,34860,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,34860,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16831,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9327198214414285,899.9928000575995,0.5019738565495732,0.9999920000639995,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+3,-Q-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",Q3MatMulQ,MXU,32,Compute,1115575,1115575,138889,0,0,0,0,0,0,0,0,1115575,139438,0,0,134217728,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,Q3MatMulQ,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 32, 64], [1, 4096, 32, 64]]",2,50331648,32768,1,4096,4096,4096,0,1115575,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,295126,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.20010171615534,112.04983976872913,0.969637325372203,0.12449982196525458,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-K-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",K3MatMulK,MXU,32,Compute,1115575,1115575,138889,0,0,0,0,0,0,0,0,1115575,139438,0,0,134217728,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,K3MatMulK,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 32, 64], [1, 4096, 32, 64]]",2,50331648,32768,1,4096,4096,4096,0,1115575,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,295126,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.20010171615534,112.04983976872913,0.969637325372203,0.12449982196525458,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-V-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",V3MatMulV,MXU,32,Compute,1115575,1115575,138889,0,0,0,0,0,0,0,0,1115575,139438,0,0,134217728,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,V3MatMulV,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 32, 64], [1, 4096, 32, 
64]]",2,50331648,32768,1,4096,4096,4096,0,1115575,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,295126,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.20010171615534,112.04983976872913,0.969637325372203,0.12449982196525458,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-FlashAttention-4,"FlashAttention(q=1x4096x32x128,k=1x4096x32x128,v=1x4096x32x128,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",FlashAttention4FlashAttention,MXU,32,Compute,2231150,2231150,138889,0,0,0,0,0,0,0,0,2231150,836629,0,0,134217728,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,32,32)]",70866960384,FlashAttention4FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,4227072,65536,32,4096,4096,128,557753,2231150,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,574020,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,31.7625262236963,56.024919884364564,0.24998462294752113,0.06224991098262729,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,-Attention_output-5,"XlaEinsum(a=1x4096x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",Attentionoutput5MatMulattnOutputattnAvgWo,MXU,32,Compute,1115575,1115575,138889,0,0,0,0,0,0,0,0,1115575,139438,0,0,134217728,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",137438953472,Attentionoutput5MatMulattnOutputattnAvgWo,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 32, 32], [32, 32, 2048], [1, 4096, 2048]]",2,50331648,32768,1,4096,4096,4096,0,1115575,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,295126,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.20010171615534,112.04983976872913,0.969637325372203,0.12449982196525458,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+6,-Attention_layernorm-6,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",Attentionlayernorm6YnormLayerNormy,VPU,32,Memory,69445,34860,69445,0,0,0,0,0,0,0,0,0,34860,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,Attentionlayernorm6YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,34860,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16831,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9327198214414285,899.9928000575995,0.5019738565495732,0.9999920000639995,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,3904341,3904341,381945,0,0,0,0,0,0,0,0,3904341,488034,0,0,369098752,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 3584]]",4,155189248,114688,1,4096,14336,4096,0,3904341,369098752,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1020725,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.20551333810239,88.04302697945697,0.9696799171441056,0.09782558553272996,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,3904341,3904341,381945,0,0,0,0,0,0,0,0,3904341,488034,0,0,369098752,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 
3584]]",4,155189248,114688,1,4096,14336,4096,0,3904341,369098752,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1020725,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.20551333810239,88.04302697945697,0.9696799171441056,0.09782558553272996,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x14336,b=1x4096x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,243056,15252,243056,0,0,0,0,0,0,0,0,0,15252,0,0,234881024,"DT_BFLOAT16:[1,4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",58720256,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,15252,234881024,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,32220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24159146863274308,899.9983542887236,0.06274711930496361,0.999998171431915,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,3904341,3904341,399306,0,0,0,0,0,0,0,0,3904341,488034,0,0,385875968,"DT_BFLOAT16:[1,4096,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,4096,4096)]",481036337152,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1024], [1024, 2048], [1, 4096, 2048]]",2,50331648,114688,1,4096,4096,14336,0,3904341,385875968,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1022754,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.20551333810239,92.04498275125047,0.9696799171441056,0.10227220305694497,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+11,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x4096,b=1x4096x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,69445,4358,69445,0,0,0,0,0,0,0,0,0,4358,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",16777216,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,4358,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9205,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24158997768017856,899.9928000575995,0.06274673206869665,0.9999920000639995,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_prefill.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_prefill.json new file mode 100644 index 0000000..46f2b0e --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_prefill.json @@ -0,0 +1,97 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 603451648, + "overlapped_compute_time_non_pp_ns": 62303072, + "compute_only_time_non_pp_ns": 529562624, + "memory_only_time_non_pp_ns": 11585952, + "ici_bound_time_non_pp_ns": 0, + "total_execution_time_chip_ns": 603451648, + "overlapped_compute_time_chip_ns": 62303072, + "compute_only_time_chip_ns": 529562624, + "memory_only_time_chip_ns": 11585952, + "ici_bound_time_chip_ns": 0, + "bounded_by_pp_chip": false, + "throughput_tokens_per_sec": 6787.619212865254, + "TTFT_sec": 0.603451648, + "mem_footprint_GB": 21.5, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "3", + "num_sa": 4, + "num_vu": 4, + "num_vu_ports": 2, + "hbm_bw_GBps": 900.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 32, + "freq_GHz": 0.94, + "sa_dim": 
128, + "hbm_size_GB": 32, + "ici_bw_GBps": 164.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 450.0, + "min_power_W": 175.0, + "avg_power_W": 220.0, + "max_power_W": 262.0, + "HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 40.478, + "ICI_topology": "TORUS_2D", + "embodied_carbon_kgCO2": 311.8333333, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 2.9866666675, + "static_power_W_per_vu": 0.74127482825, + "static_power_vmem_W": 12.93490069, + "static_power_ici_W": 8.96, + "static_power_hbm_mc_W": 4.032, + "static_power_hbm_phy_W": 6.048, + "static_power_other_W": 37.11333333, + "dynamic_power_W_per_SA": 30.28855467, + "dynamic_power_W_per_VU": 2.8491776, + "dynamic_power_vmem_W": 29.830784, + "dynamic_power_ici_W_per_GBps": 0.0247047779, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 0, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": true, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } 
+} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv new file mode 100644 index 0000000..9f827f3 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv @@ -0,0 +1,29 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM 
Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,52084,31208,52084,0,0,0,0,0,0,0,0,0,31208,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,31208,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11435,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.576947392673374,1199.9846401966056,0.5991786162279981,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-Q-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",Q3MatMulQ,MXU,32,Compute,499353,499353,78125,0,0,0,0,0,0,0,0,499353,124830,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,Q3MatMulQ,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,50331648,32768,1,4096,4096,4096,0,499353,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,130287,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2340598174037,187.7429393635364,0.9845540715766788,0.15645244946961367,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-K-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",K3MatMulK,MXU,32,Compute,499353,499353,78125,0,0,0,0,0,0,0,0,499353,124830,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,K3MatMulK,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 
32, 128]]",1,50331648,32768,1,4096,4096,4096,0,499353,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,130287,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2340598174037,187.7429393635364,0.9845540715766788,0.15645244946961367,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-V-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",V3MatMulV,MXU,32,Compute,499353,499353,78125,0,0,0,0,0,0,0,0,499353,124830,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,V3MatMulV,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,50331648,32768,1,4096,4096,4096,0,499353,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,130287,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2340598174037,187.7429393635364,0.9845540715766788,0.15645244946961367,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-FlashAttention-4,"FlashAttention(q=1x4096x32x128,k=1x4096x32x128,v=1x4096x32x128,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",FlashAttention4FlashAttention,MXU,32,Compute,998706,998706,104167,0,0,0,0,0,0,0,0,998706,748981,0,0,134217728,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,32,32)]",70866960384,FlashAttention4FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,4227072,65536,32,4096,4096,128,499321,998706,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,256942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,70.95878104667439,125.16195957569094,0.2538303465783625,0.10430163297974246,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+5,-Attention_output-5,"XlaEinsum(a=1x4096x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",Attentionoutput5MatMulattnOutputattnAvgWo,MXU,32,Compute,499353,499353,78125,0,0,0,0,0,0,0,0,499353,124830,0,0,100663296,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",137438953472,Attentionoutput5MatMulattnOutputattnAvgWo,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 32, 128], [32, 128, 4096], [1, 4096, 4096]]",1,50331648,32768,1,4096,4096,4096,0,499353,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,130287,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2340598174037,187.7429393635364,0.9845540715766788,0.15645244946961367,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,-Attention_layernorm-6,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",Attentionlayernorm6YnormLayerNormy,VPU,32,Memory,52084,31208,52084,0,0,0,0,0,0,0,0,0,31208,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,Attentionlayernorm6YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,31208,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11435,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.576947392673374,1199.9846401966056,0.5991786162279981,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,1747658,1747658,208334,0,0,0,0,0,0,0,0,1747658,436906,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 14336], [1, 4096, 
14336]]",1,155189248,114688,1,4096,14336,4096,0,1747658,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,451446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2462650884784,143.04858273186173,0.9845977316866932,0.11920715227655145,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,1747658,1747658,208334,0,0,0,0,0,0,0,0,1747658,436906,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 14336], [1, 4096, 14336]]",1,155189248,114688,1,4096,14336,4096,0,1747658,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,451446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2462650884784,143.04858273186173,0.9845977316866932,0.11920715227655145,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x14336,b=1x4096x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,182292,13654,182292,0,0,0,0,0,0,0,0,0,13654,0,0,234881024,"DT_BFLOAT16:[1,4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",58720256,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,13654,234881024,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16129,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.3221219581769908,1199.9978057182982,0.07489814875767085,0.9999981714319152,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+10,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,1747658,1747658,208334,0,0,0,0,0,0,0,0,1747658,436906,0,0,268435456,"DT_BFLOAT16:[1,4096,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,4096,4096)]",481036337152,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 4096, 3584], [3584, 4096], [1, 4096, 4096]]",1,50331648,114688,1,4096,4096,14336,0,1747658,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,451446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2462650884784,143.04858273186173,0.9845977316866932,0.11920715227655145,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +11,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x4096,b=1x4096x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,52084,3901,52084,0,0,0,0,0,0,0,0,0,3901,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",16777216,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,3901,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4608,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32211842408417174,1199.9846401966056,0.07489732702849976,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+12,Attention-serving-decode-Input_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeInputlayernormXnormLayerNormX,VPU,16384,Memory,500,8,500,0,0,0,0,0,0,0,0,0,8,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,8,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,36,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.015238095238095238,0.025431315104166668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,MXU,16384,Memory,26055,15635,26055,0,0,0,0,0,0,0,0,15635,3900,0,0,33570816,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,1,32,128)]",33554432,AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 1, 4096], [4096, 32, 128], [1, 1, 32, 128]]",1,8398848,1024,1,1,4096,4096,0,15635,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5726,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2878308194204569,1199.9715520653426,0.004606766610220843,0.9999762933877855,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=2x4096x32x128,eq=BLM;TMND->BTLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,MXU,16384,Memory,52103,31239,52103,0,0,0,0,0,0,0,0,31239,7801,0,0,67133440,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[2,4096,32,128]","[DT_BFLOAT16:(1,2,1,32,128)]",67108864,AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,Einsum,67108864,[],Einsum,"BLM,TMND->BTLND","[[1, 1, 4096], [2, 4096, 32, 128], [1, 2, 1, 32, 
128]]",1,16795648,2048,1,1,8192,4096,0,31239,67133440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11444,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2880038385505632,1199.986338283664,0.004607385525950676,0.9999886152363867,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x32x128,b=1x4096x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,MXU,16384,Memory,26252,15635,26252,0,0,0,0,0,0,0,0,15635,3900,0,0,33824768,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,1,4096,32)]",33554432,AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 32, 128], [1, 4096, 32, 128], [1, 1, 4096, 32]]",1,1057024,1024,32,1,4096,128,0,15635,33824768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5739,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.278166692061557,1199.976002572423,0.004572196557569101,0.9999800021436859,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x32x128,b=1x512x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,MXU,16384,Memory,3287,1981,3287,0,0,0,0,0,0,0,0,1981,487,0,0,4235264,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[1,512,32,128]","[DT_BFLOAT16:(1,1,512,32)]",4194304,AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 32, 128], [1, 512, 32, 128], [1, 1, 512, 32]]",1,132352,128,32,1,512,128,0,1981,4235264,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,724,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.276027989047764,1199.9990789949043,0.004564546091774568,0.9999992324957535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+14,Attention-serving-decode-Softmax(Q*K)*V,"Softmax(x=1x1x4608x32,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,16384,Memory,500,138,500,0,0,0,0,0,0,0,0,0,138,0,0,589824,"DT_BFLOAT16:[1,1,4608,32]","[DT_BFLOAT16:(1,1,4608,32)]",589824,AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,138,589824,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,69,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.179648,1098.6328125,0.2742857142857143,0.91552734375,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x4096x32,b=1x4096x32x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,MXU,16384,Memory,26252,15635,26252,0,0,0,0,0,0,0,0,15635,3900,0,0,33824768,"DT_BFLOAT16:[1,1,4096,32],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,1,32,128)]",33554432,AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 4096, 32], [1, 4096, 32, 128], [1, 1, 32, 128]]",1,264448,1024,32,1,128,4096,0,15635,33824768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5739,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.278166692061557,1199.976002572423,0.004572196557569101,0.9999800021436859,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x512x32,b=1x512x32x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,MXU,16384,Memory,3287,1981,3287,0,0,0,0,0,0,0,0,1981,487,0,0,4235264,"DT_BFLOAT16:[1,1,512,32],DT_BFLOAT16:[1,512,32,128]","[DT_BFLOAT16:(1,1,32,128)]",4194304,AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 512, 
32], [1, 512, 32, 128], [1, 1, 32, 128]]",1,132352,128,32,1,128,512,0,1981,4235264,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,724,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.276027989047764,1199.9990789949043,0.004564546091774568,0.9999992324957535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"Add(a=1x1x32x128,b=1x1x32x128,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,16384,"DT_BFLOAT16:[1,1,32,128]","[DT_BFLOAT16:(1,1,32,128)]",4096,AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,Add,0,[],Add,,,,,0,,,,,0,1,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.008192,30.517578125,0.0019047619047619048,0.025431315104166668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Attention-serving-decode-Attention_output,"XlaEinsum(a=1x1x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,16384,Memory,26055,15635,26055,0,0,0,0,0,0,0,0,15635,3900,0,0,33570816,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,1,4096)]",33554432,AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 1, 32, 128], [32, 128, 4096], [1, 1, 4096]]",1,8398848,1024,1,1,4096,4096,0,15635,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5726,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2878308194204569,1199.9715520653426,0.004606766610220843,0.9999762933877855,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+16,Attention-serving-decode-Attention_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionlayernormYnormLayerNormy,VPU,16384,Memory,500,8,500,0,0,0,0,0,0,0,0,0,8,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,8,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,36,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.015238095238095238,0.025431315104166668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,Fwd-FFN-serving-decoder-FFgate,"XlaEinsum(a=1x1x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,16384,Memory,91175,54644,91175,0,0,0,0,0,0,0,0,54644,13653,0,0,117477376,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,1,14336)]",117440512,FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 14336], [1, 1, 14336]]",1,29390848,3584,1,1,14336,4096,0,54644,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,20020,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2880780038387716,1199.9926764506786,0.004607650826460808,0.9999938970422322,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Fwd-FFN-serving-decoder-FFup,"XlaEinsum(a=1x1x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,MXU,16384,Memory,91175,54644,91175,0,0,0,0,0,0,0,0,54644,13653,0,0,117477376,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,1,14336)]",117440512,FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 14336], [1, 1, 
14336]]",1,29390848,3584,1,1,14336,4096,0,54644,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,20020,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2880780038387716,1199.9926764506786,0.004607650826460808,0.9999938970422322,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Fwd-FFN-serving-decoder-FFgate_up,"Mul(a=1x1x14336,b=1x1x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,VPU,16384,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,57344,"DT_BFLOAT16:[1,1,14336]","[DT_BFLOAT16:(1,1,14336)]",14336,FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,4,57344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.028672,106.8115234375,0.006666666666666667,0.08900960286458333,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Fwd-FFN-serving-decoder-FFoutput,"XlaEinsum(a=1x1x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,16384,Memory,91175,54644,91175,0,0,0,0,0,0,0,0,54644,13653,0,0,117477376,"DT_BFLOAT16:[1,1,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,1,4096)]",117440512,FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 1, 14336], [14336, 4096], [1, 1, 4096]]",1,8398848,3584,1,1,4096,14336,0,54644,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,20020,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2880780038387716,1199.9926764506786,0.004607650826460808,0.9999938970422322,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+21,FFN-serving-decoder-AttnPlusFFn,"Add(a=1x1x4096,b=1x1x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",4096,FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,Add,0,[],Add,,,,,0,,,,,0,1,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.008192,30.517578125,0.0019047619047619048,0.025431315104166668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_decode.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_decode.csv new file mode 100644 index 0000000..4b6460f --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_decode.csv @@ -0,0 +1,17 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem 
time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +12,Attention-serving-decode-Input_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeInputlayernormXnormLayerNormX,VPU,16384,Memory,500,8,500,0,0,0,0,0,0,0,0,0,8,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,8,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,36,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.015238095238095238,0.025431315104166668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,MXU,16384,Memory,26055,15635,26055,0,0,0,0,0,0,0,0,15635,3900,0,0,33570816,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,1,32,128)]",33554432,AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 1, 4096], [4096, 32, 128], [1, 1, 32, 
128]]",1,8398848,1024,1,1,4096,4096,0,15635,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5726,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2878308194204569,1199.9715520653426,0.004606766610220843,0.9999762933877855,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=2x4096x32x128,eq=BLM;TMND->BTLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,MXU,16384,Memory,52103,31239,52103,0,0,0,0,0,0,0,0,31239,7801,0,0,67133440,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[2,4096,32,128]","[DT_BFLOAT16:(1,2,1,32,128)]",67108864,AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,Einsum,67108864,[],Einsum,"BLM,TMND->BTLND","[[1, 1, 4096], [2, 4096, 32, 128], [1, 2, 1, 32, 128]]",1,16795648,2048,1,1,8192,4096,0,31239,67133440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11444,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2880038385505632,1199.986338283664,0.004607385525950676,0.9999886152363867,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x32x128,b=1x4096x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,MXU,16384,Memory,26252,15635,26252,0,0,0,0,0,0,0,0,15635,3900,0,0,33824768,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,1,4096,32)]",33554432,AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 32, 128], [1, 4096, 32, 128], [1, 1, 4096, 32]]",1,1057024,1024,32,1,4096,128,0,15635,33824768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5739,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.278166692061557,1199.976002572423,0.004572196557569101,0.9999800021436859,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x32x128,b=1x512x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,MXU,16384,Memory,3287,1981,3287,0,0,0,0,0,0,0,0,1981,487,0,0,4235264,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[1,512,32,128]","[DT_BFLOAT16:(1,1,512,32)]",4194304,AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 32, 128], [1, 512, 32, 128], [1, 1, 512, 32]]",1,132352,128,32,1,512,128,0,1981,4235264,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,724,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.276027989047764,1199.9990789949043,0.004564546091774568,0.9999992324957535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"Softmax(x=1x1x4608x32,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,16384,Memory,500,138,500,0,0,0,0,0,0,0,0,0,138,0,0,589824,"DT_BFLOAT16:[1,1,4608,32]","[DT_BFLOAT16:(1,1,4608,32)]",589824,AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,138,589824,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,69,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.179648,1098.6328125,0.2742857142857143,0.91552734375,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x4096x32,b=1x4096x32x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,MXU,16384,Memory,26252,15635,26252,0,0,0,0,0,0,0,0,15635,3900,0,0,33824768,"DT_BFLOAT16:[1,1,4096,32],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,1,32,128)]",33554432,AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 4096, 32], [1, 4096, 32, 128], [1, 1, 
32, 128]]",1,264448,1024,32,1,128,4096,0,15635,33824768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5739,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.278166692061557,1199.976002572423,0.004572196557569101,0.9999800021436859,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x512x32,b=1x512x32x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,MXU,16384,Memory,3287,1981,3287,0,0,0,0,0,0,0,0,1981,487,0,0,4235264,"DT_BFLOAT16:[1,1,512,32],DT_BFLOAT16:[1,512,32,128]","[DT_BFLOAT16:(1,1,32,128)]",4194304,AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 512, 32], [1, 512, 32, 128], [1, 1, 32, 128]]",1,132352,128,32,1,128,512,0,1981,4235264,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,724,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.276027989047764,1199.9990789949043,0.004564546091774568,0.9999992324957535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"Add(a=1x1x32x128,b=1x1x32x128,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,16384,"DT_BFLOAT16:[1,1,32,128]","[DT_BFLOAT16:(1,1,32,128)]",4096,AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,Add,0,[],Add,,,,,0,,,,,0,1,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.008192,30.517578125,0.0019047619047619048,0.025431315104166668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+15,Attention-serving-decode-Attention_output,"XlaEinsum(a=1x1x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,16384,Memory,26055,15635,26055,0,0,0,0,0,0,0,0,15635,3900,0,0,33570816,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,1,4096)]",33554432,AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 1, 32, 128], [32, 128, 4096], [1, 1, 4096]]",1,8398848,1024,1,1,4096,4096,0,15635,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5726,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2878308194204569,1199.9715520653426,0.004606766610220843,0.9999762933877855,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +16,Attention-serving-decode-Attention_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionlayernormYnormLayerNormy,VPU,16384,Memory,500,8,500,0,0,0,0,0,0,0,0,0,8,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,8,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,36,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.015238095238095238,0.025431315104166668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,Fwd-FFN-serving-decoder-FFgate,"XlaEinsum(a=1x1x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,16384,Memory,91175,54644,91175,0,0,0,0,0,0,0,0,54644,13653,0,0,117477376,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,1,14336)]",117440512,FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 14336], [1, 1, 
14336]]",1,29390848,3584,1,1,14336,4096,0,54644,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,20020,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2880780038387716,1199.9926764506786,0.004607650826460808,0.9999938970422322,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Fwd-FFN-serving-decoder-FFup,"XlaEinsum(a=1x1x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,MXU,16384,Memory,91175,54644,91175,0,0,0,0,0,0,0,0,54644,13653,0,0,117477376,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,1,14336)]",117440512,FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 14336], [1, 1, 14336]]",1,29390848,3584,1,1,14336,4096,0,54644,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,20020,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2880780038387716,1199.9926764506786,0.004607650826460808,0.9999938970422322,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Fwd-FFN-serving-decoder-FFgate_up,"Mul(a=1x1x14336,b=1x1x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,VPU,16384,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,57344,"DT_BFLOAT16:[1,1,14336]","[DT_BFLOAT16:(1,1,14336)]",14336,FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,4,57344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.028672,106.8115234375,0.006666666666666667,0.08900960286458333,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+20,Fwd-FFN-serving-decoder-FFoutput,"XlaEinsum(a=1x1x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,16384,Memory,91175,54644,91175,0,0,0,0,0,0,0,0,54644,13653,0,0,117477376,"DT_BFLOAT16:[1,1,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,1,4096)]",117440512,FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 1, 14336], [14336, 4096], [1, 1, 4096]]",1,8398848,3584,1,1,4096,14336,0,54644,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,20020,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2880780038387716,1199.9926764506786,0.004607650826460808,0.9999938970422322,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,FFN-serving-decoder-AttnPlusFFn,"Add(a=1x1x4096,b=1x1x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",4096,FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,Add,0,[],Add,,,,,0,,,,,0,1,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.008192,30.517578125,0.0019047619047619048,0.025431315104166668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_decode.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_decode.json new file mode 100644 index 0000000..d2c7342 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_decode.json @@ -0,0 +1,98 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 
7205945344, + "overlapped_compute_time_non_pp_ns": 4289871872, + "compute_only_time_non_pp_ns": 0, + "memory_only_time_non_pp_ns": 2916073472, + "ici_bound_time_non_pp_ns": 0, + "total_execution_time_chip_ns": 7205945344, + "overlapped_compute_time_chip_ns": 4289871872, + "compute_only_time_chip_ns": 0, + "memory_only_time_chip_ns": 2916073472, + "ici_bound_time_chip_ns": 0, + "bounded_by_pp_chip": false, + "TPOT_ms_request": 14.074112, + "throughput_tokens_per_sec": 71.05244011131929, + "throughput_tokens_per_sec_request": 71.05244011131929, + "mem_footprint_GB": 22.5625, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "4", + "num_sa": 8, + "num_vu": 4, + "num_vu_ports": 4, + "hbm_bw_GBps": 1200.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 128, + "freq_GHz": 1.05, + "sa_dim": 128, + "hbm_size_GB": 32, + "ici_bw_GBps": 112.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 300.0, + "min_power_W": 121.0, + "avg_power_W": 170.0, + "max_power_W": 192.0, + "HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 366.0, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 1.222, + "static_power_W_per_vu": 0.427282, + "static_power_vmem_W": 21.777552, + "static_power_ici_W": 5.499, + "static_power_hbm_mc_W": 4.006409544, + "static_power_hbm_phy_W": 6.009614316, + "static_power_other_W": 41.22229614, + "dynamic_power_W_per_SA": 16.91648, + "dynamic_power_W_per_VU": 1.591296, + "dynamic_power_vmem_W": 30.110208, + "dynamic_power_ici_W_per_GBps": 0.01767315271, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + 
"pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 0, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": true, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_prefill.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_prefill.csv new file mode 100644 index 0000000..04c1893 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_prefill.csv @@ -0,0 +1,13 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU 
Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,52084,31208,52084,0,0,0,0,0,0,0,0,0,31208,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,31208,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11435,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.576947392673374,1199.9846401966056,0.5991786162279981,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+3,-Q-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",Q3MatMulQ,MXU,32,Compute,499353,499353,78125,0,0,0,0,0,0,0,0,499353,124830,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,Q3MatMulQ,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,50331648,32768,1,4096,4096,4096,0,499353,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,130287,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2340598174037,187.7429393635364,0.9845540715766788,0.15645244946961367,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-K-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",K3MatMulK,MXU,32,Compute,499353,499353,78125,0,0,0,0,0,0,0,0,499353,124830,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,K3MatMulK,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,50331648,32768,1,4096,4096,4096,0,499353,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,130287,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2340598174037,187.7429393635364,0.9845540715766788,0.15645244946961367,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-V-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",V3MatMulV,MXU,32,Compute,499353,499353,78125,0,0,0,0,0,0,0,0,499353,124830,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,V3MatMulV,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 
128]]",1,50331648,32768,1,4096,4096,4096,0,499353,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,130287,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2340598174037,187.7429393635364,0.9845540715766788,0.15645244946961367,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-FlashAttention-4,"FlashAttention(q=1x4096x32x128,k=1x4096x32x128,v=1x4096x32x128,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",FlashAttention4FlashAttention,MXU,32,Compute,998706,998706,104167,0,0,0,0,0,0,0,0,998706,748981,0,0,134217728,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,32,32)]",70866960384,FlashAttention4FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,4227072,65536,32,4096,4096,128,499321,998706,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,256942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,70.95878104667439,125.16195957569094,0.2538303465783625,0.10430163297974246,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,-Attention_output-5,"XlaEinsum(a=1x4096x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",Attentionoutput5MatMulattnOutputattnAvgWo,MXU,32,Compute,499353,499353,78125,0,0,0,0,0,0,0,0,499353,124830,0,0,100663296,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",137438953472,Attentionoutput5MatMulattnOutputattnAvgWo,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 32, 128], [32, 128, 4096], [1, 4096, 4096]]",1,50331648,32768,1,4096,4096,4096,0,499353,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,130287,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2340598174037,187.7429393635364,0.9845540715766788,0.15645244946961367,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+6,-Attention_layernorm-6,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",Attentionlayernorm6YnormLayerNormy,VPU,32,Memory,52084,31208,52084,0,0,0,0,0,0,0,0,0,31208,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,Attentionlayernorm6YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,31208,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11435,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.576947392673374,1199.9846401966056,0.5991786162279981,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,1747658,1747658,208334,0,0,0,0,0,0,0,0,1747658,436906,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 14336], [1, 4096, 14336]]",1,155189248,114688,1,4096,14336,4096,0,1747658,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,451446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2462650884784,143.04858273186173,0.9845977316866932,0.11920715227655145,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,1747658,1747658,208334,0,0,0,0,0,0,0,0,1747658,436906,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 14336], [1, 4096, 
14336]]",1,155189248,114688,1,4096,14336,4096,0,1747658,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,451446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2462650884784,143.04858273186173,0.9845977316866932,0.11920715227655145,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x14336,b=1x4096x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,182292,13654,182292,0,0,0,0,0,0,0,0,0,13654,0,0,234881024,"DT_BFLOAT16:[1,4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",58720256,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,13654,234881024,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16129,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.3221219581769908,1199.9978057182982,0.07489814875767085,0.9999981714319152,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,1747658,1747658,208334,0,0,0,0,0,0,0,0,1747658,436906,0,0,268435456,"DT_BFLOAT16:[1,4096,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,4096,4096)]",481036337152,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 4096, 3584], [3584, 4096], [1, 4096, 4096]]",1,50331648,114688,1,4096,4096,14336,0,1747658,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,451446,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2462650884784,143.04858273186173,0.9845977316866932,0.11920715227655145,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+11,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x4096,b=1x4096x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,52084,3901,52084,0,0,0,0,0,0,0,0,0,3901,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",16777216,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,3901,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4608,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32211842408417174,1199.9846401966056,0.07489732702849976,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_prefill.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_prefill.json new file mode 100644 index 0000000..a1dcc08 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_prefill.json @@ -0,0 +1,97 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 274484352, + "overlapped_compute_time_non_pp_ns": 35892480, + "compute_only_time_non_pp_ns": 230317536, + "memory_only_time_non_pp_ns": 8274336, + "ici_bound_time_non_pp_ns": 0, + "total_execution_time_chip_ns": 274484352, + "overlapped_compute_time_chip_ns": 35892480, + "compute_only_time_chip_ns": 230317536, + "memory_only_time_chip_ns": 8274336, + "ici_bound_time_chip_ns": 0, + "bounded_by_pp_chip": false, + "throughput_tokens_per_sec": 14922.526439685713, + "TTFT_sec": 0.274484352, + "mem_footprint_GB": 21.5, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "4", + "num_sa": 8, + "num_vu": 4, + "num_vu_ports": 4, + "hbm_bw_GBps": 1200.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 128, + "freq_GHz": 1.05, + "sa_dim": 
128, + "hbm_size_GB": 32, + "ici_bw_GBps": 112.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 300.0, + "min_power_W": 121.0, + "avg_power_W": 170.0, + "max_power_W": 192.0, + "HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 366.0, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 1.222, + "static_power_W_per_vu": 0.427282, + "static_power_vmem_W": 21.777552, + "static_power_ici_W": 5.499, + "static_power_hbm_mc_W": 4.006409544, + "static_power_hbm_phy_W": 6.009614316, + "static_power_other_W": 41.22229614, + "dynamic_power_W_per_SA": 16.91648, + "dynamic_power_W_per_VU": 1.591296, + "dynamic_power_vmem_W": 30.110208, + "dynamic_power_ici_W_per_GBps": 0.01767315271, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 0, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": true, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No 
newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv new file mode 100644 index 0000000..1b16b81 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv @@ -0,0 +1,29 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling 
Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,22604,12851,22604,0,0,0,0,0,0,0,0,0,12851,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,12851,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4708,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.937786586444877,2764.997345602548,0.5684921287573602,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-Q-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",Q3MatMulQ,MXU,32,Compute,308424,308424,33906,0,0,0,0,0,0,0,0,308424,51401,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,Q3MatMulQ,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,50331648,32768,1,4096,4096,4096,0,308424,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.61692174409256,303.96467200996034,0.9770381121255398,0.10993297360215563,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-K-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",K3MatMulK,MXU,32,Compute,308424,308424,33906,0,0,0,0,0,0,0,0,308424,51401,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,K3MatMulK,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 
128]]",1,50331648,32768,1,4096,4096,4096,0,308424,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.61692174409256,303.96467200996034,0.9770381121255398,0.10993297360215563,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-V-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",V3MatMulV,MXU,32,Compute,308424,308424,33906,0,0,0,0,0,0,0,0,308424,51401,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,V3MatMulV,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,50331648,32768,1,4096,4096,4096,0,308424,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.61692174409256,303.96467200996034,0.9770381121255398,0.10993297360215563,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-FlashAttention-4,"FlashAttention(q=1x4096x32x128,k=1x4096x32x128,v=1x4096x32x128,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",FlashAttention4FlashAttention,MXU,32,Compute,616848,616848,45208,0,0,0,0,0,0,0,0,616848,308405,0,0,134217728,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,32,32)]",70866960384,FlashAttention4FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,4227072,65536,32,4096,4096,128,205603,616848,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,105799,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,114.88561263714885,202.64311467330688,0.2518926382823657,0.07328864906810376,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+5,-Attention_output-5,"XlaEinsum(a=1x4096x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",Attentionoutput5MatMulattnOutputattnAvgWo,MXU,32,Compute,308424,308424,33906,0,0,0,0,0,0,0,0,308424,51401,0,0,100663296,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",137438953472,Attentionoutput5MatMulattnOutputattnAvgWo,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 32, 128], [32, 128, 4096], [1, 4096, 4096]]",1,50331648,32768,1,4096,4096,4096,0,308424,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.61692174409256,303.96467200996034,0.9770381121255398,0.10993297360215563,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,-Attention_layernorm-6,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",Attentionlayernorm6YnormLayerNormy,VPU,32,Memory,22604,12851,22604,0,0,0,0,0,0,0,0,0,12851,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,Attentionlayernorm6YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,12851,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4708,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.937786586444877,2764.997345602548,0.5684921287573602,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,1079436,1079436,90416,0,0,0,0,0,0,0,0,1079436,179902,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 14336], [1, 4096, 
14336]]",1,155189248,114688,1,4096,14336,4096,0,1079436,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.63673728873226,231.60242941684362,0.9770815587304168,0.08376218062091993,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,1079436,1079436,90416,0,0,0,0,0,0,0,0,1079436,179902,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 14336], [1, 4096, 14336]]",1,155189248,114688,1,4096,14336,4096,0,1079436,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.63673728873226,231.60242941684362,0.9770815587304168,0.08376218062091993,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x14336,b=1x4096x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,79114,5623,79114,0,0,0,0,0,0,0,0,0,5623,0,0,234881024,"DT_BFLOAT16:[1,4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",58720256,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,5623,234881024,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6641,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7422233233056096,2764.997345602548,0.07106151609467003,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+10,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,1079436,1079436,90416,0,0,0,0,0,0,0,0,1079436,179902,0,0,268435456,"DT_BFLOAT16:[1,4096,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,4096,4096)]",481036337152,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 4096, 3584], [3584, 4096], [1, 4096, 4096]]",1,50331648,114688,1,4096,4096,14336,0,1079436,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.63673728873226,231.60242941684362,0.9770815587304168,0.08376218062091993,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +11,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x4096,b=1x4096x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,22604,1607,22604,0,0,0,0,0,0,0,0,0,1607,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",16777216,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,1607,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1897,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7422233233056096,2764.997345602548,0.07106151609467003,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+12,Attention-serving-decode-Input_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeInputlayernormXnormLayerNormX,VPU,16384,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,4,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,34,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.0062745098039215675,0.011037098779385171,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,MXU,16384,Memory,11308,9657,11308,0,0,0,0,0,0,0,0,9657,1606,0,0,33570816,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,1,32,128)]",33554432,AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 1, 4096], [4096, 32, 128], [1, 1, 32, 128]]",1,8398848,1024,1,1,4096,4096,0,9657,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9673180049522463,2764.8796240769807,0.006505997955121639,0.9999564644039713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=2x4096x32x128,eq=BLM;TMND->BTLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,MXU,16384,Memory,22613,19295,22613,0,0,0,0,0,0,0,0,19295,3212,0,0,67133440,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[2,4096,32,128]","[DT_BFLOAT16:(1,2,1,32,128)]",67108864,AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,Einsum,67108864,[],Einsum,"BLM,TMND->BTLND","[[1, 1, 4096], [2, 4096, 32, 128], [1, 2, 1, 32, 
128]]",1,16795648,2048,1,1,8192,4096,0,19295,67133440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4712,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9677116702781587,2764.909042745047,0.006506861086677177,0.9999671040669248,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x32x128,b=1x4096x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,MXU,16384,Memory,11394,9657,11394,0,0,0,0,0,0,0,0,9657,1606,0,0,33824768,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,1,4096,32)]",33554432,AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 32, 128], [1, 4096, 32, 128], [1, 1, 4096, 32]]",1,1057024,1024,32,1,4096,128,0,9657,33824768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2363,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.944921186589433,2764.768300818962,0.006456891774312402,0.9999162028278343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x32x128,b=1x512x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,MXU,16384,Memory,1427,1224,1427,0,0,0,0,0,0,0,0,1224,201,0,0,4235264,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[1,512,32,128]","[DT_BFLOAT16:(1,1,512,32)]",4194304,AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 32, 128], [1, 512, 32, 128], [1, 1, 512, 32]]",1,132352,128,32,1,512,128,0,1224,4235264,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,298,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9392459705676246,2764.1184111115977,0.006444448570122241,0.99968116134235,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+14,Attention-serving-decode-Softmax(Q*K)*V,"Softmax(x=1x1x4608x32,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,16384,Memory,500,57,500,0,0,0,0,0,0,0,0,0,57,0,0,589824,"DT_BFLOAT16:[1,1,4608,32]","[DT_BFLOAT16:(1,1,4608,32)]",589824,AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,57,589824,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,47,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.179648,1098.6328125,0.11294117647058823,0.3973355560578662,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x4096x32,b=1x4096x32x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,MXU,16384,Memory,11394,9657,11394,0,0,0,0,0,0,0,0,9657,1606,0,0,33824768,"DT_BFLOAT16:[1,1,4096,32],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,1,32,128)]",33554432,AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 4096, 32], [1, 4096, 32, 128], [1, 1, 32, 128]]",1,264448,1024,32,1,128,4096,0,9657,33824768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2363,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.944921186589433,2764.768300818962,0.006456891774312402,0.9999162028278343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x512x32,b=1x512x32x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,MXU,16384,Memory,1427,1224,1427,0,0,0,0,0,0,0,0,1224,201,0,0,4235264,"DT_BFLOAT16:[1,1,512,32],DT_BFLOAT16:[1,512,32,128]","[DT_BFLOAT16:(1,1,32,128)]",4194304,AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 512, 
32], [1, 512, 32, 128], [1, 1, 32, 128]]",1,132352,128,32,1,128,512,0,1224,4235264,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,298,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9392459705676246,2764.1184111115977,0.006444448570122241,0.99968116134235,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"Add(a=1x1x32x128,b=1x1x32x128,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,16384,"DT_BFLOAT16:[1,1,32,128]","[DT_BFLOAT16:(1,1,32,128)]",4096,AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,Add,0,[],Add,,,,,0,,,,,0,1,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.008192,30.517578125,0.0007843137254901959,0.011037098779385171,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Attention-serving-decode-Attention_output,"XlaEinsum(a=1x1x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,16384,Memory,11308,9657,11308,0,0,0,0,0,0,0,0,9657,1606,0,0,33570816,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,1,4096)]",33554432,AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 1, 32, 128], [32, 128, 4096], [1, 1, 4096]]",1,8398848,1024,1,1,4096,4096,0,9657,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9673180049522463,2764.8796240769807,0.006505997955121639,0.9999564644039713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+16,Attention-serving-decode-Attention_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionlayernormYnormLayerNormy,VPU,16384,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,4,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,34,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.0062745098039215675,0.011037098779385171,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,Fwd-FFN-serving-decoder-FFgate,"XlaEinsum(a=1x1x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,16384,Memory,39570,33751,39570,0,0,0,0,0,0,0,0,33751,5622,0,0,117477376,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,1,14336)]",117440512,FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 14336], [1, 1, 14336]]",1,29390848,3584,1,1,14336,4096,0,33751,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8243,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9679179176143546,2764.9565902297354,0.006507313294612188,0.9999843002639188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Fwd-FFN-serving-decoder-FFup,"XlaEinsum(a=1x1x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,MXU,16384,Memory,39570,33751,39570,0,0,0,0,0,0,0,0,33751,5622,0,0,117477376,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,1,14336)]",117440512,FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 14336], [1, 1, 
14336]]",1,29390848,3584,1,1,14336,4096,0,33751,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8243,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9679179176143546,2764.9565902297354,0.006507313294612188,0.9999843002639188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Fwd-FFN-serving-decoder-FFgate_up,"Mul(a=1x1x14336,b=1x1x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,57344,"DT_BFLOAT16:[1,1,14336]","[DT_BFLOAT16:(1,1,14336)]",14336,FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,2,57344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.028672,106.8115234375,0.002745098039215686,0.038629845727848104,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Fwd-FFN-serving-decoder-FFoutput,"XlaEinsum(a=1x1x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,16384,Memory,39570,33751,39570,0,0,0,0,0,0,0,0,33751,5622,0,0,117477376,"DT_BFLOAT16:[1,1,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,1,4096)]",117440512,FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 1, 14336], [14336, 4096], [1, 1, 4096]]",1,8398848,3584,1,1,4096,14336,0,33751,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8243,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9679179176143546,2764.9565902297354,0.006507313294612188,0.9999843002639188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+21,FFN-serving-decoder-AttnPlusFFn,"Add(a=1x1x4096,b=1x1x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",4096,FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,Add,0,[],Add,,,,,0,,,,,0,1,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.008192,30.517578125,0.0007843137254901959,0.011037098779385171,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_decode.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_decode.csv new file mode 100644 index 0000000..38b810c --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_decode.csv @@ -0,0 +1,17 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem 
time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +12,Attention-serving-decode-Input_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeInputlayernormXnormLayerNormX,VPU,16384,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,4,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,34,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.0062745098039215675,0.011037098779385171,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,MXU,16384,Memory,11308,9657,11308,0,0,0,0,0,0,0,0,9657,1606,0,0,33570816,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,1,32,128)]",33554432,AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 1, 4096], [4096, 32, 128], [1, 1, 32, 
128]]",1,8398848,1024,1,1,4096,4096,0,9657,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9673180049522463,2764.8796240769807,0.006505997955121639,0.9999564644039713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=2x4096x32x128,eq=BLM;TMND->BTLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,MXU,16384,Memory,22613,19295,22613,0,0,0,0,0,0,0,0,19295,3212,0,0,67133440,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[2,4096,32,128]","[DT_BFLOAT16:(1,2,1,32,128)]",67108864,AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,Einsum,67108864,[],Einsum,"BLM,TMND->BTLND","[[1, 1, 4096], [2, 4096, 32, 128], [1, 2, 1, 32, 128]]",1,16795648,2048,1,1,8192,4096,0,19295,67133440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4712,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9677116702781587,2764.909042745047,0.006506861086677177,0.9999671040669248,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x32x128,b=1x4096x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,MXU,16384,Memory,11394,9657,11394,0,0,0,0,0,0,0,0,9657,1606,0,0,33824768,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,1,4096,32)]",33554432,AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 32, 128], [1, 4096, 32, 128], [1, 1, 4096, 32]]",1,1057024,1024,32,1,4096,128,0,9657,33824768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2363,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.944921186589433,2764.768300818962,0.006456891774312402,0.9999162028278343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x32x128,b=1x512x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,MXU,16384,Memory,1427,1224,1427,0,0,0,0,0,0,0,0,1224,201,0,0,4235264,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[1,512,32,128]","[DT_BFLOAT16:(1,1,512,32)]",4194304,AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 32, 128], [1, 512, 32, 128], [1, 1, 512, 32]]",1,132352,128,32,1,512,128,0,1224,4235264,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,298,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9392459705676246,2764.1184111115977,0.006444448570122241,0.99968116134235,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"Softmax(x=1x1x4608x32,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,16384,Memory,500,57,500,0,0,0,0,0,0,0,0,0,57,0,0,589824,"DT_BFLOAT16:[1,1,4608,32]","[DT_BFLOAT16:(1,1,4608,32)]",589824,AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,57,589824,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,47,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.179648,1098.6328125,0.11294117647058823,0.3973355560578662,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x4096x32,b=1x4096x32x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,MXU,16384,Memory,11394,9657,11394,0,0,0,0,0,0,0,0,9657,1606,0,0,33824768,"DT_BFLOAT16:[1,1,4096,32],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,1,32,128)]",33554432,AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 4096, 32], [1, 4096, 32, 128], [1, 1, 
32, 128]]",1,264448,1024,32,1,128,4096,0,9657,33824768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2363,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.944921186589433,2764.768300818962,0.006456891774312402,0.9999162028278343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x512x32,b=1x512x32x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,MXU,16384,Memory,1427,1224,1427,0,0,0,0,0,0,0,0,1224,201,0,0,4235264,"DT_BFLOAT16:[1,1,512,32],DT_BFLOAT16:[1,512,32,128]","[DT_BFLOAT16:(1,1,32,128)]",4194304,AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 512, 32], [1, 512, 32, 128], [1, 1, 32, 128]]",1,132352,128,32,1,128,512,0,1224,4235264,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,298,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9392459705676246,2764.1184111115977,0.006444448570122241,0.99968116134235,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"Add(a=1x1x32x128,b=1x1x32x128,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,16384,"DT_BFLOAT16:[1,1,32,128]","[DT_BFLOAT16:(1,1,32,128)]",4096,AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,Add,0,[],Add,,,,,0,,,,,0,1,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.008192,30.517578125,0.0007843137254901959,0.011037098779385171,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+15,Attention-serving-decode-Attention_output,"XlaEinsum(a=1x1x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,16384,Memory,11308,9657,11308,0,0,0,0,0,0,0,0,9657,1606,0,0,33570816,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,1,4096)]",33554432,AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 1, 32, 128], [32, 128, 4096], [1, 1, 4096]]",1,8398848,1024,1,1,4096,4096,0,9657,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9673180049522463,2764.8796240769807,0.006505997955121639,0.9999564644039713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +16,Attention-serving-decode-Attention_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionlayernormYnormLayerNormy,VPU,16384,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,4,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,34,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.0062745098039215675,0.011037098779385171,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,Fwd-FFN-serving-decoder-FFgate,"XlaEinsum(a=1x1x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,16384,Memory,39570,33751,39570,0,0,0,0,0,0,0,0,33751,5622,0,0,117477376,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,1,14336)]",117440512,FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 14336], [1, 1, 
14336]]",1,29390848,3584,1,1,14336,4096,0,33751,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8243,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9679179176143546,2764.9565902297354,0.006507313294612188,0.9999843002639188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Fwd-FFN-serving-decoder-FFup,"XlaEinsum(a=1x1x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,MXU,16384,Memory,39570,33751,39570,0,0,0,0,0,0,0,0,33751,5622,0,0,117477376,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,1,14336)]",117440512,FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 14336], [1, 1, 14336]]",1,29390848,3584,1,1,14336,4096,0,33751,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8243,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9679179176143546,2764.9565902297354,0.006507313294612188,0.9999843002639188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Fwd-FFN-serving-decoder-FFgate_up,"Mul(a=1x1x14336,b=1x1x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,57344,"DT_BFLOAT16:[1,1,14336]","[DT_BFLOAT16:(1,1,14336)]",14336,FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,2,57344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.028672,106.8115234375,0.002745098039215686,0.038629845727848104,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+20,Fwd-FFN-serving-decoder-FFoutput,"XlaEinsum(a=1x1x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,16384,Memory,39570,33751,39570,0,0,0,0,0,0,0,0,33751,5622,0,0,117477376,"DT_BFLOAT16:[1,1,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,1,4096)]",117440512,FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 1, 14336], [14336, 4096], [1, 1, 4096]]",1,8398848,3584,1,1,4096,14336,0,33751,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8243,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9679179176143546,2764.9565902297354,0.006507313294612188,0.9999843002639188,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,FFN-serving-decoder-AttnPlusFFn,"Add(a=1x1x4096,b=1x1x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",4096,FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,Add,0,[],Add,,,,,0,,,,,0,1,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.008192,30.517578125,0.0007843137254901959,0.011037098779385171,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_decode.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_decode.json new file mode 100644 index 0000000..dc6f75a --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_decode.json @@ -0,0 +1,98 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 
3155247104, + "overlapped_compute_time_non_pp_ns": 2649178112, + "compute_only_time_non_pp_ns": 0, + "memory_only_time_non_pp_ns": 506068992, + "ici_bound_time_non_pp_ns": 0, + "total_execution_time_chip_ns": 3155247104, + "overlapped_compute_time_chip_ns": 2649178112, + "compute_only_time_chip_ns": 0, + "memory_only_time_chip_ns": 506068992, + "ici_bound_time_chip_ns": 0, + "bounded_by_pp_chip": false, + "TPOT_ms_request": 6.162592, + "throughput_tokens_per_sec": 162.2693827532311, + "throughput_tokens_per_sec_request": 162.2693827532311, + "mem_footprint_GB": 22.5625, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "5p", + "num_sa": 8, + "num_vu": 6, + "num_vu_ports": 6, + "hbm_bw_GBps": 2765.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 128, + "freq_GHz": 1.7, + "sa_dim": 128, + "hbm_size_GB": 95, + "ici_bw_GBps": 200.0, + "dcn_bw_GBps": 25.0, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 350.0, + "min_power_W": 1.0, + "avg_power_W": 1.0, + "max_power_W": 331.0, + "HBM_GBps_per_W": 123.5, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 585.0, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 1.35868996, + "static_power_W_per_vu": 0.475076728, + "static_power_vmem_W": 24.21353615, + "static_power_ici_W": 6.114104803, + "static_power_hbm_mc_W": 10.264041296, + "static_power_hbm_phy_W": 15.396061944, + "static_power_other_W": 44.82811018, + "dynamic_power_W_per_SA": 28.19413333, + "dynamic_power_W_per_VU": 2.65216, + "dynamic_power_vmem_W": 50.18368, + "dynamic_power_ici_W_per_GBps": 0.01767315271, + "dynamic_power_hbm_W_per_GBps": 0.01261538462, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + 
"pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 0, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": true, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_prefill.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_prefill.csv new file mode 100644 index 0000000..1121c8b --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_prefill.csv @@ -0,0 +1,13 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU 
Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,22604,12851,22604,0,0,0,0,0,0,0,0,0,12851,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,12851,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4708,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.937786586444877,2764.997345602548,0.5684921287573602,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+3,-Q-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",Q3MatMulQ,MXU,32,Compute,308424,308424,33906,0,0,0,0,0,0,0,0,308424,51401,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,Q3MatMulQ,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,50331648,32768,1,4096,4096,4096,0,308424,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.61692174409256,303.96467200996034,0.9770381121255398,0.10993297360215563,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-K-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",K3MatMulK,MXU,32,Compute,308424,308424,33906,0,0,0,0,0,0,0,0,308424,51401,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,K3MatMulK,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,50331648,32768,1,4096,4096,4096,0,308424,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.61692174409256,303.96467200996034,0.9770381121255398,0.10993297360215563,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-V-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",V3MatMulV,MXU,32,Compute,308424,308424,33906,0,0,0,0,0,0,0,0,308424,51401,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,V3MatMulV,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 
128]]",1,50331648,32768,1,4096,4096,4096,0,308424,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.61692174409256,303.96467200996034,0.9770381121255398,0.10993297360215563,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-FlashAttention-4,"FlashAttention(q=1x4096x32x128,k=1x4096x32x128,v=1x4096x32x128,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",FlashAttention4FlashAttention,MXU,32,Compute,616848,616848,45208,0,0,0,0,0,0,0,0,616848,308405,0,0,134217728,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,32,32)]",70866960384,FlashAttention4FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,4227072,65536,32,4096,4096,128,205603,616848,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,105799,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,114.88561263714885,202.64311467330688,0.2518926382823657,0.07328864906810376,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,-Attention_output-5,"XlaEinsum(a=1x4096x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",Attentionoutput5MatMulattnOutputattnAvgWo,MXU,32,Compute,308424,308424,33906,0,0,0,0,0,0,0,0,308424,51401,0,0,100663296,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",137438953472,Attentionoutput5MatMulattnOutputattnAvgWo,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 32, 128], [32, 128, 4096], [1, 4096, 4096]]",1,50331648,32768,1,4096,4096,4096,0,308424,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.61692174409256,303.96467200996034,0.9770381121255398,0.10993297360215563,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+6,-Attention_layernorm-6,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",Attentionlayernorm6YnormLayerNormy,VPU,32,Memory,22604,12851,22604,0,0,0,0,0,0,0,0,0,12851,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,Attentionlayernorm6YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,12851,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4708,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.937786586444877,2764.997345602548,0.5684921287573602,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,1079436,1079436,90416,0,0,0,0,0,0,0,0,1079436,179902,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 14336], [1, 4096, 14336]]",1,155189248,114688,1,4096,14336,4096,0,1079436,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.63673728873226,231.60242941684362,0.9770815587304168,0.08376218062091993,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,1079436,1079436,90416,0,0,0,0,0,0,0,0,1079436,179902,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 14336], [1, 4096, 
14336]]",1,155189248,114688,1,4096,14336,4096,0,1079436,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.63673728873226,231.60242941684362,0.9770815587304168,0.08376218062091993,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x14336,b=1x4096x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,79114,5623,79114,0,0,0,0,0,0,0,0,0,5623,0,0,234881024,"DT_BFLOAT16:[1,4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",58720256,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,5623,234881024,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6641,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7422233233056096,2764.997345602548,0.07106151609467003,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,1079436,1079436,90416,0,0,0,0,0,0,0,0,1079436,179902,0,0,268435456,"DT_BFLOAT16:[1,4096,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,4096,4096)]",481036337152,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 4096, 3584], [3584, 4096], [1, 4096, 4096]]",1,50331648,114688,1,4096,4096,14336,0,1079436,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.63673728873226,231.60242941684362,0.9770815587304168,0.08376218062091993,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+11,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x4096,b=1x4096x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,22604,1607,22604,0,0,0,0,0,0,0,0,0,1607,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",16777216,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,1607,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1897,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7422233233056096,2764.997345602548,0.07106151609467003,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_prefill.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_prefill.json new file mode 100644 index 0000000..260f244 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_prefill.json @@ -0,0 +1,97 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 167544896, + "overlapped_compute_time_non_pp_ns": 15520384, + "compute_only_time_non_pp_ns": 148376704, + "memory_only_time_non_pp_ns": 3647808, + "ici_bound_time_non_pp_ns": 0, + "total_execution_time_chip_ns": 167544896, + "overlapped_compute_time_chip_ns": 15520384, + "compute_only_time_chip_ns": 148376704, + "memory_only_time_chip_ns": 3647808, + "ici_bound_time_chip_ns": 0, + "bounded_by_pp_chip": false, + "throughput_tokens_per_sec": 24447.178623692602, + "TTFT_sec": 0.167544896, + "mem_footprint_GB": 21.5, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "5p", + "num_sa": 8, + "num_vu": 6, + "num_vu_ports": 6, + "hbm_bw_GBps": 2765.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 128, + "freq_GHz": 1.7, + "sa_dim": 
128, + "hbm_size_GB": 95, + "ici_bw_GBps": 200.0, + "dcn_bw_GBps": 25.0, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 350.0, + "min_power_W": 1.0, + "avg_power_W": 1.0, + "max_power_W": 331.0, + "HBM_GBps_per_W": 123.5, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 585.0, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 1.35868996, + "static_power_W_per_vu": 0.475076728, + "static_power_vmem_W": 24.21353615, + "static_power_ici_W": 6.114104803, + "static_power_hbm_mc_W": 10.264041296, + "static_power_hbm_phy_W": 15.396061944, + "static_power_other_W": 44.82811018, + "dynamic_power_W_per_SA": 28.19413333, + "dynamic_power_W_per_VU": 2.65216, + "dynamic_power_vmem_W": 50.18368, + "dynamic_power_ici_W_per_GBps": 0.01767315271, + "dynamic_power_hbm_W_per_GBps": 0.01261538462, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 0, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": true, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 
32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv new file mode 100644 index 0000000..636b063 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv @@ -0,0 +1,29 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency 
(%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,8446,8192,8446,0,0,0,0,0,0,0,0,0,8192,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,8192,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3001,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.891277291025338,7399.952640303102,0.9699265924698082,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-Q-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",Q3MatMulQ,MXU,32,Compute,65568,65568,12669,0,0,0,0,0,0,0,0,65568,8192,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,Q3MatMulQ,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,67108864,4096,1,4096,4096,4096,0,65568,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.1284997559783,1429.8133235724745,0.9917638023463893,0.19321801669898303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-K-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",K3MatMulK,MXU,32,Compute,65568,65568,12669,0,0,0,0,0,0,0,0,65568,8192,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,K3MatMulK,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 
128]]",1,67108864,4096,1,4096,4096,4096,0,65568,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.1284997559783,1429.8133235724745,0.9917638023463893,0.19321801669898303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-V-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",V3MatMulV,MXU,32,Compute,65568,65568,12669,0,0,0,0,0,0,0,0,65568,8192,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,V3MatMulV,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,67108864,4096,1,4096,4096,4096,0,65568,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.1284997559783,1429.8133235724745,0.9917638023463893,0.19321801669898303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-FlashAttention-4,"FlashAttention(q=1x4096x32x128,k=1x4096x32x128,v=1x4096x32x128,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",FlashAttention4FlashAttention,MXU,32,Compute,262208,262208,16892,0,0,0,0,0,0,0,0,262208,163840,0,0,134217728,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,32,32)]",70866960384,FlashAttention4FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 256]",,6356992,16384,32,4096,4096,128,131072,262208,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,67459,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,270.2700161093483,476.72077129607027,0.12787575707693094,0.0644217258508203,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+5,-Attention_output-5,"XlaEinsum(a=1x4096x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",Attentionoutput5MatMulattnOutputattnAvgWo,MXU,32,Compute,65568,65568,12669,0,0,0,0,0,0,0,0,65568,8192,0,0,100663296,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",137438953472,Attentionoutput5MatMulattnOutputattnAvgWo,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 32, 128], [32, 128, 4096], [1, 4096, 4096]]",1,67108864,4096,1,4096,4096,4096,0,65568,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.1284997559783,1429.8133235724745,0.9917638023463893,0.19321801669898303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,-Attention_layernorm-6,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",Attentionlayernorm6YnormLayerNormy,VPU,32,Memory,8446,8192,8446,0,0,0,0,0,0,0,0,0,8192,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,Attentionlayernorm6YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,8192,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3001,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.891277291025338,7399.952640303102,0.9699265924698082,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,229408,229408,33784,0,0,0,0,0,0,0,0,229408,28672,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 4096], [4096, 14336], [1, 4096, 
14336]]",1,192937984,14336,1,4096,14336,4096,0,229408,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.859469382062,1089.7614730087878,0.9921096538606685,0.14726506392010646,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,229408,229408,33784,0,0,0,0,0,0,0,0,229408,28672,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 4096], [4096, 14336], [1, 4096, 14336]]",1,192937984,14336,1,4096,14336,4096,0,229408,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.859469382062,1089.7614730087878,0.9921096538606685,0.14726506392010646,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x14336,b=1x4096x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,29561,3584,29561,0,0,0,0,0,0,0,0,0,3584,0,0,234881024,"DT_BFLOAT16:[1,4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",58720256,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,3584,234881024,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9864096613781672,7399.952640303102,0.12124082405872602,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+10,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,229408,229408,33784,0,0,0,0,0,0,0,0,229408,28672,0,0,268435456,"DT_BFLOAT16:[1,4096,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,4096,4096)]",481036337152,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 4096, 14336], [14336, 4096], [1, 4096, 4096]]",1,67108864,14336,1,4096,4096,14336,0,229408,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.859469382062,1089.7614730087878,0.9921096538606685,0.14726506392010646,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +11,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x4096,b=1x4096x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,8446,1024,8446,0,0,0,0,0,0,0,0,0,1024,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",16777216,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,1024,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1209,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9864096613781672,7399.952640303102,0.12124082405872602,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+12,Attention-serving-decode-Input_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeInputlayernormXnormLayerNormX,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.004,0.004123997043918919,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,MXU,16384,Memory,4226,4128,4226,0,0,0,0,0,0,0,0,4128,512,0,0,33570816,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,1,32,128)]",33554432,AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 1, 4096], [4096, 32, 128], [1, 1, 32, 128]]",1,16789504,256,1,1,4096,4096,0,4128,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1509,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.939998106956933,7398.310172518339,0.0037567366285489972,0.9997716449349107,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=2x4096x32x128,eq=BLM;TMND->BTLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,MXU,16384,Memory,8450,8224,8450,0,0,0,0,0,0,0,0,8224,1024,0,0,67133440,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[2,4096,32,128]","[DT_BFLOAT16:(1,2,1,32,128)]",67108864,AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,Einsum,67108864,[],Einsum,"BLM,TMND->BTLND","[[1, 1, 4096], [2, 4096, 32, 128], [1, 2, 1, 32, 
128]]",1,33574912,512,1,1,8192,4096,0,8224,67133440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3010,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.941877396449704,7399.158364922338,0.0037576257969817897,0.9998862655300457,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x32x128,b=1x4096x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,MXU,16384,Memory,4257,2048,4257,0,0,0,0,0,0,0,0,0,2048,0,0,33824768,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,1,4096,32)]",33554432,AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 32, 128], [1, 4096, 32, 128], [1, 1, 4096, 32]]",1,1057024,512,32,1,4096,128,0,2048,33824768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,480,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.8821780596664315,7399.992957371682,0.003729379608233042,0.9999990482934705,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x32x128,b=1x512x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,MXU,16384,Memory,534,256,534,0,0,0,0,0,0,0,0,0,256,0,0,4235264,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[1,512,32,128]","[DT_BFLOAT16:(1,1,512,32)]",4194304,AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 32, 128], [1, 512, 32, 128], [1, 1, 512, 32]]",1,132352,64,32,1,512,128,0,256,4235264,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.854501872659176,7386.511184749532,0.003716284876462561,0.9981771871283152,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+14,Attention-serving-decode-Softmax(Q*K)*V,"Softmax(x=1x1x4608x32,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,16384,Memory,500,36,500,0,0,0,0,0,0,0,0,0,36,0,0,589824,"DT_BFLOAT16:[1,1,4608,32]","[DT_BFLOAT16:(1,1,4608,32)]",589824,AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,36,589824,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,65,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.179648,1098.6328125,0.072,0.14846389358108109,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x4096x32,b=1x4096x32x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,MXU,16384,Memory,4257,2048,4257,0,0,0,0,0,0,0,0,0,2048,0,0,33824768,"DT_BFLOAT16:[1,1,4096,32],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,1,32,128)]",33554432,AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 4096, 32], [1, 4096, 32, 128], [1, 1, 32, 128]]",1,528640,512,32,1,128,4096,0,2048,33824768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,480,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.8821780596664315,7399.992957371682,0.003729379608233042,0.9999990482934705,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x512x32,b=1x512x32x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,MXU,16384,Memory,534,256,534,0,0,0,0,0,0,0,0,0,256,0,0,4235264,"DT_BFLOAT16:[1,1,512,32],DT_BFLOAT16:[1,512,32,128]","[DT_BFLOAT16:(1,1,32,128)]",4194304,AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 512, 32], [1, 512, 32, 128], [1, 
1, 32, 128]]",1,132352,64,32,1,128,512,0,256,4235264,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.854501872659176,7386.511184749532,0.003716284876462561,0.9981771871283152,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"Add(a=1x1x32x128,b=1x1x32x128,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,16384,"DT_BFLOAT16:[1,1,32,128]","[DT_BFLOAT16:(1,1,32,128)]",4096,AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,Add,0,[],Add,,,,,0,,,,,0,1,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.008192,30.517578125,0.0005,0.004123997043918919,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Attention-serving-decode-Attention_output,"XlaEinsum(a=1x1x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,16384,Memory,4226,4128,4226,0,0,0,0,0,0,0,0,4128,512,0,0,33570816,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,1,4096)]",33554432,AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 1, 32, 128], [32, 128, 4096], [1, 1, 4096]]",1,16789504,256,1,1,4096,4096,0,4128,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1509,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.939998106956933,7398.310172518339,0.0037567366285489972,0.9997716449349107,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+16,Attention-serving-decode-Attention_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionlayernormYnormLayerNormy,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.004,0.004123997043918919,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,Fwd-FFN-serving-decoder-FFgate,"XlaEinsum(a=1x1x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,16384,Memory,14786,14368,14786,0,0,0,0,0,0,0,0,14368,1792,0,0,117477376,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,1,14336)]",117440512,FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 14336], [1, 1, 14336]]",1,58753024,896,1,1,14336,4096,0,14368,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5261,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.942683078587853,7399.521998876682,0.0037580069980297725,0.9999354052536057,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Fwd-FFN-serving-decoder-FFup,"XlaEinsum(a=1x1x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,MXU,16384,Memory,14786,14368,14786,0,0,0,0,0,0,0,0,14368,1792,0,0,117477376,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,1,14336)]",117440512,FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 14336], [1, 1, 
14336]]",1,58753024,896,1,1,14336,4096,0,14368,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5261,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.942683078587853,7399.521998876682,0.0037580069980297725,0.9999354052536057,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Fwd-FFN-serving-decoder-FFgate_up,"Mul(a=1x1x14336,b=1x1x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,57344,"DT_BFLOAT16:[1,1,14336]","[DT_BFLOAT16:(1,1,14336)]",14336,FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,1,57344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.028672,106.8115234375,0.00175,0.014433989653716216,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Fwd-FFN-serving-decoder-FFoutput,"XlaEinsum(a=1x1x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,16384,Memory,14786,14368,14786,0,0,0,0,0,0,0,0,14368,1792,0,0,117477376,"DT_BFLOAT16:[1,1,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,1,4096)]",117440512,FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 1, 14336], [14336, 4096], [1, 1, 4096]]",1,16789504,896,1,1,4096,14336,0,14368,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5261,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.942683078587853,7399.521998876682,0.0037580069980297725,0.9999354052536057,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+21,FFN-serving-decoder-AttnPlusFFn,"Add(a=1x1x4096,b=1x1x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",4096,FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,Add,0,[],Add,,,,,0,,,,,0,1,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.008192,30.517578125,0.0005,0.004123997043918919,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_decode.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_decode.csv new file mode 100644 index 0000000..734c07c --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_decode.csv @@ -0,0 +1,17 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem 
time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +12,Attention-serving-decode-Input_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeInputlayernormXnormLayerNormX,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.004,0.004123997043918919,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,MXU,16384,Memory,4226,4128,4226,0,0,0,0,0,0,0,0,4128,512,0,0,33570816,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,1,32,128)]",33554432,AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 1, 4096], [4096, 32, 128], [1, 1, 32, 
128]]",1,16789504,256,1,1,4096,4096,0,4128,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1509,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.939998106956933,7398.310172518339,0.0037567366285489972,0.9997716449349107,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=2x4096x32x128,eq=BLM;TMND->BTLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,MXU,16384,Memory,8450,8224,8450,0,0,0,0,0,0,0,0,8224,1024,0,0,67133440,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[2,4096,32,128]","[DT_BFLOAT16:(1,2,1,32,128)]",67108864,AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,Einsum,67108864,[],Einsum,"BLM,TMND->BTLND","[[1, 1, 4096], [2, 4096, 32, 128], [1, 2, 1, 32, 128]]",1,33574912,512,1,1,8192,4096,0,8224,67133440,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3010,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.941877396449704,7399.158364922338,0.0037576257969817897,0.9998862655300457,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x32x128,b=1x4096x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,MXU,16384,Memory,4257,2048,4257,0,0,0,0,0,0,0,0,0,2048,0,0,33824768,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,1,4096,32)]",33554432,AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 32, 128], [1, 4096, 32, 128], [1, 1, 4096, 32]]",1,1057024,512,32,1,4096,128,0,2048,33824768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,480,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.8821780596664315,7399.992957371682,0.003729379608233042,0.9999990482934705,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x32x128,b=1x512x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,MXU,16384,Memory,534,256,534,0,0,0,0,0,0,0,0,0,256,0,0,4235264,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[1,512,32,128]","[DT_BFLOAT16:(1,1,512,32)]",4194304,AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 32, 128], [1, 512, 32, 128], [1, 1, 512, 32]]",1,132352,64,32,1,512,128,0,256,4235264,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.854501872659176,7386.511184749532,0.003716284876462561,0.9981771871283152,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"Softmax(x=1x1x4608x32,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,16384,Memory,500,36,500,0,0,0,0,0,0,0,0,0,36,0,0,589824,"DT_BFLOAT16:[1,1,4608,32]","[DT_BFLOAT16:(1,1,4608,32)]",589824,AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,36,589824,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,65,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.179648,1098.6328125,0.072,0.14846389358108109,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x4096x32,b=1x4096x32x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,MXU,16384,Memory,4257,2048,4257,0,0,0,0,0,0,0,0,0,2048,0,0,33824768,"DT_BFLOAT16:[1,1,4096,32],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,1,32,128)]",33554432,AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 4096, 32], [1, 4096, 32, 128], [1, 1, 32, 
128]]",1,528640,512,32,1,128,4096,0,2048,33824768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,480,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.8821780596664315,7399.992957371682,0.003729379608233042,0.9999990482934705,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x512x32,b=1x512x32x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,MXU,16384,Memory,534,256,534,0,0,0,0,0,0,0,0,0,256,0,0,4235264,"DT_BFLOAT16:[1,1,512,32],DT_BFLOAT16:[1,512,32,128]","[DT_BFLOAT16:(1,1,32,128)]",4194304,AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 512, 32], [1, 512, 32, 128], [1, 1, 32, 128]]",1,132352,64,32,1,128,512,0,256,4235264,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.854501872659176,7386.511184749532,0.003716284876462561,0.9981771871283152,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Attention-serving-decode-Softmax(Q*K)*V,"Add(a=1x1x32x128,b=1x1x32x128,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,16384,"DT_BFLOAT16:[1,1,32,128]","[DT_BFLOAT16:(1,1,32,128)]",4096,AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,Add,0,[],Add,,,,,0,,,,,0,1,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.008192,30.517578125,0.0005,0.004123997043918919,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+15,Attention-serving-decode-Attention_output,"XlaEinsum(a=1x1x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,16384,Memory,4226,4128,4226,0,0,0,0,0,0,0,0,4128,512,0,0,33570816,"DT_BFLOAT16:[1,1,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,1,4096)]",33554432,AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 1, 32, 128], [32, 128, 4096], [1, 1, 4096]]",1,16789504,256,1,1,4096,4096,0,4128,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1509,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.939998106956933,7398.310172518339,0.0037567366285489972,0.9997716449349107,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +16,Attention-serving-decode-Attention_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionlayernormYnormLayerNormy,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.004,0.004123997043918919,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,Fwd-FFN-serving-decoder-FFgate,"XlaEinsum(a=1x1x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,16384,Memory,14786,14368,14786,0,0,0,0,0,0,0,0,14368,1792,0,0,117477376,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,1,14336)]",117440512,FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 14336], [1, 1, 
14336]]",1,58753024,896,1,1,14336,4096,0,14368,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5261,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.942683078587853,7399.521998876682,0.0037580069980297725,0.9999354052536057,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Fwd-FFN-serving-decoder-FFup,"XlaEinsum(a=1x1x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,MXU,16384,Memory,14786,14368,14786,0,0,0,0,0,0,0,0,14368,1792,0,0,117477376,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,1,14336)]",117440512,FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 14336], [1, 1, 14336]]",1,58753024,896,1,1,14336,4096,0,14368,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5261,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.942683078587853,7399.521998876682,0.0037580069980297725,0.9999354052536057,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Fwd-FFN-serving-decoder-FFgate_up,"Mul(a=1x1x14336,b=1x1x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,57344,"DT_BFLOAT16:[1,1,14336]","[DT_BFLOAT16:(1,1,14336)]",14336,FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,1,57344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.028672,106.8115234375,0.00175,0.014433989653716216,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+20,Fwd-FFN-serving-decoder-FFoutput,"XlaEinsum(a=1x1x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,16384,Memory,14786,14368,14786,0,0,0,0,0,0,0,0,14368,1792,0,0,117477376,"DT_BFLOAT16:[1,1,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,1,4096)]",117440512,FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 1, 14336], [14336, 4096], [1, 1, 4096]]",1,16789504,896,1,1,4096,14336,0,14368,117477376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5261,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.942683078587853,7399.521998876682,0.0037580069980297725,0.9999354052536057,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,FFN-serving-decoder-AttnPlusFFn,"Add(a=1x1x4096,b=1x1x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",4096,FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,Add,0,[],Add,,,,,0,,,,,0,1,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.008192,30.517578125,0.0005,0.004123997043918919,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_decode.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_decode.json new file mode 100644 index 0000000..be29942 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_decode.json @@ -0,0 +1,98 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 1209827328, + 
"overlapped_compute_time_non_pp_ns": 1052426240, + "compute_only_time_non_pp_ns": 0, + "memory_only_time_non_pp_ns": 157401088, + "ici_bound_time_non_pp_ns": 0, + "total_execution_time_chip_ns": 1209827328, + "overlapped_compute_time_chip_ns": 1052426240, + "compute_only_time_chip_ns": 0, + "memory_only_time_chip_ns": 157401088, + "ici_bound_time_chip_ns": 0, + "bounded_by_pp_chip": false, + "TPOT_ms_request": 2.362944, + "throughput_tokens_per_sec": 423.2008883833049, + "throughput_tokens_per_sec_request": 423.20088838330486, + "mem_footprint_GB": 22.5625, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "6p", + "num_sa": 8, + "num_vu": 8, + "num_vu_ports": 8, + "hbm_bw_GBps": 7400.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 256, + "freq_GHz": 2.0, + "sa_dim": 256, + "hbm_size_GB": 192, + "ici_bw_GBps": 300.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 350.0, + "min_power_W": 1.0, + "avg_power_W": 1.0, + "max_power_W": 331.0, + "HBM_GBps_per_W": 123.5, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 1384.0, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 1.777942858, + "static_power_W_per_vu": 0.1554179582, + "static_power_vmem_W": 37.07309859, + "static_power_ici_W": 3.000278571, + "static_power_hbm_mc_W": 7.10422264, + "static_power_hbm_phy_W": 10.65633396, + "static_power_other_W": 41.27610279, + "dynamic_power_W_per_SA": 31.57742933, + "dynamic_power_W_per_VU": 0.7426048, + "dynamic_power_vmem_W": 28.1028608, + "dynamic_power_ici_W_per_GBps": 0.01262060716, + "dynamic_power_hbm_W_per_GBps": 0.008830769231, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + 
"pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 0, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": true, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_prefill.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_prefill.csv new file mode 100644 index 0000000..7528696 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_prefill.csv @@ -0,0 +1,13 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU 
Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,8446,8192,8446,0,0,0,0,0,0,0,0,0,8192,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,8192,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3001,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.891277291025338,7399.952640303102,0.9699265924698082,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+3,-Q-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",Q3MatMulQ,MXU,32,Compute,65568,65568,12669,0,0,0,0,0,0,0,0,65568,8192,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,Q3MatMulQ,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,67108864,4096,1,4096,4096,4096,0,65568,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.1284997559783,1429.8133235724745,0.9917638023463893,0.19321801669898303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-K-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",K3MatMulK,MXU,32,Compute,65568,65568,12669,0,0,0,0,0,0,0,0,65568,8192,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,K3MatMulK,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,67108864,4096,1,4096,4096,4096,0,65568,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.1284997559783,1429.8133235724745,0.9917638023463893,0.19321801669898303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,-V-3,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",V3MatMulV,MXU,32,Compute,65568,65568,12669,0,0,0,0,0,0,0,0,65568,8192,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,V3MatMulV,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 
128]]",1,67108864,4096,1,4096,4096,4096,0,65568,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.1284997559783,1429.8133235724745,0.9917638023463893,0.19321801669898303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-FlashAttention-4,"FlashAttention(q=1x4096x32x128,k=1x4096x32x128,v=1x4096x32x128,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",FlashAttention4FlashAttention,MXU,32,Compute,262208,262208,16892,0,0,0,0,0,0,0,0,262208,163840,0,0,134217728,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,32,32)]",70866960384,FlashAttention4FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 256]",,6356992,16384,32,4096,4096,128,131072,262208,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,67459,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,270.2700161093483,476.72077129607027,0.12787575707693094,0.0644217258508203,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,-Attention_output-5,"XlaEinsum(a=1x4096x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",Attentionoutput5MatMulattnOutputattnAvgWo,MXU,32,Compute,65568,65568,12669,0,0,0,0,0,0,0,0,65568,8192,0,0,100663296,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",137438953472,Attentionoutput5MatMulattnOutputattnAvgWo,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 32, 128], [32, 128, 4096], [1, 4096, 4096]]",1,67108864,4096,1,4096,4096,4096,0,65568,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.1284997559783,1429.8133235724745,0.9917638023463893,0.19321801669898303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+6,-Attention_layernorm-6,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",Attentionlayernorm6YnormLayerNormy,VPU,32,Memory,8446,8192,8446,0,0,0,0,0,0,0,0,0,8192,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,Attentionlayernorm6YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,8192,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3001,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.891277291025338,7399.952640303102,0.9699265924698082,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,229408,229408,33784,0,0,0,0,0,0,0,0,229408,28672,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 4096], [4096, 14336], [1, 4096, 14336]]",1,192937984,14336,1,4096,14336,4096,0,229408,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.859469382062,1089.7614730087878,0.9921096538606685,0.14726506392010646,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,229408,229408,33784,0,0,0,0,0,0,0,0,229408,28672,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 4096], [4096, 14336], [1, 4096, 
14336]]",1,192937984,14336,1,4096,14336,4096,0,229408,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.859469382062,1089.7614730087878,0.9921096538606685,0.14726506392010646,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x14336,b=1x4096x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,29561,3584,29561,0,0,0,0,0,0,0,0,0,3584,0,0,234881024,"DT_BFLOAT16:[1,4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",58720256,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,3584,234881024,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9864096613781672,7399.952640303102,0.12124082405872602,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,229408,229408,33784,0,0,0,0,0,0,0,0,229408,28672,0,0,268435456,"DT_BFLOAT16:[1,4096,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,4096,4096)]",481036337152,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 4096, 14336], [14336, 4096], [1, 4096, 4096]]",1,67108864,14336,1,4096,4096,14336,0,229408,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.859469382062,1089.7614730087878,0.9921096538606685,0.14726506392010646,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+11,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x4096,b=1x4096x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,8446,1024,8446,0,0,0,0,0,0,0,0,0,1024,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",16777216,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,1024,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1209,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9864096613781672,7399.952640303102,0.12124082405872602,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_prefill.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_prefill.json new file mode 100644 index 0000000..441b00d --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_prefill.json @@ -0,0 +1,97 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 40563296, + "overlapped_compute_time_non_pp_ns": 6077184, + "compute_only_time_non_pp_ns": 33401088, + "memory_only_time_non_pp_ns": 1085024, + "ici_bound_time_non_pp_ns": 0, + "total_execution_time_chip_ns": 40563296, + "overlapped_compute_time_chip_ns": 6077184, + "compute_only_time_chip_ns": 33401088, + "memory_only_time_chip_ns": 1085024, + "ici_bound_time_chip_ns": 0, + "bounded_by_pp_chip": false, + "throughput_tokens_per_sec": 100977.98758759644, + "TTFT_sec": 0.040563296, + "mem_footprint_GB": 21.5, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "6p", + "num_sa": 8, + "num_vu": 8, + "num_vu_ports": 8, + "hbm_bw_GBps": 7400.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 256, + "freq_GHz": 2.0, + "sa_dim": 256, + 
"hbm_size_GB": 192, + "ici_bw_GBps": 300.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 350.0, + "min_power_W": 1.0, + "avg_power_W": 1.0, + "max_power_W": 331.0, + "HBM_GBps_per_W": 123.5, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 1384.0, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 1.777942858, + "static_power_W_per_vu": 0.1554179582, + "static_power_vmem_W": 37.07309859, + "static_power_ici_W": 3.000278571, + "static_power_hbm_mc_W": 7.10422264, + "static_power_hbm_phy_W": 10.65633396, + "static_power_other_W": 41.27610279, + "dynamic_power_W_per_SA": 31.57742933, + "dynamic_power_W_per_VU": 0.7426048, + "dynamic_power_vmem_W": 28.1028608, + "dynamic_power_ici_W_per_GBps": 0.01262060716, + "dynamic_power_hbm_W_per_GBps": 0.008830769231, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 0, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": true, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 
+ } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v5p.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v5p.csv new file mode 100644 index 0000000..461d792 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v5p.csv @@ -0,0 +1,46 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS 
HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +3,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,22604,12851,22604,0,0,0,0,0,0,0,0,0,12851,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,12851,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4708,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.937786586444877,2764.997345602548,0.5684921287573602,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,Fwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1,MXU,32,Compute,308424,308424,33906,0,0,0,0,0,0,0,0,308424,51401,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,FwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,50331648,32768,1,4096,4096,4096,0,308424,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.61692174409256,303.96467200996034,0.9770381121255398,0.10993297360215563,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+4,Fwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1,MXU,32,Compute,308424,308424,33906,0,0,0,0,0,0,0,0,308424,51401,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,FwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,50331648,32768,1,4096,4096,4096,0,308424,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.61692174409256,303.96467200996034,0.9770381121255398,0.10993297360215563,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,Fwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1,MXU,32,Compute,308424,308424,33906,0,0,0,0,0,0,0,0,308424,51401,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,FwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,50331648,32768,1,4096,4096,4096,0,308424,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.61692174409256,303.96467200996034,0.9770381121255398,0.10993297360215563,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+5,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x32x128,b=1x4096x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_1,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2,MXU,32,Memory,384268,308424,384268,0,0,0,0,0,0,0,0,308424,51401,0,0,1140850688,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,4096,32)]",137438953472,FwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 4096, 1, 1], [1, 4096, 1, 1], [1, 4096, 4096, 1]]",32,35651584,32768,32,4096,4096,128,0,308424,1140850688,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,76835,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,357.6643214423267,2764.997345602548,0.7841974941816843,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"Softmax(x=1x4096x4096x32,memory_placements=1_1,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,32,Memory,723328,205604,723328,0,0,0,0,0,0,0,0,0,205604,0,0,2147483648,"DT_BFLOAT16:[1,4096,4096,32]","[DT_BFLOAT16:(1,4096,4096,32)]",2147483648,FwdAttentionencoderAttentionSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,205604,2147483648,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,99271,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9688932932224383,2764.997345602548,0.2842460643786801,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+5,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x4096x32,b=1x4096x32x128,eq=BLSN;BSND->BLND,memory_placements=1_0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2,MXU,32,Memory,384268,308424,384268,0,0,0,0,0,0,0,0,308424,51401,0,0,1140850688,"DT_BFLOAT16:[1,4096,4096,32],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,FwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 4096, 4096, 1], [1, 4096, 1, 128], [1, 4096, 1, 128]]",32,9699328,32768,32,4096,128,4096,0,308424,1140850688,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,76835,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,357.6643214423267,2764.997345602548,0.7841974941816843,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"Add(a=1x4096x32x128,b=1x4096x32x128,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVattnavgAddBdLlMmhBdLlMmh,VPU,32,Memory,22604,1607,22604,0,0,0,0,0,0,0,0,0,1607,0,0,67108864,"DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",16777216,FwdAttentionencoderAttentionSoftmaxQKVattnavgAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,1607,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1897,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7422233233056096,2764.997345602548,0.07106151609467003,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+6,Fwd-Attention_encoder-Attention_output,"XlaEinsum(a=1x4096x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,32,Compute,308424,308424,33906,0,0,0,0,0,0,0,0,308424,51401,0,0,100663296,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",137438953472,FwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 32, 128], [32, 128, 4096], [1, 4096, 4096]]",1,50331648,32768,1,4096,4096,4096,0,308424,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.61692174409256,303.96467200996034,0.9770381121255398,0.10993297360215563,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Fwd-Attention_encoder-Attention_layernorm,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionlayernormYnormLayerNormy,VPU,32,Memory,22604,12851,22604,0,0,0,0,0,0,0,0,0,12851,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,FwdAttentionencoderAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,12851,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4708,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.937786586444877,2764.997345602548,0.5684921287573602,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,1079436,1079436,90416,0,0,0,0,0,0,0,0,1079436,179902,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 14336], [1, 
4096, 14336]]",1,155189248,114688,1,4096,14336,4096,0,1079436,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.63673728873226,231.60242941684362,0.9770815587304168,0.08376218062091993,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,1079436,1079436,90416,0,0,0,0,0,0,0,0,1079436,179902,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 14336], [1, 4096, 14336]]",1,155189248,114688,1,4096,14336,4096,0,1079436,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.63673728873226,231.60242941684362,0.9770815587304168,0.08376218062091993,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x14336,b=1x4096x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,79114,5623,79114,0,0,0,0,0,0,0,0,0,5623,0,0,234881024,"DT_BFLOAT16:[1,4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",58720256,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,5623,234881024,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6641,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7422233233056096,2764.997345602548,0.07106151609467003,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+11,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,1079436,1079436,90416,0,0,0,0,0,0,0,0,1079436,179902,0,0,268435456,"DT_BFLOAT16:[1,4096,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,4096,4096)]",481036337152,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 4096, 3584], [3584, 4096], [1, 4096, 4096]]",1,50331648,114688,1,4096,4096,14336,0,1079436,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.63673728873226,231.60242941684362,0.9770815587304168,0.08376218062091993,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +12,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x4096,b=1x4096x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,22604,1607,22604,0,0,0,0,0,0,0,0,0,1607,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",16777216,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,1607,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1897,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7422233233056096,2764.997345602548,0.07106151609467003,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Bwd-FFN-encoder-FFdown,"XlaEinsum(a=1x4096x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFdowneinsumXGrad,MXU,32,Compute,1079436,1079436,90416,0,0,0,0,0,0,0,0,1079436,179902,0,0,268435456,"DT_BFLOAT16:[1,4096,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,4096,4096)]",481036337152,BwdFFNencoderFFdowneinsumXGrad,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 4096, 3584], [3584, 4096], [1, 4096, 
4096]]",1,50331648,114688,1,4096,4096,14336,0,1079436,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.63673728873226,231.60242941684362,0.9770815587304168,0.08376218062091993,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Bwd-FFN-encoder-FFdown,"XlaEinsum(a=1x4096x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFdowneinsumYGrad,MXU,32,Compute,1079436,1079436,90416,0,0,0,0,0,0,0,0,1079436,179902,0,0,268435456,"DT_BFLOAT16:[1,4096,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,4096,4096)]",481036337152,BwdFFNencoderFFdowneinsumYGrad,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 4096, 3584], [3584, 4096], [1, 4096, 4096]]",1,50331648,114688,1,4096,4096,14336,0,1079436,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.63673728873226,231.60242941684362,0.9770815587304168,0.08376218062091993,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Bwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x14336,b=1x4096x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,79114,5623,79114,0,0,0,0,0,0,0,0,0,5623,0,0,234881024,"DT_BFLOAT16:[1,4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",58720256,BwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,5623,234881024,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6641,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7422233233056096,2764.997345602548,0.07106151609467003,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+15,Bwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFgateeinsumXGrad,MXU,32,Compute,1079436,1079436,90416,0,0,0,0,0,0,0,0,1079436,179902,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,BwdFFNencoderFFgateeinsumXGrad,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 14336], [1, 4096, 14336]]",1,155189248,114688,1,4096,14336,4096,0,1079436,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.63673728873226,231.60242941684362,0.9770815587304168,0.08376218062091993,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +16,Bwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFgateeinsumYGrad,MXU,32,Compute,1079436,1079436,90416,0,0,0,0,0,0,0,0,1079436,179902,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,BwdFFNencoderFFgateeinsumYGrad,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 14336], [1, 4096, 14336]]",1,155189248,114688,1,4096,14336,4096,0,1079436,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.63673728873226,231.60242941684362,0.9770815587304168,0.08376218062091993,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Bwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFupeinsumXGrad,MXU,32,Compute,1079436,1079436,90416,0,0,0,0,0,0,0,0,1079436,179902,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,BwdFFNencoderFFupeinsumXGrad,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 14336], [1, 4096, 
14336]]",1,155189248,114688,1,4096,14336,4096,0,1079436,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.63673728873226,231.60242941684362,0.9770815587304168,0.08376218062091993,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +16,Bwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFupeinsumYGrad,MXU,32,Compute,1079436,1079436,90416,0,0,0,0,0,0,0,0,1079436,179902,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,BwdFFNencoderFFupeinsumYGrad,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 14336], [1, 4096, 14336]]",1,155189248,114688,1,4096,14336,4096,0,1079436,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,185889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.63673728873226,231.60242941684362,0.9770815587304168,0.08376218062091993,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,Softmax,"Softmax(x=1x4096x4096,memory_placements=1_1,type=DT_BFLOAT16)",SoftmaxSoftmaxBackprop,VPU,1,Memory,22604,6426,22604,0,0,0,0,0,0,0,0,0,6426,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",67108864,SoftmaxSoftmaxBackprop,Softmax,0,[],Softmax,,,,,0,,,,,0,6426,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3102,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9688932932224383,2764.997345602548,0.2842460643786801,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+17,softmax-bwd-dl/dy-17,"XlaEinsum(a=1x4096x4096,b=1x4096x4096,eq=BLV;BLV->BV,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwddldy17MatMuldldy,MXU,1,Compute,25700,25700,22607,0,0,0,0,0,0,0,0,0,25700,0,0,67117056,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096)]",33554432,softmaxbwddldy17MatMuldldy,Einsum,33554432,[],Einsum,"BLV,BLV->BV","[[1, 4096, 4096], [1, 4096, 4096], [1, 4096]]",1,4098,131072,4096,1,1,4096,0,25700,67117056,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1496,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.3056199221789884,2432.20347838643,0.0028626391002535214,0.8796395943531393,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,softmax-bwd-e1-17,"XlaEinsum(a=1x4096x4096,b=1x4096x4096,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde117MatMule1,MXU,1,Compute,25700,25700,22607,0,0,0,0,0,0,0,0,0,25700,0,0,67117056,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096)]",33554432,softmaxbwde117MatMule1,Einsum,33554432,[],Einsum,"BLN,BLN->BL","[[1, 4096, 4096], [1, 4096, 4096], [1, 4096]]",1,4098,131072,4096,1,1,4096,0,25700,67117056,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1496,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.3056199221789884,2432.20347838643,0.0028626391002535214,0.8796395943531393,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,softmax-bwd-e2-17,"XlaEinsum(a=1x4096x4096,b=1x4096x4096,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde217MatMule2,MXU,1,Compute,25700,25700,22607,0,0,0,0,0,0,0,0,0,25700,0,0,67117056,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096)]",33554432,softmaxbwde217MatMule2,Einsum,33554432,[],Einsum,"BLN,BLN->BL","[[1, 4096, 4096], [1, 4096, 4096], [1, 
4096]]",1,4098,131072,4096,1,1,4096,0,25700,67117056,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1496,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.3056199221789884,2432.20347838643,0.0028626391002535214,0.8796395943531393,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Bwd-Attention_encoder-Attention_output,"XlaEinsum(a=1x4096x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1XGrad,MXU,32,Compute,308424,308424,33906,0,0,0,0,0,0,0,0,308424,51401,0,0,100663296,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",137438953472,BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1XGrad,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 32, 128], [32, 128, 4096], [1, 4096, 4096]]",1,50331648,32768,1,4096,4096,4096,0,308424,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.61692174409256,303.96467200996034,0.9770381121255398,0.10993297360215563,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Bwd-Attention_encoder-Attention_output,"XlaEinsum(a=1x4096x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1YGrad,MXU,32,Compute,308424,308424,33906,0,0,0,0,0,0,0,0,308424,51401,0,0,100663296,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",137438953472,BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1YGrad,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 32, 128], [32, 128, 4096], [1, 4096, 
4096]]",1,50331648,32768,1,4096,4096,4096,0,308424,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.61692174409256,303.96467200996034,0.9770381121255398,0.10993297360215563,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x4096x32,b=1x4096x32x128,eq=BLSN;BSND->BLND,memory_placements=1_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2XGrad,MXU,32,Memory,384268,308424,384268,0,0,0,0,0,0,0,0,308424,51401,0,0,1140850688,"DT_BFLOAT16:[1,4096,4096,32],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2XGrad,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 4096, 4096, 1], [1, 4096, 1, 128], [1, 4096, 1, 128]]",32,9699328,32768,32,4096,128,4096,0,308424,1140850688,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,76835,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,357.6643214423267,2764.997345602548,0.7841974941816843,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x4096x32,b=1x4096x32x128,eq=BLSN;BSND->BLND,memory_placements=1_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2YGrad,MXU,32,Memory,384268,308424,384268,0,0,0,0,0,0,0,0,308424,51401,0,0,1140850688,"DT_BFLOAT16:[1,4096,4096,32],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2YGrad,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 4096, 4096, 1], [1, 4096, 1, 128], [1, 4096, 1, 
128]]",32,9699328,32768,32,4096,128,4096,0,308424,1140850688,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,76835,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,357.6643214423267,2764.997345602548,0.7841974941816843,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"Pointwise Mul.(x=1x4096x4096x32,memory_placements=1_1,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVSoftmaxBackprop,VPU,32,Memory,723328,51402,723328,0,0,0,0,0,0,0,0,0,51402,0,0,2147483648,"DT_BFLOAT16:[1,4096,4096,32]","[DT_BFLOAT16:(1,4096,4096,32)]",536870912,BwdAttentionencoderAttentionSoftmaxQKVSoftmaxBackprop,Pointwise Mul.,0,[],Pointwise Mul.,,,,,0,,,,,0,51402,2147483648,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,60721,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7422233233056096,2764.997345602548,0.07106151609467003,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x32x128,b=1x4096x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_1,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2XGrad,MXU,32,Memory,384268,308424,384268,0,0,0,0,0,0,0,0,308424,51401,0,0,1140850688,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,4096,32)]",137438953472,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2XGrad,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 4096, 1, 1], [1, 4096, 1, 1], [1, 4096, 4096, 1]]",32,35651584,32768,32,4096,4096,128,0,308424,1140850688,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,76835,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,357.6643214423267,2764.997345602548,0.7841974941816843,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+21,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x32x128,b=1x4096x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_1,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2YGrad,MXU,32,Memory,384268,308424,384268,0,0,0,0,0,0,0,0,308424,51401,0,0,1140850688,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,4096,32)]",137438953472,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2YGrad,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 4096, 1, 1], [1, 4096, 1, 1], [1, 4096, 4096, 1]]",32,35651584,32768,32,4096,4096,128,0,308424,1140850688,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,76835,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,357.6643214423267,2764.997345602548,0.7841974941816843,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1XGrad,MXU,32,Compute,308424,308424,33906,0,0,0,0,0,0,0,0,308424,51401,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1XGrad,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,50331648,32768,1,4096,4096,4096,0,308424,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.61692174409256,303.96467200996034,0.9770381121255398,0.10993297360215563,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+21,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1YGrad,MXU,32,Compute,308424,308424,33906,0,0,0,0,0,0,0,0,308424,51401,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1YGrad,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,50331648,32768,1,4096,4096,4096,0,308424,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.61692174409256,303.96467200996034,0.9770381121255398,0.10993297360215563,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1XGrad,MXU,32,Compute,308424,308424,33906,0,0,0,0,0,0,0,0,308424,51401,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1XGrad,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,50331648,32768,1,4096,4096,4096,0,308424,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.61692174409256,303.96467200996034,0.9770381121255398,0.10993297360215563,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+21,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1YGrad,MXU,32,Compute,308424,308424,33906,0,0,0,0,0,0,0,0,308424,51401,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1YGrad,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,50331648,32768,1,4096,4096,4096,0,308424,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.61692174409256,303.96467200996034,0.9770381121255398,0.10993297360215563,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1XGrad,MXU,32,Compute,308424,308424,33906,0,0,0,0,0,0,0,0,308424,51401,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1XGrad,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,50331648,32768,1,4096,4096,4096,0,308424,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.61692174409256,303.96467200996034,0.9770381121255398,0.10993297360215563,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+21,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1YGrad,MXU,32,Compute,308424,308424,33906,0,0,0,0,0,0,0,0,308424,51401,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1YGrad,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,50331648,32768,1,4096,4096,4096,0,308424,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53647,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.61692174409256,303.96467200996034,0.9770381121255398,0.10993297360215563,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,Softmax,"Softmax(x=1x4096x4096,memory_placements=1_1,type=DT_BFLOAT16)",SoftmaxSoftmaxBackprop,VPU,1,Memory,22604,6426,22604,0,0,0,0,0,0,0,0,0,6426,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",67108864,SoftmaxSoftmaxBackprop,Softmax,0,[],Softmax,,,,,0,,,,,0,6426,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3102,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9688932932224383,2764.997345602548,0.2842460643786801,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,softmax-bwd-dl/dy-22,"XlaEinsum(a=1x4096x4096,b=1x4096x4096,eq=BLV;BLV->BV,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwddldy22MatMuldldy,MXU,1,Compute,25700,25700,22607,0,0,0,0,0,0,0,0,0,25700,0,0,67117056,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096)]",33554432,softmaxbwddldy22MatMuldldy,Einsum,33554432,[],Einsum,"BLV,BLV->BV","[[1, 4096, 4096], [1, 4096, 4096], [1, 
4096]]",1,4098,131072,4096,1,1,4096,0,25700,67117056,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1496,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.3056199221789884,2432.20347838643,0.0028626391002535214,0.8796395943531393,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,softmax-bwd-e1-22,"XlaEinsum(a=1x4096x4096,b=1x4096x4096,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde122MatMule1,MXU,1,Compute,25700,25700,22607,0,0,0,0,0,0,0,0,0,25700,0,0,67117056,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096)]",33554432,softmaxbwde122MatMule1,Einsum,33554432,[],Einsum,"BLN,BLN->BL","[[1, 4096, 4096], [1, 4096, 4096], [1, 4096]]",1,4098,131072,4096,1,1,4096,0,25700,67117056,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1496,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.3056199221789884,2432.20347838643,0.0028626391002535214,0.8796395943531393,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,softmax-bwd-e2-22,"XlaEinsum(a=1x4096x4096,b=1x4096x4096,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde222MatMule2,MXU,1,Compute,25700,25700,22607,0,0,0,0,0,0,0,0,0,25700,0,0,67117056,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096)]",33554432,softmaxbwde222MatMule2,Einsum,33554432,[],Einsum,"BLN,BLN->BL","[[1, 4096, 4096], [1, 4096, 4096], [1, 4096]]",1,4098,131072,4096,1,1,4096,0,25700,67117056,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1496,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.3056199221789884,2432.20347838643,0.0028626391002535214,0.8796395943531393,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,WeightUpdateOptimizerStates: Load optimizer states from 
HBM,"Abs(x=62679678976,type=DT_BFLOAT16)",WeightUpdateOptimizerStatesLoadoptimizerstatesfromHBMinput,VPU,1,Memory,84448463,6001042,84448463,0,0,0,0,0,0,0,0,0,6001042,0,0,250718715904,DT_BFLOAT16:[62679678976],[DT_BFLOAT16:(62679678976)],0,WeightUpdateOptimizerStatesLoadoptimizerstatesfromHBMinput,Input,0,[],Abs,,,,,0,,,,,0,6001042,250718715904,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7089165,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,2764.9999976908994,0.0,0.9999999991648822,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,WeightUpdateWriteHBM,"Abs(x=15669919744,type=DT_BFLOAT16)",WeightUpdateWriteHBMoutput,VPU,1,Memory,21112116,1500261,21112116,0,0,0,0,0,0,0,0,0,1500261,0,0,62679678976,DT_BFLOAT16:[15669919744],[DT_BFLOAT16:(15669919744)],0,WeightUpdateWriteHBMoutput,Output,0,[],Abs,,,,,0,,,,,0,1500261,62679678976,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1772291,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,2764.999964949037,0.0,0.9999999873233406,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v5p.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v5p.json new file mode 100644 index 0000000..8d54276 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v5p.json @@ -0,0 +1,101 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 663101427, + "overlapped_compute_time_non_pp_ns": 115436293, + "compute_only_time_non_pp_ns": 390271230, + "memory_only_time_non_pp_ns": 157393904, + "ici_bound_time_non_pp_ns": 0, + "total_execution_time_chip_ns": 663101427, + "overlapped_compute_time_chip_ns": 115436293, + "compute_only_time_chip_ns": 390271230, + "memory_only_time_chip_ns": 
157393904, + "ici_bound_time_chip_ns": 0, + "bounded_by_pp_chip": false, + "total_execution_time_pod_ns": 663101427, + "compute_only_time_pod_ns": 390271230, + "memory_only_time_pod_ns": 157393904, + "ici_bound_time_pod_ns": 0, + "bounded_by_pp_dcn": false, + "total_execution_time_ns": 663101427, + "mem_footprint_GB": 91.5625, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "5p", + "num_sa": 8, + "num_vu": 6, + "num_vu_ports": 6, + "hbm_bw_GBps": 2765.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 128, + "freq_GHz": 1.7, + "sa_dim": 128, + "hbm_size_GB": 95, + "ici_bw_GBps": 200.0, + "dcn_bw_GBps": 25.0, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 350.0, + "min_power_W": 1.0, + "avg_power_W": 1.0, + "max_power_W": 331.0, + "HBM_GBps_per_W": 123.5, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 585.0, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 1.35868996, + "static_power_W_per_vu": 0.475076728, + "static_power_vmem_W": 24.21353615, + "static_power_ici_W": 6.114104803, + "static_power_hbm_mc_W": 10.264041296, + "static_power_hbm_phy_W": 15.396061944, + "static_power_other_W": 44.82811018, + "dynamic_power_W_per_SA": 28.19413333, + "dynamic_power_W_per_VU": 2.65216, + "dynamic_power_vmem_W": 50.18368, + "dynamic_power_ici_W_per_GBps": 0.01767315271, + "dynamic_power_hbm_W_per_GBps": 0.01261538462, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 0, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + 
"microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v5p.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": false, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v6p.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v6p.csv new file mode 100644 index 0000000..bfba61b --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v6p.csv @@ -0,0 +1,46 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem 
time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +3,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,8446,8192,8446,0,0,0,0,0,0,0,0,0,8192,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,8192,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3001,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.891277291025338,7399.952640303102,0.9699265924698082,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,Fwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1,MXU,32,Compute,65568,65568,12669,0,0,0,0,0,0,0,0,65568,8192,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,FwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 
128], [1, 4096, 32, 128]]",1,67108864,4096,1,4096,4096,4096,0,65568,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.1284997559783,1429.8133235724745,0.9917638023463893,0.19321801669898303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,Fwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1,MXU,32,Compute,65568,65568,12669,0,0,0,0,0,0,0,0,65568,8192,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,FwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,67108864,4096,1,4096,4096,4096,0,65568,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.1284997559783,1429.8133235724745,0.9917638023463893,0.19321801669898303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,Fwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1,MXU,32,Compute,65568,65568,12669,0,0,0,0,0,0,0,0,65568,8192,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,FwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,67108864,4096,1,4096,4096,4096,0,65568,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.1284997559783,1429.8133235724745,0.9917638023463893,0.19321801669898303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+5,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x32x128,b=1x4096x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_1,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2,MXU,32,Memory,143582,131104,143582,0,0,0,0,0,0,0,0,131104,16384,0,0,1140850688,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,4096,32)]",137438953472,FwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 4096, 1, 1], [1, 4096, 1, 1], [1, 4096, 4096, 1]]",32,35651584,8192,32,4096,4096,128,0,131104,1140850688,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48988,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,957.2157615299967,7399.952640303102,0.4528977796119852,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"Softmax(x=1x4096x4096x32,memory_placements=1_1,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,32,Memory,270271,131072,270271,0,0,0,0,0,0,0,0,0,131072,0,0,2147483648,"DT_BFLOAT16:[1,4096,4096,32]","[DT_BFLOAT16:(1,4096,4096,32)]",2147483648,FwdAttentionencoderAttentionSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,131072,2147483648,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,63285,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.94566804429628,7399.980020053946,0.48496509059425535,0.9999973000072899,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+5,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x4096x32,b=1x4096x32x128,eq=BLSN;BSND->BLND,memory_placements=1_0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2,MXU,32,Memory,143582,131104,143582,0,0,0,0,0,0,0,0,131104,16384,0,0,1140850688,"DT_BFLOAT16:[1,4096,4096,32],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,FwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 4096, 4096, 1], [1, 4096, 1, 128], [1, 4096, 1, 128]]",32,18350080,8192,32,4096,128,4096,0,131104,1140850688,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48988,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,957.2157615299967,7399.952640303102,0.4528977796119852,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"Add(a=1x4096x32x128,b=1x4096x32x128,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVattnavgAddBdLlMmhBdLlMmh,VPU,32,Memory,8446,1024,8446,0,0,0,0,0,0,0,0,0,1024,0,0,67108864,"DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",16777216,FwdAttentionencoderAttentionSoftmaxQKVattnavgAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,1024,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1209,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9864096613781672,7399.952640303102,0.12124082405872602,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+6,Fwd-Attention_encoder-Attention_output,"XlaEinsum(a=1x4096x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,32,Compute,65568,65568,12669,0,0,0,0,0,0,0,0,65568,8192,0,0,100663296,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",137438953472,FwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 32, 128], [32, 128, 4096], [1, 4096, 4096]]",1,67108864,4096,1,4096,4096,4096,0,65568,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.1284997559783,1429.8133235724745,0.9917638023463893,0.19321801669898303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Fwd-Attention_encoder-Attention_layernorm,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionlayernormYnormLayerNormy,VPU,32,Memory,8446,8192,8446,0,0,0,0,0,0,0,0,0,8192,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,FwdAttentionencoderAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,8192,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3001,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.891277291025338,7399.952640303102,0.9699265924698082,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,229408,229408,33784,0,0,0,0,0,0,0,0,229408,28672,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 4096], [4096, 14336], [1, 4096, 
14336]]",1,192937984,14336,1,4096,14336,4096,0,229408,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.859469382062,1089.7614730087878,0.9921096538606685,0.14726506392010646,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,229408,229408,33784,0,0,0,0,0,0,0,0,229408,28672,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 4096], [4096, 14336], [1, 4096, 14336]]",1,192937984,14336,1,4096,14336,4096,0,229408,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.859469382062,1089.7614730087878,0.9921096538606685,0.14726506392010646,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x14336,b=1x4096x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,29561,3584,29561,0,0,0,0,0,0,0,0,0,3584,0,0,234881024,"DT_BFLOAT16:[1,4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",58720256,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,3584,234881024,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9864096613781672,7399.952640303102,0.12124082405872602,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+11,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,229408,229408,33784,0,0,0,0,0,0,0,0,229408,28672,0,0,268435456,"DT_BFLOAT16:[1,4096,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,4096,4096)]",481036337152,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 4096, 14336], [14336, 4096], [1, 4096, 4096]]",1,67108864,14336,1,4096,4096,14336,0,229408,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.859469382062,1089.7614730087878,0.9921096538606685,0.14726506392010646,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +12,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x4096,b=1x4096x4096,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,8446,1024,8446,0,0,0,0,0,0,0,0,0,1024,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",16777216,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,1024,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1209,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9864096613781672,7399.952640303102,0.12124082405872602,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Bwd-FFN-encoder-FFdown,"XlaEinsum(a=1x4096x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFdowneinsumXGrad,MXU,32,Compute,229408,229408,33784,0,0,0,0,0,0,0,0,229408,28672,0,0,268435456,"DT_BFLOAT16:[1,4096,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,4096,4096)]",481036337152,BwdFFNencoderFFdowneinsumXGrad,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 4096, 14336], [14336, 4096], [1, 4096, 
4096]]",1,67108864,14336,1,4096,4096,14336,0,229408,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.859469382062,1089.7614730087878,0.9921096538606685,0.14726506392010646,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Bwd-FFN-encoder-FFdown,"XlaEinsum(a=1x4096x14336,b=14336x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFdowneinsumYGrad,MXU,32,Compute,229408,229408,33784,0,0,0,0,0,0,0,0,229408,28672,0,0,268435456,"DT_BFLOAT16:[1,4096,14336],DT_BFLOAT16:[14336,4096]","[DT_BFLOAT16:(1,4096,4096)]",481036337152,BwdFFNencoderFFdowneinsumYGrad,Einsum,117440512,[],Einsum,"BLH,HM->BLM","[[1, 4096, 14336], [14336, 4096], [1, 4096, 4096]]",1,67108864,14336,1,4096,4096,14336,0,229408,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.859469382062,1089.7614730087878,0.9921096538606685,0.14726506392010646,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Bwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x14336,b=1x4096x14336,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,29561,3584,29561,0,0,0,0,0,0,0,0,0,3584,0,0,234881024,"DT_BFLOAT16:[1,4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",58720256,BwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,3584,234881024,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9864096613781672,7399.952640303102,0.12124082405872602,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+15,Bwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFgateeinsumXGrad,MXU,32,Compute,229408,229408,33784,0,0,0,0,0,0,0,0,229408,28672,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,BwdFFNencoderFFgateeinsumXGrad,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 4096], [4096, 14336], [1, 4096, 14336]]",1,192937984,14336,1,4096,14336,4096,0,229408,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.859469382062,1089.7614730087878,0.9921096538606685,0.14726506392010646,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +16,Bwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFgateeinsumYGrad,MXU,32,Compute,229408,229408,33784,0,0,0,0,0,0,0,0,229408,28672,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,BwdFFNencoderFFgateeinsumYGrad,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 4096], [4096, 14336], [1, 4096, 14336]]",1,192937984,14336,1,4096,14336,4096,0,229408,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.859469382062,1089.7614730087878,0.9921096538606685,0.14726506392010646,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Bwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFupeinsumXGrad,MXU,32,Compute,229408,229408,33784,0,0,0,0,0,0,0,0,229408,28672,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,BwdFFNencoderFFupeinsumXGrad,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 4096], [4096, 14336], [1, 4096, 
14336]]",1,192937984,14336,1,4096,14336,4096,0,229408,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.859469382062,1089.7614730087878,0.9921096538606685,0.14726506392010646,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +16,Bwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x14336,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFupeinsumYGrad,MXU,32,Compute,229408,229408,33784,0,0,0,0,0,0,0,0,229408,28672,0,0,268435456,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,14336]","[DT_BFLOAT16:(1,4096,14336)]",481036337152,BwdFFNencoderFFupeinsumYGrad,Einsum,117440512,[],Einsum,"BLM,MH->BLH","[[1, 4096, 4096], [4096, 14336], [1, 4096, 14336]]",1,192937984,14336,1,4096,14336,4096,0,229408,268435456,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,61166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.859469382062,1089.7614730087878,0.9921096538606685,0.14726506392010646,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,Softmax,"Softmax(x=1x4096x4096,memory_placements=1_1,type=DT_BFLOAT16)",SoftmaxSoftmaxBackprop,VPU,1,Memory,8446,4096,8446,0,0,0,0,0,0,0,0,0,4096,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",67108864,SoftmaxSoftmaxBackprop,Softmax,0,[],Softmax,,,,,0,,,,,0,4096,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1977,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.945638645512669,7399.952640303102,0.4849632962349041,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+17,softmax-bwd-dl/dy-17,"XlaEinsum(a=1x4096x4096,b=1x4096x4096,eq=BLV;BLV->BV,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwddldy17MatMuldldy,MXU,1,Compute,16384,16384,8447,0,0,0,0,0,0,0,0,0,16384,0,0,67117056,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096)]",33554432,softmaxbwddldy17MatMuldldy,Einsum,33554432,[],Einsum,"BLV,BLV->BV","[[1, 4096, 4096], [1, 4096, 4096], [1, 4096]]",1,8194,65536,4096,1,1,4096,0,16384,67117056,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,953,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.048,3815.1629269123077,0.0009689922480620155,0.5155625576908524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,softmax-bwd-e1-17,"XlaEinsum(a=1x4096x4096,b=1x4096x4096,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde117MatMule1,MXU,1,Compute,16384,16384,8447,0,0,0,0,0,0,0,0,0,16384,0,0,67117056,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096)]",33554432,softmaxbwde117MatMule1,Einsum,33554432,[],Einsum,"BLN,BLN->BL","[[1, 4096, 4096], [1, 4096, 4096], [1, 4096]]",1,8194,65536,4096,1,1,4096,0,16384,67117056,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,953,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.048,3815.1629269123077,0.0009689922480620155,0.5155625576908524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,softmax-bwd-e2-17,"XlaEinsum(a=1x4096x4096,b=1x4096x4096,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde217MatMule2,MXU,1,Compute,16384,16384,8447,0,0,0,0,0,0,0,0,0,16384,0,0,67117056,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096)]",33554432,softmaxbwde217MatMule2,Einsum,33554432,[],Einsum,"BLN,BLN->BL","[[1, 4096, 4096], [1, 4096, 4096], [1, 
4096]]",1,8194,65536,4096,1,1,4096,0,16384,67117056,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,953,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.048,3815.1629269123077,0.0009689922480620155,0.5155625576908524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Bwd-Attention_encoder-Attention_output,"XlaEinsum(a=1x4096x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1XGrad,MXU,32,Compute,65568,65568,12669,0,0,0,0,0,0,0,0,65568,8192,0,0,100663296,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",137438953472,BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1XGrad,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 32, 128], [32, 128, 4096], [1, 4096, 4096]]",1,67108864,4096,1,4096,4096,4096,0,65568,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.1284997559783,1429.8133235724745,0.9917638023463893,0.19321801669898303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Bwd-Attention_encoder-Attention_output,"XlaEinsum(a=1x4096x32x128,b=32x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1YGrad,MXU,32,Compute,65568,65568,12669,0,0,0,0,0,0,0,0,65568,8192,0,0,100663296,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[32,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",137438953472,BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1YGrad,Einsum,33554432,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 32, 128], [32, 128, 4096], [1, 4096, 
4096]]",1,67108864,4096,1,4096,4096,4096,0,65568,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.1284997559783,1429.8133235724745,0.9917638023463893,0.19321801669898303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x4096x32,b=1x4096x32x128,eq=BLSN;BSND->BLND,memory_placements=1_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2XGrad,MXU,32,Memory,143582,131104,143582,0,0,0,0,0,0,0,0,131104,16384,0,0,1140850688,"DT_BFLOAT16:[1,4096,4096,32],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2XGrad,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 4096, 4096, 1], [1, 4096, 1, 128], [1, 4096, 1, 128]]",32,18350080,8192,32,4096,128,4096,0,131104,1140850688,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48988,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,957.2157615299967,7399.952640303102,0.4528977796119852,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x4096x32,b=1x4096x32x128,eq=BLSN;BSND->BLND,memory_placements=1_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2YGrad,MXU,32,Memory,143582,131104,143582,0,0,0,0,0,0,0,0,131104,16384,0,0,1140850688,"DT_BFLOAT16:[1,4096,4096,32],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2YGrad,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 4096, 4096, 1], [1, 4096, 1, 128], [1, 4096, 1, 
128]]",32,18350080,8192,32,4096,128,4096,0,131104,1140850688,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48988,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,957.2157615299967,7399.952640303102,0.4528977796119852,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"Pointwise Mul.(x=1x4096x4096x32,memory_placements=1_1,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVSoftmaxBackprop,VPU,32,Memory,270271,32768,270271,0,0,0,0,0,0,0,0,0,32768,0,0,2147483648,"DT_BFLOAT16:[1,4096,4096,32]","[DT_BFLOAT16:(1,4096,4096,32)]",536870912,BwdAttentionencoderAttentionSoftmaxQKVSoftmaxBackprop,Pointwise Mul.,0,[],Pointwise Mul.,,,,,0,,,,,0,32768,2147483648,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,38709,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.98641701107407,7399.980020053946,0.12124127264856384,0.9999973000072899,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x32x128,b=1x4096x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_1,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2XGrad,MXU,32,Memory,143582,131104,143582,0,0,0,0,0,0,0,0,131104,16384,0,0,1140850688,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,4096,32)]",137438953472,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2XGrad,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 4096, 1, 1], [1, 4096, 1, 1], [1, 4096, 4096, 1]]",32,35651584,8192,32,4096,4096,128,0,131104,1140850688,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48988,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,957.2157615299967,7399.952640303102,0.4528977796119852,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+21,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x32x128,b=1x4096x32x128,eq=BLND;BSND->BLSN,memory_placements=0_0_1,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2YGrad,MXU,32,Memory,143582,131104,143582,0,0,0,0,0,0,0,0,131104,16384,0,0,1140850688,"DT_BFLOAT16:[1,4096,32,128],DT_BFLOAT16:[1,4096,32,128]","[DT_BFLOAT16:(1,4096,4096,32)]",137438953472,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2YGrad,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 4096, 1, 1], [1, 4096, 1, 1], [1, 4096, 4096, 1]]",32,35651584,8192,32,4096,4096,128,0,131104,1140850688,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,48988,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,957.2157615299967,7399.952640303102,0.4528977796119852,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1XGrad,MXU,32,Compute,65568,65568,12669,0,0,0,0,0,0,0,0,65568,8192,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1XGrad,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,67108864,4096,1,4096,4096,4096,0,65568,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.1284997559783,1429.8133235724745,0.9917638023463893,0.19321801669898303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+21,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1YGrad,MXU,32,Compute,65568,65568,12669,0,0,0,0,0,0,0,0,65568,8192,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1YGrad,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,67108864,4096,1,4096,4096,4096,0,65568,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.1284997559783,1429.8133235724745,0.9917638023463893,0.19321801669898303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1XGrad,MXU,32,Compute,65568,65568,12669,0,0,0,0,0,0,0,0,65568,8192,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1XGrad,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,67108864,4096,1,4096,4096,4096,0,65568,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.1284997559783,1429.8133235724745,0.9917638023463893,0.19321801669898303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+21,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1YGrad,MXU,32,Compute,65568,65568,12669,0,0,0,0,0,0,0,0,65568,8192,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1YGrad,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,67108864,4096,1,4096,4096,4096,0,65568,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.1284997559783,1429.8133235724745,0.9917638023463893,0.19321801669898303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1XGrad,MXU,32,Compute,65568,65568,12669,0,0,0,0,0,0,0,0,65568,8192,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1XGrad,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,67108864,4096,1,4096,4096,4096,0,65568,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.1284997559783,1429.8133235724745,0.9917638023463893,0.19321801669898303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+21,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x32x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1YGrad,MXU,32,Compute,65568,65568,12669,0,0,0,0,0,0,0,0,65568,8192,0,0,100663296,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,32,128]","[DT_BFLOAT16:(1,4096,32,128)]",137438953472,BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1YGrad,Einsum,33554432,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 32, 128], [1, 4096, 32, 128]]",1,67108864,4096,1,4096,4096,4096,0,65568,100663296,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,17822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.1284997559783,1429.8133235724745,0.9917638023463893,0.19321801669898303,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,Softmax,"Softmax(x=1x4096x4096,memory_placements=1_1,type=DT_BFLOAT16)",SoftmaxSoftmaxBackprop,VPU,1,Memory,8446,4096,8446,0,0,0,0,0,0,0,0,0,4096,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",67108864,SoftmaxSoftmaxBackprop,Softmax,0,[],Softmax,,,,,0,,,,,0,4096,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1977,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.945638645512669,7399.952640303102,0.4849632962349041,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,softmax-bwd-dl/dy-22,"XlaEinsum(a=1x4096x4096,b=1x4096x4096,eq=BLV;BLV->BV,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwddldy22MatMuldldy,MXU,1,Compute,16384,16384,8447,0,0,0,0,0,0,0,0,0,16384,0,0,67117056,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096)]",33554432,softmaxbwddldy22MatMuldldy,Einsum,33554432,[],Einsum,"BLV,BLV->BV","[[1, 4096, 4096], [1, 4096, 4096], [1, 
4096]]",1,8194,65536,4096,1,1,4096,0,16384,67117056,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,953,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.048,3815.1629269123077,0.0009689922480620155,0.5155625576908524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,softmax-bwd-e1-22,"XlaEinsum(a=1x4096x4096,b=1x4096x4096,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde122MatMule1,MXU,1,Compute,16384,16384,8447,0,0,0,0,0,0,0,0,0,16384,0,0,67117056,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096)]",33554432,softmaxbwde122MatMule1,Einsum,33554432,[],Einsum,"BLN,BLN->BL","[[1, 4096, 4096], [1, 4096, 4096], [1, 4096]]",1,8194,65536,4096,1,1,4096,0,16384,67117056,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,953,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.048,3815.1629269123077,0.0009689922480620155,0.5155625576908524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,softmax-bwd-e2-22,"XlaEinsum(a=1x4096x4096,b=1x4096x4096,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde222MatMule2,MXU,1,Compute,16384,16384,8447,0,0,0,0,0,0,0,0,0,16384,0,0,67117056,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096)]",33554432,softmaxbwde222MatMule2,Einsum,33554432,[],Einsum,"BLN,BLN->BL","[[1, 4096, 4096], [1, 4096, 4096], [1, 4096]]",1,8194,65536,4096,1,1,4096,0,16384,67117056,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,953,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.048,3815.1629269123077,0.0009689922480620155,0.5155625576908524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,WeightUpdateOptimizerStates: Load optimizer states from 
HBM,"Abs(x=62679678976,type=DT_BFLOAT16)",WeightUpdateOptimizerStatesLoadoptimizerstatesfromHBMinput,VPU,1,Memory,31554055,3825664,31554055,0,0,0,0,0,0,0,0,0,3825664,0,0,250718715904,DT_BFLOAT16:[62679678976],[DT_BFLOAT16:(62679678976)],0,WeightUpdateOptimizerStatesLoadoptimizerstatesfromHBMinput,Input,0,[],Abs,,,,,0,,,,,0,3825664,250718715904,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4519343,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,7399.999778158465,0.0,0.9999999700214143,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,WeightUpdateWriteHBM,"Abs(x=15669919744,type=DT_BFLOAT16)",WeightUpdateWriteHBMoutput,VPU,1,Memory,7888514,956416,7888514,0,0,0,0,0,0,0,0,0,956416,0,0,62679678976,DT_BFLOAT16:[15669919744],[DT_BFLOAT16:(15669919744)],0,WeightUpdateWriteHBMoutput,Output,0,[],Abs,,,,,0,,,,,0,956416,62679678976,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1129835,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,7399.999543640285,0.0,0.9999999383297683,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v6p.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v6p.json new file mode 100644 index 0000000..4bf001b --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v6p.json @@ -0,0 +1,101 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 178643461, + "overlapped_compute_time_non_pp_ns": 50669690, + "compute_only_time_non_pp_ns": 76700550, + "memory_only_time_non_pp_ns": 51273221, + "ici_bound_time_non_pp_ns": 0, + "total_execution_time_chip_ns": 178643461, + "overlapped_compute_time_chip_ns": 50669690, + "compute_only_time_chip_ns": 76700550, + "memory_only_time_chip_ns": 51273221, + 
"ici_bound_time_chip_ns": 0, + "bounded_by_pp_chip": false, + "total_execution_time_pod_ns": 178643461, + "compute_only_time_pod_ns": 76700550, + "memory_only_time_pod_ns": 51273221, + "ici_bound_time_pod_ns": 0, + "bounded_by_pp_dcn": false, + "total_execution_time_ns": 178643461, + "mem_footprint_GB": 91.5625, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "6p", + "num_sa": 8, + "num_vu": 8, + "num_vu_ports": 8, + "hbm_bw_GBps": 7400.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 256, + "freq_GHz": 2.0, + "sa_dim": 256, + "hbm_size_GB": 192, + "ici_bw_GBps": 300.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 350.0, + "min_power_W": 1.0, + "avg_power_W": 1.0, + "max_power_W": 331.0, + "HBM_GBps_per_W": 123.5, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 1384.0, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 1.777942858, + "static_power_W_per_vu": 0.1554179582, + "static_power_vmem_W": 37.07309859, + "static_power_ici_W": 3.000278571, + "static_power_hbm_mc_W": 7.10422264, + "static_power_hbm_phy_W": 10.65633396, + "static_power_other_W": 41.27610279, + "dynamic_power_W_per_SA": 31.57742933, + "dynamic_power_W_per_VU": 0.7426048, + "dynamic_power_vmem_W": 28.1028608, + "dynamic_power_ici_W_per_GBps": 0.01262060716, + "dynamic_power_hbm_W_per_GBps": 0.008830769231, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 0, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 
1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v6p.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": false, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs32/inference-v6p_decode.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs32/inference-v6p_decode.json new file mode 100644 index 0000000..7f49aae --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs32/inference-v6p_decode.json @@ -0,0 +1,98 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 6116933632, + "overlapped_compute_time_non_pp_ns": 3413835776, + "compute_only_time_non_pp_ns": 0, + "memory_only_time_non_pp_ns": 2703097856, + "ici_bound_time_non_pp_ns": 0, + "total_execution_time_chip_ns": 6116933632, + "overlapped_compute_time_chip_ns": 3413835776, + "compute_only_time_chip_ns": 0, + "memory_only_time_chip_ns": 2703097856, + "ici_bound_time_chip_ns": 0, + "bounded_by_pp_chip": false, + "TPOT_ms_request": 11.947136, + "throughput_tokens_per_sec": 2678.466203113449, + "throughput_tokens_per_sec_request": 83.70206884729528, + "mem_footprint_GB": 319.0, + "out_of_memory": true, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "6p", + "num_sa": 8, + "num_vu": 8, + "num_vu_ports": 8, + "hbm_bw_GBps": 7400.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 
256, + "freq_GHz": 2.0, + "sa_dim": 256, + "hbm_size_GB": 192, + "ici_bw_GBps": 300.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 350.0, + "min_power_W": 1.0, + "avg_power_W": 1.0, + "max_power_W": 331.0, + "HBM_GBps_per_W": 123.5, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 1384.0, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 1.777942858, + "static_power_W_per_vu": 0.1554179582, + "static_power_vmem_W": 37.07309859, + "static_power_ici_W": 3.000278571, + "static_power_hbm_mc_W": 7.10422264, + "static_power_hbm_phy_W": 10.65633396, + "static_power_other_W": 41.27610279, + "dynamic_power_W_per_SA": 31.57742933, + "dynamic_power_W_per_VU": 0.7426048, + "dynamic_power_vmem_W": 28.1028608, + "dynamic_power_ici_W_per_GBps": 0.01262060716, + "dynamic_power_hbm_W_per_GBps": 0.008830769231, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 32, + "num_chips": 1, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 0, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 32, + "microbatch_size_ici": 32, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs32/inference-v6p.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": true, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + 
"batch_size_per_pod": 32, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv new file mode 100644 index 0000000..70eaef4 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv @@ -0,0 +1,39 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM 
Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,52084,23406,52084,0,0,0,0,0,0,0,0,0,23406,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,23406,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11301,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.288473696336687,599.9923200983028,0.4493839621709985,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,Concatenate dims before attn,AllGather(1x4096x2048->1x4096x4096),ConcatenatedimsbeforeattnAllGatherMHA3,ICINoCompute,32,ICI/NVLink,268436,0,104167,268436,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,ConcatenatedimsbeforeattnAllGatherMHA3,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,232.83017181004038,0.0,0.38805028635006733,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-Q-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",Q4MatMulQ,MXU,32,Compute,1498149,1498149,104167,0,0,0,0,0,0,0,0,1498149,93622,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,Q4MatMulQ,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 16, 128], [1, 4096, 16, 
128]]",1,29360128,16384,1,4096,2048,4096,0,1498149,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,198167,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.86958756171783,41.718146859891775,0.9410613256983208,0.0695302447664863,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-K-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",K4MatMulK,MXU,32,Compute,1498149,1498149,104167,0,0,0,0,0,0,0,0,1498149,93622,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,K4MatMulK,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,1498149,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,198167,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.86958756171783,41.718146859891775,0.9410613256983208,0.0695302447664863,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-V-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",V4MatMulV,MXU,32,Compute,1498149,1498149,104167,0,0,0,0,0,0,0,0,1498149,93622,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,V4MatMulV,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,1498149,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,198167,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.86958756171783,41.718146859891775,0.9410613256983208,0.0695302447664863,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,-FlashAttention-5,"FlashAttention(q=1x4096x16x128,k=1x4096x16x128,v=1x4096x16x128,memory_placements=[0, 0, 
0],type=DT_BFLOAT16)",FlashAttention5FlashAttention,MXU,32,Compute,2996298,2996298,104167,0,0,0,0,0,0,0,0,2996298,561735,0,0,67108864,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,16)]",18253611008,FlashAttention5FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,4227072,32768,16,4096,4096,128,374491,2996298,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,385436,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,6.092054598040648,20.859073429945887,0.12498470731930822,0.03476512238324315,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,-Attention_output-6,"XlaEinsum(a=1x4096x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",Attentionoutput6MatMulattnOutputattnAvgWo,MXU,32,Compute,1498149,1498149,130209,0,0,0,0,0,0,0,0,1498149,93622,0,0,83886080,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",68719476736,Attentionoutput6MatMulattnOutputattnAvgWo,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 64], [16, 64, 2048], [1, 4096, 2048]]",2,50331648,16384,1,4096,4096,2048,0,1498149,83886080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,200892,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.86958756171783,52.147683574864715,0.9410613256983208,0.08691280595810785,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Reduce and split attention results,"ReduceScatter(['1', '4096', '4096']->['1', '4096', 
'2048'])",ReduceandsplitattentionresultsAllReduceReduceScatter7,VPU,32,ICI/NVLink,134218,2048,52084,134218,16777216,16777216,0,0,0,0,0,0,2048,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandsplitattentionresultsAllReduceReduceScatter7,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,2048,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5961,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.062499873340386536,232.83017181004038,0.021798225913918296,0.38805028635006733,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,-Attention_layernorm-8,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",Attentionlayernorm8YnormLayerNormy,VPU,32,Memory,52084,23406,52084,0,0,0,0,0,0,0,0,0,23406,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,Attentionlayernorm8YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,23406,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11301,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.288473696336687,599.9923200983028,0.4493839621709985,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Gather results after Layernorm,AllGather(1x4096x2048->1x4096x4096),GatherresultsafterLayernormAllReduceAllGather8,ICINoCompute,32,ICI/NVLink,268436,0,104167,268436,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,GatherresultsafterLayernormAllReduceAllGather8,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,232.83017181004038,0.0,0.38805028635006733,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,All gather input of MLP before 
ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP9,ICINoCompute,32,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP9,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.015274063125625625,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,5243063,5243063,286459,0,0,0,0,0,0,0,0,5243063,327680,0,0,184549376,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 3584]]",2,81788928,57344,1,4096,7168,4096,0,5243063,184549376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,685355,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.873598805888854,32.78141040838151,0.9411436204595764,0.054635684013969184,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +11,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,5243063,5243063,286459,0,0,0,0,0,0,0,0,5243063,327680,0,0,184549376,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 
3584]]",2,81788928,57344,1,4096,7168,4096,0,5243063,184549376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,685355,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.873598805888854,32.78141040838151,0.9411436204595764,0.054635684013969184,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +12,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x7168,b=1x4096x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,182292,10240,182292,0,0,0,0,0,0,0,0,0,10240,0,0,117440512,"DT_BFLOAT16:[1,4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",29360128,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,10240,117440512,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,21633,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.1610609790884954,599.9989028591491,0.05617361156825314,0.9999981714319152,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,5243063,5243063,325521,0,0,0,0,0,0,0,0,5243063,327680,0,0,209715200,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1024], [1024, 2048], [1, 4096, 2048]]",2,50331648,57344,1,4096,4096,7168,0,5243063,209715200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,689442,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.873598805888854,37.251602736797174,0.9411436204595764,0.06208600456132862,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '4096', '4096']->['1', '4096', 
'2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP14,VPU,32,ICI/NVLink,134218,2048,52084,134218,16777216,16777216,0,0,0,0,0,0,2048,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP14,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,2048,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5961,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.062499873340386536,232.83017181004038,0.021798225913918296,0.38805028635006733,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x2048,b=1x4096x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,52084,2926,52084,0,0,0,0,0,0,0,0,0,2926,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",8388608,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,2926,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6181,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16105921204208587,599.9923200983028,0.056172995271374815,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +16,Attention-serving-decode-Input_layernorm,"LayerNorm(x=1x1x2048,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeInputlayernormXnormLayerNormX,VPU,16384,Memory,500,6,500,0,0,0,0,0,0,0,0,0,6,0,0,8192,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,2048)]",16384,AttentionservingdecodeInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,6,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.032768,15.2587890625,0.011428571428571429,0.025431315104166668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+17,AllGatherMHA-17,AllGather(1x1x2048->1x1x4096),AllGatherMHA17AllGather,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,8192,8192,0,0,0,0,0,0,0,0,0,16384,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,4096)]",0,AllGatherMHA17AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,4.582218937687688,0.0,0.007637031562812813,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,MXU,16384,Memory,26061,5851,26061,0,0,0,0,0,0,0,0,0,5851,0,0,16789504,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,1,16,128)]",16777216,AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 1, 4096], [4096, 16, 128], [1, 1, 16, 128]]",1,4200448,512,1,1,2048,4096,0,5851,16789504,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2726,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6437671616591842,599.9940175663587,0.013207539260668007,0.9999900292772645,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=2x4096x16x128,eq=BLM;TMND->BTLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,MXU,16384,Memory,52109,11702,52109,0,0,0,0,0,0,0,0,0,11702,0,0,33570816,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[2,4096,16,128]","[DT_BFLOAT16:(1,2,1,16,128)]",33554432,AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,Einsum,33554432,[],Einsum,"BLM,TMND->BTLND","[[1, 1, 2048], [2, 2048, 16, 128], [1, 2, 1, 16, 
128]]",1,8398848,1024,1,1,4096,4096,0,11702,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5452,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6439277667965227,599.9972900854459,0.013210834238702297,0.9999954834757432,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,MXU,16384,Memory,26252,5851,26252,0,0,0,0,0,0,0,0,0,5851,0,0,16912384,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,1,4096,16)]",16777216,AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 16, 128], [1, 4096, 16, 128], [1, 1, 4096, 16]]",1,1057024,512,16,1,4096,128,0,5851,16912384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2746,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6390833460307785,599.9880012862116,0.013111446010676098,0.9999800021436859,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x16x128,b=1x512x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,MXU,16384,Memory,3287,731,3287,0,0,0,0,0,0,0,0,0,731,0,0,2117632,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[1,512,16,128]","[DT_BFLOAT16:(1,1,512,16)]",2097152,AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 16, 128], [1, 512, 16, 128], [1, 1, 512, 16]]",1,132352,64,16,1,512,128,0,731,2117632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,343,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.638013994523882,599.9995394974521,0.013089507174941777,0.9999992324957535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+19,Attention-serving-decode-Softmax(Q*K)*V,"Softmax(x=1x1x4608x16,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,16384,Memory,500,103,500,0,0,0,0,0,0,0,0,0,103,0,0,294912,"DT_BFLOAT16:[1,1,4608,16]","[DT_BFLOAT16:(1,1,4608,16)]",294912,AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,103,294912,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,78,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.589824,549.31640625,0.2057142857142857,0.91552734375,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,MXU,16384,Memory,26252,5851,26252,0,0,0,0,0,0,0,0,0,5851,0,0,16912384,"DT_BFLOAT16:[1,1,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,1,16,128)]",16777216,AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 4096, 16], [1, 4096, 16, 128], [1, 1, 16, 128]]",1,264448,512,16,1,128,4096,0,5851,16912384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2746,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6390833460307785,599.9880012862116,0.013111446010676098,0.9999800021436859,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x512x16,b=1x512x16x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,MXU,16384,Memory,3287,731,3287,0,0,0,0,0,0,0,0,0,731,0,0,2117632,"DT_BFLOAT16:[1,1,512,16],DT_BFLOAT16:[1,512,16,128]","[DT_BFLOAT16:(1,1,16,128)]",2097152,AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 512, 16], [1, 512, 
16, 128], [1, 1, 16, 128]]",1,132352,64,16,1,128,512,0,731,2117632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,343,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.638013994523882,599.9995394974521,0.013089507174941777,0.9999992324957535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"Add(a=1x1x16x128,b=1x1x16x128,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,8192,"DT_BFLOAT16:[1,1,16,128]","[DT_BFLOAT16:(1,1,16,128)]",2048,AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,Add,0,[],Add,,,,,0,,,,,0,2,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,15.2587890625,0.0014285714285714286,0.025431315104166668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Attention-serving-decode-Attention_output,"XlaEinsum(a=1x1x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,16384,Memory,26061,5851,26061,0,0,0,0,0,0,0,0,0,5851,0,0,16789504,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,1,4096)]",16777216,AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 1, 16, 128], [16, 128, 4096], [1, 1, 4096]]",1,8398848,512,1,1,4096,2048,0,5851,16789504,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2726,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6437671616591842,599.9940175663587,0.013207539260668007,0.9999900292772645,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Reduce attention and scatter before LN,"ReduceScatter(['1', '1', '4096']->['1', '1', 
'2048'])",ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter21,VPU,16384,ICI/NVLink,3330,1,500,3330,4096,4096,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,2048)]",2048,ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter21,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.000615015015015015,2.291109468843844,0.00021450021450021448,0.0038185157814064064,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,Attention-serving-decode-Attention_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionlayernormYnormLayerNormy,VPU,16384,Memory,500,12,500,0,0,0,0,0,0,0,0,0,12,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,12,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,55,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.022857142857142857,0.050862630208333336,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,AllGatherMHA-23,AllGather(1x1x2048->1x1x4096),AllGatherMHA23AllGather,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,8192,8192,0,0,0,0,0,0,0,0,0,16384,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,4096)]",0,AllGatherMHA23AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,4.582218937687688,0.0,0.007637031562812813,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,All gather input of MLP before 
ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP24,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP24,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.015274063125625625,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +25,Fwd-FFN-serving-decoder-FFgate,"XlaEinsum(a=1x1x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,16384,Memory,91181,20480,91181,0,0,0,0,0,0,0,0,0,20480,0,0,58742784,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,1,7168)]",58720256,FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 1, 2048], [2048, 7168], [1, 1, 7168]]",1,14696448,1792,1,1,7168,4096,0,20480,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9540,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.643996622103289,599.9986930935275,0.013212246875477798,0.9999978218225459,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,Fwd-FFN-serving-decoder-FFup,"XlaEinsum(a=1x1x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,MXU,16384,Memory,91181,20480,91181,0,0,0,0,0,0,0,0,0,20480,0,0,58742784,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,1,7168)]",58720256,FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 1, 2048], [2048, 7168], [1, 1, 
7168]]",1,14696448,1792,1,1,7168,4096,0,20480,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9540,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.643996622103289,599.9986930935275,0.013212246875477798,0.9999978218225459,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,Fwd-FFN-serving-decoder-FFgate_up,"Mul(a=1x1x7168,b=1x1x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,VPU,16384,Memory,500,3,500,0,0,0,0,0,0,0,0,0,3,0,0,28672,"DT_BFLOAT16:[1,1,7168]","[DT_BFLOAT16:(1,1,7168)]",7168,FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,3,28672,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.014336,53.40576171875,0.005,0.08900960286458333,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,Fwd-FFN-serving-decoder-FFoutput,"XlaEinsum(a=1x1x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,16384,Memory,91181,20480,91181,0,0,0,0,0,0,0,0,0,20480,0,0,58742784,"DT_BFLOAT16:[1,1,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,1,4096)]",58720256,FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 1, 3584], [3584, 4096], [1, 1, 4096]]",1,8398848,1792,1,1,4096,7168,0,20480,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9540,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.643996622103289,599.9986930935275,0.013212246875477798,0.9999978218225459,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +29,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '1', '4096']->['1', '1', 
'2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP29,VPU,16384,ICI/NVLink,3330,1,500,3330,4096,4096,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,2048)]",2048,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP29,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.000615015015015015,2.291109468843844,0.00021450021450021448,0.0038185157814064064,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +30,FFN-serving-decoder-AttnPlusFFn,"Add(a=1x1x2048,b=1x1x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,8192,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,2048)]",2048,FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,Add,0,[],Add,,,,,0,,,,,0,2,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,15.2587890625,0.0014285714285714286,0.025431315104166668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_decode.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_decode.csv new file mode 100644 index 0000000..3d25aff --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_decode.csv @@ -0,0 +1,22 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor 
Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor 
+16,Attention-serving-decode-Input_layernorm,"LayerNorm(x=1x1x2048,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeInputlayernormXnormLayerNormX,VPU,16384,Memory,500,6,500,0,0,0,0,0,0,0,0,0,6,0,0,8192,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,2048)]",16384,AttentionservingdecodeInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,6,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.032768,15.2587890625,0.011428571428571429,0.025431315104166668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,AllGatherMHA-17,AllGather(1x1x2048->1x1x4096),AllGatherMHA17AllGather,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,8192,8192,0,0,0,0,0,0,0,0,0,16384,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,4096)]",0,AllGatherMHA17AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,4.582218937687688,0.0,0.007637031562812813,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,MXU,16384,Memory,26061,5851,26061,0,0,0,0,0,0,0,0,0,5851,0,0,16789504,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,1,16,128)]",16777216,AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 1, 4096], [4096, 16, 128], [1, 1, 16, 128]]",1,4200448,512,1,1,2048,4096,0,5851,16789504,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2726,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6437671616591842,599.9940175663587,0.013207539260668007,0.9999900292772645,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+18,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=2x4096x16x128,eq=BLM;TMND->BTLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,MXU,16384,Memory,52109,11702,52109,0,0,0,0,0,0,0,0,0,11702,0,0,33570816,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[2,4096,16,128]","[DT_BFLOAT16:(1,2,1,16,128)]",33554432,AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,Einsum,33554432,[],Einsum,"BLM,TMND->BTLND","[[1, 1, 2048], [2, 2048, 16, 128], [1, 2, 1, 16, 128]]",1,8398848,1024,1,1,4096,4096,0,11702,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5452,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6439277667965227,599.9972900854459,0.013210834238702297,0.9999954834757432,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,MXU,16384,Memory,26252,5851,26252,0,0,0,0,0,0,0,0,0,5851,0,0,16912384,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,1,4096,16)]",16777216,AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 16, 128], [1, 4096, 16, 128], [1, 1, 4096, 16]]",1,1057024,512,16,1,4096,128,0,5851,16912384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2746,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6390833460307785,599.9880012862116,0.013111446010676098,0.9999800021436859,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x16x128,b=1x512x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,MXU,16384,Memory,3287,731,3287,0,0,0,0,0,0,0,0,0,731,0,0,2117632,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[1,512,16,128]","[DT_BFLOAT16:(1,1,512,16)]",2097152,AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 16, 128], [1, 512, 16, 128], [1, 1, 512, 16]]",1,132352,64,16,1,512,128,0,731,2117632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,343,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.638013994523882,599.9995394974521,0.013089507174941777,0.9999992324957535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"Softmax(x=1x1x4608x16,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,16384,Memory,500,103,500,0,0,0,0,0,0,0,0,0,103,0,0,294912,"DT_BFLOAT16:[1,1,4608,16]","[DT_BFLOAT16:(1,1,4608,16)]",294912,AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,103,294912,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,78,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.589824,549.31640625,0.2057142857142857,0.91552734375,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,MXU,16384,Memory,26252,5851,26252,0,0,0,0,0,0,0,0,0,5851,0,0,16912384,"DT_BFLOAT16:[1,1,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,1,16,128)]",16777216,AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 4096, 16], [1, 4096, 16, 128], [1, 1, 16, 
128]]",1,264448,512,16,1,128,4096,0,5851,16912384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2746,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6390833460307785,599.9880012862116,0.013111446010676098,0.9999800021436859,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x512x16,b=1x512x16x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,MXU,16384,Memory,3287,731,3287,0,0,0,0,0,0,0,0,0,731,0,0,2117632,"DT_BFLOAT16:[1,1,512,16],DT_BFLOAT16:[1,512,16,128]","[DT_BFLOAT16:(1,1,16,128)]",2097152,AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 512, 16], [1, 512, 16, 128], [1, 1, 16, 128]]",1,132352,64,16,1,128,512,0,731,2117632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,343,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.638013994523882,599.9995394974521,0.013089507174941777,0.9999992324957535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"Add(a=1x1x16x128,b=1x1x16x128,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,8192,"DT_BFLOAT16:[1,1,16,128]","[DT_BFLOAT16:(1,1,16,128)]",2048,AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,Add,0,[],Add,,,,,0,,,,,0,2,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,15.2587890625,0.0014285714285714286,0.025431315104166668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+20,Attention-serving-decode-Attention_output,"XlaEinsum(a=1x1x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,16384,Memory,26061,5851,26061,0,0,0,0,0,0,0,0,0,5851,0,0,16789504,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,1,4096)]",16777216,AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 1, 16, 128], [16, 128, 4096], [1, 1, 4096]]",1,8398848,512,1,1,4096,2048,0,5851,16789504,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2726,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.6437671616591842,599.9940175663587,0.013207539260668007,0.9999900292772645,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Reduce attention and scatter before LN,"ReduceScatter(['1', '1', '4096']->['1', '1', '2048'])",ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter21,VPU,16384,ICI/NVLink,3330,1,500,3330,4096,4096,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,2048)]",2048,ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter21,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.000615015015015015,2.291109468843844,0.00021450021450021448,0.0038185157814064064,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+22,Attention-serving-decode-Attention_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionlayernormYnormLayerNormy,VPU,16384,Memory,500,12,500,0,0,0,0,0,0,0,0,0,12,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,12,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,55,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.022857142857142857,0.050862630208333336,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,AllGatherMHA-23,AllGather(1x1x2048->1x1x4096),AllGatherMHA23AllGather,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,8192,8192,0,0,0,0,0,0,0,0,0,16384,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,4096)]",0,AllGatherMHA23AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,4.582218937687688,0.0,0.007637031562812813,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,All gather input of MLP before ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP24,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP24,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.015274063125625625,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+25,Fwd-FFN-serving-decoder-FFgate,"XlaEinsum(a=1x1x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,16384,Memory,91181,20480,91181,0,0,0,0,0,0,0,0,0,20480,0,0,58742784,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,1,7168)]",58720256,FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 1, 2048], [2048, 7168], [1, 1, 7168]]",1,14696448,1792,1,1,7168,4096,0,20480,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9540,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.643996622103289,599.9986930935275,0.013212246875477798,0.9999978218225459,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,Fwd-FFN-serving-decoder-FFup,"XlaEinsum(a=1x1x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,MXU,16384,Memory,91181,20480,91181,0,0,0,0,0,0,0,0,0,20480,0,0,58742784,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,1,7168)]",58720256,FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 1, 2048], [2048, 7168], [1, 1, 7168]]",1,14696448,1792,1,1,7168,4096,0,20480,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9540,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.643996622103289,599.9986930935275,0.013212246875477798,0.9999978218225459,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+27,Fwd-FFN-serving-decoder-FFgate_up,"Mul(a=1x1x7168,b=1x1x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,VPU,16384,Memory,500,3,500,0,0,0,0,0,0,0,0,0,3,0,0,28672,"DT_BFLOAT16:[1,1,7168]","[DT_BFLOAT16:(1,1,7168)]",7168,FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,3,28672,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.014336,53.40576171875,0.005,0.08900960286458333,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,Fwd-FFN-serving-decoder-FFoutput,"XlaEinsum(a=1x1x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,16384,Memory,91181,20480,91181,0,0,0,0,0,0,0,0,0,20480,0,0,58742784,"DT_BFLOAT16:[1,1,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,1,4096)]",58720256,FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 1, 3584], [3584, 4096], [1, 1, 4096]]",1,8398848,1792,1,1,4096,7168,0,20480,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9540,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.643996622103289,599.9986930935275,0.013212246875477798,0.9999978218225459,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +29,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '1', '4096']->['1', '1', 
'2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP29,VPU,16384,ICI/NVLink,3330,1,500,3330,4096,4096,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,2048)]",2048,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP29,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.000615015015015015,2.291109468843844,0.00021450021450021448,0.0038185157814064064,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +30,FFN-serving-decoder-AttnPlusFFn,"Add(a=1x1x2048,b=1x1x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,8192,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,2048)]",2048,FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,Add,0,[],Add,,,,,0,,,,,0,2,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,15.2587890625,0.0014285714285714286,0.025431315104166668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_decode.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_decode.json new file mode 100644 index 0000000..f82eeff --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_decode.json @@ -0,0 +1,98 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 7479328768, + "overlapped_compute_time_non_pp_ns": 1607892992, + "compute_only_time_non_pp_ns": 0, + "memory_only_time_non_pp_ns": 5598674944, + "ici_bound_time_non_pp_ns": 272760832, + "total_execution_time_chip_ns": 7479328768, + 
"overlapped_compute_time_chip_ns": 1607892992, + "compute_only_time_chip_ns": 0, + "memory_only_time_chip_ns": 5598674944, + "ici_bound_time_chip_ns": 272760832, + "bounded_by_pp_chip": false, + "TPOT_ms_request": 14.608064, + "throughput_tokens_per_sec": 68.45534083092736, + "throughput_tokens_per_sec_request": 68.45534083092736, + "mem_footprint_GB": 11.28125, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "2", + "num_sa": 2, + "num_vu": 4, + "num_vu_ports": 2, + "hbm_bw_GBps": 600.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 32, + "freq_GHz": 0.7, + "sa_dim": 128, + "hbm_size_GB": 16, + "ici_bw_GBps": 125.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 280.0, + "min_power_W": 1.0, + "avg_power_W": 229.0, + "max_power_W": 280.0, + "HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 28.869, + "ICI_topology": "TORUS_2D", + "embodied_carbon_kgCO2": 296.2083333, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 2.12, + "static_power_W_per_vu": 0.74127482825, + "static_power_vmem_W": 12.93490069, + "static_power_ici_W": 6.36, + "static_power_hbm_mc_W": 1.908, + "static_power_hbm_phy_W": 2.862, + "static_power_other_W": 21.73, + "dynamic_power_W_per_SA": 22.55530667, + "dynamic_power_W_per_VU": 2.121728, + "dynamic_power_vmem_W": 22.2144, + "dynamic_power_ici_W_per_GBps": 0.0247047779, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 2, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 2, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 2, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + 
"microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": true, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_prefill.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_prefill.csv new file mode 100644 index 0000000..f14e6ed --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_prefill.csv @@ -0,0 +1,18 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem 
time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,52084,23406,52084,0,0,0,0,0,0,0,0,0,23406,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,23406,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11301,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.288473696336687,599.9923200983028,0.4493839621709985,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,Concatenate dims before 
attn,AllGather(1x4096x2048->1x4096x4096),ConcatenatedimsbeforeattnAllGatherMHA3,ICINoCompute,32,ICI/NVLink,268436,0,104167,268436,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,ConcatenatedimsbeforeattnAllGatherMHA3,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,232.83017181004038,0.0,0.38805028635006733,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-Q-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",Q4MatMulQ,MXU,32,Compute,1498149,1498149,104167,0,0,0,0,0,0,0,0,1498149,93622,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,Q4MatMulQ,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,1498149,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,198167,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.86958756171783,41.718146859891775,0.9410613256983208,0.0695302447664863,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-K-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",K4MatMulK,MXU,32,Compute,1498149,1498149,104167,0,0,0,0,0,0,0,0,1498149,93622,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,K4MatMulK,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,1498149,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,198167,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.86958756171783,41.718146859891775,0.9410613256983208,0.0695302447664863,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+4,-V-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",V4MatMulV,MXU,32,Compute,1498149,1498149,104167,0,0,0,0,0,0,0,0,1498149,93622,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,V4MatMulV,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,1498149,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,198167,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.86958756171783,41.718146859891775,0.9410613256983208,0.0695302447664863,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,-FlashAttention-5,"FlashAttention(q=1x4096x16x128,k=1x4096x16x128,v=1x4096x16x128,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",FlashAttention5FlashAttention,MXU,32,Compute,2996298,2996298,104167,0,0,0,0,0,0,0,0,2996298,561735,0,0,67108864,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,16)]",18253611008,FlashAttention5FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,4227072,32768,16,4096,4096,128,374491,2996298,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,385436,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,6.092054598040648,20.859073429945887,0.12498470731930822,0.03476512238324315,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,-Attention_output-6,"XlaEinsum(a=1x4096x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",Attentionoutput6MatMulattnOutputattnAvgWo,MXU,32,Compute,1498149,1498149,130209,0,0,0,0,0,0,0,0,1498149,93622,0,0,83886080,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",68719476736,Attentionoutput6MatMulattnOutputattnAvgWo,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 64], [16, 64, 2048], [1, 4096, 
2048]]",2,50331648,16384,1,4096,4096,2048,0,1498149,83886080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,200892,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.86958756171783,52.147683574864715,0.9410613256983208,0.08691280595810785,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Reduce and split attention results,"ReduceScatter(['1', '4096', '4096']->['1', '4096', '2048'])",ReduceandsplitattentionresultsAllReduceReduceScatter7,VPU,32,ICI/NVLink,134218,2048,52084,134218,16777216,16777216,0,0,0,0,0,0,2048,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandsplitattentionresultsAllReduceReduceScatter7,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,2048,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5961,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.062499873340386536,232.83017181004038,0.021798225913918296,0.38805028635006733,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,-Attention_layernorm-8,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",Attentionlayernorm8YnormLayerNormy,VPU,32,Memory,52084,23406,52084,0,0,0,0,0,0,0,0,0,23406,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,Attentionlayernorm8YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,23406,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11301,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.288473696336687,599.9923200983028,0.4493839621709985,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Gather results after 
Layernorm,AllGather(1x4096x2048->1x4096x4096),GatherresultsafterLayernormAllReduceAllGather8,ICINoCompute,32,ICI/NVLink,268436,0,104167,268436,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,GatherresultsafterLayernormAllReduceAllGather8,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,232.83017181004038,0.0,0.38805028635006733,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,All gather input of MLP before ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP9,ICINoCompute,32,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP9,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.015274063125625625,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,5243063,5243063,286459,0,0,0,0,0,0,0,0,5243063,327680,0,0,184549376,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 3584]]",2,81788928,57344,1,4096,7168,4096,0,5243063,184549376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,685355,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.873598805888854,32.78141040838151,0.9411436204595764,0.054635684013969184,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+11,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,5243063,5243063,286459,0,0,0,0,0,0,0,0,5243063,327680,0,0,184549376,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 3584]]",2,81788928,57344,1,4096,7168,4096,0,5243063,184549376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,685355,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.873598805888854,32.78141040838151,0.9411436204595764,0.054635684013969184,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +12,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x7168,b=1x4096x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,182292,10240,182292,0,0,0,0,0,0,0,0,0,10240,0,0,117440512,"DT_BFLOAT16:[1,4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",29360128,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,10240,117440512,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,21633,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.1610609790884954,599.9989028591491,0.05617361156825314,0.9999981714319152,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,5243063,5243063,325521,0,0,0,0,0,0,0,0,5243063,327680,0,0,209715200,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1024], [1024, 2048], [1, 4096, 
2048]]",2,50331648,57344,1,4096,4096,7168,0,5243063,209715200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,689442,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,45.873598805888854,37.251602736797174,0.9411436204595764,0.06208600456132862,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '4096', '4096']->['1', '4096', '2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP14,VPU,32,ICI/NVLink,134218,2048,52084,134218,16777216,16777216,0,0,0,0,0,0,2048,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP14,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,2048,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5961,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.062499873340386536,232.83017181004038,0.021798225913918296,0.38805028635006733,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x2048,b=1x4096x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,52084,2926,52084,0,0,0,0,0,0,0,0,0,2926,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",8388608,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,2926,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6181,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.16105921204208587,599.9923200983028,0.056172995271374815,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_prefill.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_prefill.json new file mode 100644 index 0000000..12f3701 --- /dev/null 
+++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2_prefill.json @@ -0,0 +1,97 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 827688480, + "overlapped_compute_time_non_pp_ns": 48300480, + "compute_only_time_non_pp_ns": 744728544, + "memory_only_time_non_pp_ns": 8914112, + "ici_bound_time_non_pp_ns": 25745344, + "total_execution_time_chip_ns": 827688480, + "overlapped_compute_time_chip_ns": 48300480, + "compute_only_time_chip_ns": 744728544, + "memory_only_time_chip_ns": 8914112, + "ici_bound_time_chip_ns": 25745344, + "bounded_by_pp_chip": false, + "throughput_tokens_per_sec": 4948.72177029696, + "TTFT_sec": 0.82768848, + "mem_footprint_GB": 10.75, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "2", + "num_sa": 2, + "num_vu": 4, + "num_vu_ports": 2, + "hbm_bw_GBps": 600.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 32, + "freq_GHz": 0.7, + "sa_dim": 128, + "hbm_size_GB": 16, + "ici_bw_GBps": 125.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 280.0, + "min_power_W": 1.0, + "avg_power_W": 229.0, + "max_power_W": 280.0, + "HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 28.869, + "ICI_topology": "TORUS_2D", + "embodied_carbon_kgCO2": 296.2083333, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 2.12, + "static_power_W_per_vu": 0.74127482825, + "static_power_vmem_W": 12.93490069, + "static_power_ici_W": 6.36, + "static_power_hbm_mc_W": 1.908, + "static_power_hbm_phy_W": 2.862, + "static_power_other_W": 21.73, + "dynamic_power_W_per_SA": 22.55530667, + "dynamic_power_W_per_VU": 2.121728, + "dynamic_power_vmem_W": 22.2144, + "dynamic_power_ici_W_per_GBps": 0.0247047779, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + 
"enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 2, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 2, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 2, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v2.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": true, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv new file mode 100644 index 0000000..90e8037 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv @@ -0,0 +1,39 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output 
Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor 
+2,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,34723,17430,34723,0,0,0,0,0,0,0,0,0,17430,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,17430,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8415,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9326919908994038,899.9798404515739,0.5019666282879519,0.9999776005017488,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,Concatenate dims before attn,AllGather(1x4096x2048->1x4096x4096),ConcatenatedimsbeforeattnAllGatherMHA3,ICINoCompute,32,ICI/NVLink,204601,0,69445,204601,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,ConcatenatedimsbeforeattnAllGatherMHA3,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,305.4726027732025,0.0,0.33941400308133607,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-Q-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",Q4MatMulQ,MXU,32,Compute,557822,557822,69445,0,0,0,0,0,0,0,0,557822,69719,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,Q4MatMulQ,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,557822,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,147571,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.19248207492713,112.04290974540265,0.9695773555471956,0.12449212193933629,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+4,-K-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",K4MatMulK,MXU,32,Compute,557822,557822,69445,0,0,0,0,0,0,0,0,557822,69719,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,K4MatMulK,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,557822,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,147571,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.19248207492713,112.04290974540265,0.9695773555471956,0.12449212193933629,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-V-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",V4MatMulV,MXU,32,Compute,557822,557822,69445,0,0,0,0,0,0,0,0,557822,69719,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,V4MatMulV,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,557822,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,147571,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.19248207492713,112.04290974540265,0.9695773555471956,0.12449212193933629,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,-FlashAttention-5,"FlashAttention(q=1x4096x16x128,k=1x4096x16x128,v=1x4096x16x128,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",FlashAttention5FlashAttention,MXU,32,Compute,1115644,1115644,69445,0,0,0,0,0,0,0,0,1115644,418314,0,0,67108864,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,16)]",18253611008,FlashAttention5FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 
128]",,4227072,32768,16,4096,4096,128,278876,1115644,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,287027,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,16.36150152557626,56.02145487270133,0.1287719925336119,0.06224606096966814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,-Attention_output-6,"XlaEinsum(a=1x4096x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",Attentionoutput6MatMulattnOutputattnAvgWo,MXU,32,Compute,557822,557822,86806,0,0,0,0,0,0,0,0,557822,69719,0,0,83886080,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",68719476736,Attentionoutput6MatMulattnOutputattnAvgWo,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 64], [16, 64, 2048], [1, 4096, 2048]]",2,50331648,16384,1,4096,4096,2048,0,557822,83886080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,149601,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.19248207492713,140.05363718175332,0.9695773555471956,0.15561515242417034,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Reduce and split attention results,"ReduceScatter(['1', '4096', '4096']->['1', '4096', '2048'])",ReduceandsplitattentionresultsAllReduceReduceScatter7,VPU,32,ICI/NVLink,102301,2048,34723,102301,16777216,16777216,0,0,0,0,0,0,2048,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandsplitattentionresultsAllReduceReduceScatter7,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,2048,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4570,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.08199927664441208,305.47110976432293,0.021297185797356028,0.33941234418258104,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+8,-Attention_layernorm-8,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",Attentionlayernorm8YnormLayerNormy,VPU,32,Memory,34723,17430,34723,0,0,0,0,0,0,0,0,0,17430,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,Attentionlayernorm8YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,17430,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8415,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9326919908994038,899.9798404515739,0.5019666282879519,0.9999776005017488,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Gather results after Layernorm,AllGather(1x4096x2048->1x4096x4096),GatherresultsafterLayernormAllReduceAllGather8,ICINoCompute,32,ICI/NVLink,204601,0,69445,204601,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,GatherresultsafterLayernormAllReduceAllGather8,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,305.4726027732025,0.0,0.33941400308133607,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,All gather input of MLP before ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP9,ICINoCompute,32,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP9,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.010182708750417083,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+10,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,1952205,1952205,190973,0,0,0,0,0,0,0,0,1952205,244017,0,0,184549376,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 3584]]",2,81788928,57344,1,4096,7168,4096,0,1952205,184549376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,510371,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.2033360103063,88.04147105452553,0.9696627806460731,0.0978238567272506,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +11,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,1952205,1952205,190973,0,0,0,0,0,0,0,0,1952205,244017,0,0,184549376,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 3584]]",2,81788928,57344,1,4096,7168,4096,0,1952205,184549376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,510371,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.2033360103063,88.04147105452553,0.9696627806460731,0.0978238567272506,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+12,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x7168,b=1x4096x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,121528,7626,121528,0,0,0,0,0,0,0,0,0,7626,0,0,117440512,"DT_BFLOAT16:[1,4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",29360128,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,7626,117440512,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24159146863274308,899.9983542887236,0.06274711930496361,0.999998171431915,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,1952205,1952205,217014,0,0,0,0,0,0,0,0,1952205,244017,0,0,209715200,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1024], [1024, 2048], [1, 4096, 2048]]",2,50331648,57344,1,4096,4096,7168,0,1952205,209715200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,513414,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.2033360103063,100.04712619832446,0.9696627806460731,0.11116347355369385,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '4096', '4096']->['1', '4096', 
'2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP14,VPU,32,ICI/NVLink,102301,2048,34723,102301,16777216,16777216,0,0,0,0,0,0,2048,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP14,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,2048,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4570,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.08199927664441208,305.47110976432293,0.021297185797356028,0.33941234418258104,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x2048,b=1x4096x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,34723,2179,34723,0,0,0,0,0,0,0,0,0,2179,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",8388608,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,2179,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4603,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24158649886242547,899.9798404515739,0.06274582853599399,0.9999776005017488,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +16,Attention-serving-decode-Input_layernorm,"LayerNorm(x=1x1x2048,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeInputlayernormXnormLayerNormX,VPU,16384,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,8192,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,2048)]",16384,AttentionservingdecodeInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,5,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.032768,15.2587890625,0.00851063829787234,0.016954210069444444,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+17,AllGatherMHA-17,AllGather(1x1x2048->1x1x4096),AllGatherMHA17AllGather,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,8192,8192,0,0,0,0,0,0,0,0,0,16384,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,4096)]",0,AllGatherMHA17AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,4.582218937687688,0.0,0.0050913543752085415,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,MXU,16384,Memory,17374,4357,17374,0,0,0,0,0,0,0,0,0,4357,0,0,16789504,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,1,16,128)]",16777216,AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 1, 4096], [4096, 16, 128], [1, 1, 16, 128]]",1,4200448,512,1,1,2048,4096,0,4357,16789504,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2030,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9656507424887764,899.9910263495381,0.007600083036844743,0.9999900292772645,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=2x4096x16x128,eq=BLM;TMND->BTLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,MXU,16384,Memory,34740,8714,34740,0,0,0,0,0,0,0,0,0,8714,0,0,33570816,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[2,4096,16,128]","[DT_BFLOAT16:(1,2,1,16,128)]",33554432,AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,Einsum,33554432,[],Einsum,"BLM,TMND->BTLND","[[1, 1, 2048], [2, 2048, 16, 128], [1, 2, 1, 16, 
128]]",1,8398848,1024,1,1,4096,4096,0,8714,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4060,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9658731145653425,899.9786640490069,0.007601833199892951,0.9999762933877854,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,MXU,16384,Memory,17501,4357,17501,0,0,0,0,0,0,0,0,0,4357,0,0,16912384,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,1,4096,16)]",16777216,AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 16, 128], [1, 4096, 16, 128], [1, 1, 4096, 16]]",1,1057024,512,16,1,4096,128,0,4357,16912384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2045,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9586432775269984,899.9991434641234,0.0075449313000480285,0.9999990482934705,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x16x128,b=1x512x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,MXU,16384,Memory,2192,544,2192,0,0,0,0,0,0,0,0,0,544,0,0,2117632,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[1,512,16,128]","[DT_BFLOAT16:(1,1,512,16)]",2097152,AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 16, 128], [1, 512, 16, 128], [1, 1, 512, 16]]",1,132352,64,16,1,512,128,0,544,2117632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9567299270072993,899.7255868285242,0.007529872415724256,0.9996950964761381,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+19,Attention-serving-decode-Softmax(Q*K)*V,"Softmax(x=1x1x4608x16,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,16384,Memory,500,77,500,0,0,0,0,0,0,0,0,0,77,0,0,294912,"DT_BFLOAT16:[1,1,4608,16]","[DT_BFLOAT16:(1,1,4608,16)]",294912,AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,77,294912,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,77,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.589824,549.31640625,0.15319148936170213,0.6103515625,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,MXU,16384,Memory,17501,4357,17501,0,0,0,0,0,0,0,0,0,4357,0,0,16912384,"DT_BFLOAT16:[1,1,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,1,16,128)]",16777216,AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 4096, 16], [1, 4096, 16, 128], [1, 1, 16, 128]]",1,264448,512,16,1,128,4096,0,4357,16912384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2045,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9586432775269984,899.9991434641234,0.0075449313000480285,0.9999990482934705,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x512x16,b=1x512x16x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,MXU,16384,Memory,2192,544,2192,0,0,0,0,0,0,0,0,0,544,0,0,2117632,"DT_BFLOAT16:[1,1,512,16],DT_BFLOAT16:[1,512,16,128]","[DT_BFLOAT16:(1,1,16,128)]",2097152,AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 512, 16], [1, 512, 
16, 128], [1, 1, 16, 128]]",1,132352,64,16,1,128,512,0,544,2117632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9567299270072993,899.7255868285242,0.007529872415724256,0.9996950964761381,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"Add(a=1x1x16x128,b=1x1x16x128,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,8192,"DT_BFLOAT16:[1,1,16,128]","[DT_BFLOAT16:(1,1,16,128)]",2048,AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,Add,0,[],Add,,,,,0,,,,,0,2,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,58,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,15.2587890625,0.0010638297872340426,0.016954210069444444,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Attention-serving-decode-Attention_output,"XlaEinsum(a=1x1x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,16384,Memory,17374,4357,17374,0,0,0,0,0,0,0,0,0,4357,0,0,16789504,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,1,4096)]",16777216,AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 1, 16, 128], [16, 128, 4096], [1, 1, 4096]]",1,8398848,512,1,1,4096,2048,0,4357,16789504,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2030,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9656507424887764,899.9910263495381,0.007600083036844743,0.9999900292772645,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Reduce attention and scatter before LN,"ReduceScatter(['1', '1', '4096']->['1', '1', 
'2048'])",ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter21,VPU,16384,ICI/NVLink,3330,1,500,3330,4096,4096,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,2048)]",2048,ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter21,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,58,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.000615015015015015,2.291109468843844,0.00015973420228739376,0.0025456771876042708,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,Attention-serving-decode-Attention_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionlayernormYnormLayerNormy,VPU,16384,Memory,500,9,500,0,0,0,0,0,0,0,0,0,9,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,9,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.01702127659574468,0.03390842013888889,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,AllGatherMHA-23,AllGather(1x1x2048->1x1x4096),AllGatherMHA23AllGather,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,8192,8192,0,0,0,0,0,0,0,0,0,16384,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,4096)]",0,AllGatherMHA23AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,4.582218937687688,0.0,0.0050913543752085415,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,All gather input of MLP before 
ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP24,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP24,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.010182708750417083,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +25,Fwd-FFN-serving-decoder-FFgate,"XlaEinsum(a=1x1x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,16384,Memory,60788,15251,60788,0,0,0,0,0,0,0,0,0,15251,0,0,58742784,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,1,7168)]",58720256,FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 1, 2048], [2048, 7168], [1, 1, 7168]]",1,14696448,1792,1,1,7168,4096,0,15251,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9659843390142792,899.988169292639,0.007602708583725274,0.9999868547695989,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,Fwd-FFN-serving-decoder-FFup,"XlaEinsum(a=1x1x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,MXU,16384,Memory,60788,15251,60788,0,0,0,0,0,0,0,0,0,15251,0,0,58742784,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,1,7168)]",58720256,FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 1, 2048], [2048, 7168], [1, 1, 
7168]]",1,14696448,1792,1,1,7168,4096,0,15251,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9659843390142792,899.988169292639,0.007602708583725274,0.9999868547695989,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,Fwd-FFN-serving-decoder-FFgate_up,"Mul(a=1x1x7168,b=1x1x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,VPU,16384,Memory,500,3,500,0,0,0,0,0,0,0,0,0,3,0,0,28672,"DT_BFLOAT16:[1,1,7168]","[DT_BFLOAT16:(1,1,7168)]",7168,FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,3,28672,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.014336,53.40576171875,0.003723404255319149,0.05933973524305555,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,Fwd-FFN-serving-decoder-FFoutput,"XlaEinsum(a=1x1x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,16384,Memory,60788,15251,60788,0,0,0,0,0,0,0,0,0,15251,0,0,58742784,"DT_BFLOAT16:[1,1,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,1,4096)]",58720256,FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 1, 3584], [3584, 4096], [1, 1, 4096]]",1,8398848,1792,1,1,4096,7168,0,15251,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9659843390142792,899.988169292639,0.007602708583725274,0.9999868547695989,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +29,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '1', '4096']->['1', '1', 
'2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP29,VPU,16384,ICI/NVLink,3330,1,500,3330,4096,4096,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,2048)]",2048,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP29,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,58,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.000615015015015015,2.291109468843844,0.00015973420228739376,0.0025456771876042708,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +30,FFN-serving-decoder-AttnPlusFFn,"Add(a=1x1x2048,b=1x1x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,8192,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,2048)]",2048,FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,Add,0,[],Add,,,,,0,,,,,0,2,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,58,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,15.2587890625,0.0010638297872340426,0.016954210069444444,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_decode.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_decode.csv new file mode 100644 index 0000000..9e72ccc --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_decode.csv @@ -0,0 +1,22 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor 
Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor 
+16,Attention-serving-decode-Input_layernorm,"LayerNorm(x=1x1x2048,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeInputlayernormXnormLayerNormX,VPU,16384,Memory,500,5,500,0,0,0,0,0,0,0,0,0,5,0,0,8192,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,2048)]",16384,AttentionservingdecodeInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,5,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.032768,15.2587890625,0.00851063829787234,0.016954210069444444,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,AllGatherMHA-17,AllGather(1x1x2048->1x1x4096),AllGatherMHA17AllGather,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,8192,8192,0,0,0,0,0,0,0,0,0,16384,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,4096)]",0,AllGatherMHA17AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,4.582218937687688,0.0,0.0050913543752085415,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,MXU,16384,Memory,17374,4357,17374,0,0,0,0,0,0,0,0,0,4357,0,0,16789504,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,1,16,128)]",16777216,AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 1, 4096], [4096, 16, 128], [1, 1, 16, 128]]",1,4200448,512,1,1,2048,4096,0,4357,16789504,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2030,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9656507424887764,899.9910263495381,0.007600083036844743,0.9999900292772645,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+18,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=2x4096x16x128,eq=BLM;TMND->BTLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,MXU,16384,Memory,34740,8714,34740,0,0,0,0,0,0,0,0,0,8714,0,0,33570816,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[2,4096,16,128]","[DT_BFLOAT16:(1,2,1,16,128)]",33554432,AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,Einsum,33554432,[],Einsum,"BLM,TMND->BTLND","[[1, 1, 2048], [2, 2048, 16, 128], [1, 2, 1, 16, 128]]",1,8398848,1024,1,1,4096,4096,0,8714,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4060,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9658731145653425,899.9786640490069,0.007601833199892951,0.9999762933877854,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,MXU,16384,Memory,17501,4357,17501,0,0,0,0,0,0,0,0,0,4357,0,0,16912384,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,1,4096,16)]",16777216,AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 16, 128], [1, 4096, 16, 128], [1, 1, 4096, 16]]",1,1057024,512,16,1,4096,128,0,4357,16912384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2045,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9586432775269984,899.9991434641234,0.0075449313000480285,0.9999990482934705,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x16x128,b=1x512x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,MXU,16384,Memory,2192,544,2192,0,0,0,0,0,0,0,0,0,544,0,0,2117632,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[1,512,16,128]","[DT_BFLOAT16:(1,1,512,16)]",2097152,AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 16, 128], [1, 512, 16, 128], [1, 1, 512, 16]]",1,132352,64,16,1,512,128,0,544,2117632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9567299270072993,899.7255868285242,0.007529872415724256,0.9996950964761381,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"Softmax(x=1x1x4608x16,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,16384,Memory,500,77,500,0,0,0,0,0,0,0,0,0,77,0,0,294912,"DT_BFLOAT16:[1,1,4608,16]","[DT_BFLOAT16:(1,1,4608,16)]",294912,AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,77,294912,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,77,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.589824,549.31640625,0.15319148936170213,0.6103515625,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,MXU,16384,Memory,17501,4357,17501,0,0,0,0,0,0,0,0,0,4357,0,0,16912384,"DT_BFLOAT16:[1,1,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,1,16,128)]",16777216,AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 4096, 16], [1, 4096, 16, 128], [1, 1, 16, 
128]]",1,264448,512,16,1,128,4096,0,4357,16912384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2045,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9586432775269984,899.9991434641234,0.0075449313000480285,0.9999990482934705,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x512x16,b=1x512x16x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,MXU,16384,Memory,2192,544,2192,0,0,0,0,0,0,0,0,0,544,0,0,2117632,"DT_BFLOAT16:[1,1,512,16],DT_BFLOAT16:[1,512,16,128]","[DT_BFLOAT16:(1,1,16,128)]",2097152,AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 512, 16], [1, 512, 16, 128], [1, 1, 16, 128]]",1,132352,64,16,1,128,512,0,544,2117632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,256,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9567299270072993,899.7255868285242,0.007529872415724256,0.9996950964761381,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"Add(a=1x1x16x128,b=1x1x16x128,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,8192,"DT_BFLOAT16:[1,1,16,128]","[DT_BFLOAT16:(1,1,16,128)]",2048,AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,Add,0,[],Add,,,,,0,,,,,0,2,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,58,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,15.2587890625,0.0010638297872340426,0.016954210069444444,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+20,Attention-serving-decode-Attention_output,"XlaEinsum(a=1x1x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,16384,Memory,17374,4357,17374,0,0,0,0,0,0,0,0,0,4357,0,0,16789504,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,1,4096)]",16777216,AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 1, 16, 128], [16, 128, 4096], [1, 1, 4096]]",1,8398848,512,1,1,4096,2048,0,4357,16789504,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2030,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9656507424887764,899.9910263495381,0.007600083036844743,0.9999900292772645,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Reduce attention and scatter before LN,"ReduceScatter(['1', '1', '4096']->['1', '1', '2048'])",ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter21,VPU,16384,ICI/NVLink,3330,1,500,3330,4096,4096,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,2048)]",2048,ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter21,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,58,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.000615015015015015,2.291109468843844,0.00015973420228739376,0.0025456771876042708,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+22,Attention-serving-decode-Attention_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionlayernormYnormLayerNormy,VPU,16384,Memory,500,9,500,0,0,0,0,0,0,0,0,0,9,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,9,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.01702127659574468,0.03390842013888889,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,AllGatherMHA-23,AllGather(1x1x2048->1x1x4096),AllGatherMHA23AllGather,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,8192,8192,0,0,0,0,0,0,0,0,0,16384,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,4096)]",0,AllGatherMHA23AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,4.582218937687688,0.0,0.0050913543752085415,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,All gather input of MLP before ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP24,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP24,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.010182708750417083,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+25,Fwd-FFN-serving-decoder-FFgate,"XlaEinsum(a=1x1x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,16384,Memory,60788,15251,60788,0,0,0,0,0,0,0,0,0,15251,0,0,58742784,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,1,7168)]",58720256,FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 1, 2048], [2048, 7168], [1, 1, 7168]]",1,14696448,1792,1,1,7168,4096,0,15251,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9659843390142792,899.988169292639,0.007602708583725274,0.9999868547695989,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,Fwd-FFN-serving-decoder-FFup,"XlaEinsum(a=1x1x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,MXU,16384,Memory,60788,15251,60788,0,0,0,0,0,0,0,0,0,15251,0,0,58742784,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,1,7168)]",58720256,FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 1, 2048], [2048, 7168], [1, 1, 7168]]",1,14696448,1792,1,1,7168,4096,0,15251,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9659843390142792,899.988169292639,0.007602708583725274,0.9999868547695989,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+27,Fwd-FFN-serving-decoder-FFgate_up,"Mul(a=1x1x7168,b=1x1x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,VPU,16384,Memory,500,3,500,0,0,0,0,0,0,0,0,0,3,0,0,28672,"DT_BFLOAT16:[1,1,7168]","[DT_BFLOAT16:(1,1,7168)]",7168,FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,3,28672,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.014336,53.40576171875,0.003723404255319149,0.05933973524305555,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,Fwd-FFN-serving-decoder-FFoutput,"XlaEinsum(a=1x1x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,16384,Memory,60788,15251,60788,0,0,0,0,0,0,0,0,0,15251,0,0,58742784,"DT_BFLOAT16:[1,1,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,1,4096)]",58720256,FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 1, 3584], [3584, 4096], [1, 1, 4096]]",1,8398848,1792,1,1,4096,7168,0,15251,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9659843390142792,899.988169292639,0.007602708583725274,0.9999868547695989,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +29,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '1', '4096']->['1', '1', 
'2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP29,VPU,16384,ICI/NVLink,3330,1,500,3330,4096,4096,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,2048)]",2048,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP29,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,58,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.000615015015015015,2.291109468843844,0.00015973420228739376,0.0025456771876042708,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +30,FFN-serving-decoder-AttnPlusFFn,"Add(a=1x1x2048,b=1x1x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,8192,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,2048)]",2048,FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,Add,0,[],Add,,,,,0,,,,,0,2,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,58,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,15.2587890625,0.0010638297872340426,0.016954210069444444,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_decode.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_decode.json new file mode 100644 index 0000000..357882f --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_decode.json @@ -0,0 +1,98 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 5093588992, + "overlapped_compute_time_non_pp_ns": 1197391872, + "compute_only_time_non_pp_ns": 0, + "memory_only_time_non_pp_ns": 3623436288, + "ici_bound_time_non_pp_ns": 272760832, + "total_execution_time_chip_ns": 5093588992, + 
"overlapped_compute_time_chip_ns": 1197391872, + "compute_only_time_chip_ns": 0, + "memory_only_time_chip_ns": 3623436288, + "ici_bound_time_chip_ns": 272760832, + "bounded_by_pp_chip": false, + "TPOT_ms_request": 9.948416, + "throughput_tokens_per_sec": 100.51851470626077, + "throughput_tokens_per_sec_request": 100.51851470626077, + "mem_footprint_GB": 11.28125, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "3", + "num_sa": 4, + "num_vu": 4, + "num_vu_ports": 2, + "hbm_bw_GBps": 900.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 32, + "freq_GHz": 0.94, + "sa_dim": 128, + "hbm_size_GB": 32, + "ici_bw_GBps": 164.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 450.0, + "min_power_W": 175.0, + "avg_power_W": 220.0, + "max_power_W": 262.0, + "HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 40.478, + "ICI_topology": "TORUS_2D", + "embodied_carbon_kgCO2": 311.8333333, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 2.9866666675, + "static_power_W_per_vu": 0.74127482825, + "static_power_vmem_W": 12.93490069, + "static_power_ici_W": 8.96, + "static_power_hbm_mc_W": 4.032, + "static_power_hbm_phy_W": 6.048, + "static_power_other_W": 37.11333333, + "dynamic_power_W_per_SA": 30.28855467, + "dynamic_power_W_per_VU": 2.8491776, + "dynamic_power_vmem_W": 29.830784, + "dynamic_power_ici_W_per_GBps": 0.0247047779, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 2, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 2, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 2, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + 
"pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": true, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_prefill.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_prefill.csv new file mode 100644 index 0000000..53dff3f --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_prefill.csv @@ -0,0 +1,18 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage 
(V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,34723,17430,34723,0,0,0,0,0,0,0,0,0,17430,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,17430,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8415,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9326919908994038,899.9798404515739,0.5019666282879519,0.9999776005017488,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,Concatenate dims before 
attn,AllGather(1x4096x2048->1x4096x4096),ConcatenatedimsbeforeattnAllGatherMHA3,ICINoCompute,32,ICI/NVLink,204601,0,69445,204601,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,ConcatenatedimsbeforeattnAllGatherMHA3,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,305.4726027732025,0.0,0.33941400308133607,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-Q-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",Q4MatMulQ,MXU,32,Compute,557822,557822,69445,0,0,0,0,0,0,0,0,557822,69719,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,Q4MatMulQ,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,557822,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,147571,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.19248207492713,112.04290974540265,0.9695773555471956,0.12449212193933629,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-K-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",K4MatMulK,MXU,32,Compute,557822,557822,69445,0,0,0,0,0,0,0,0,557822,69719,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,K4MatMulK,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,557822,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,147571,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.19248207492713,112.04290974540265,0.9695773555471956,0.12449212193933629,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+4,-V-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",V4MatMulV,MXU,32,Compute,557822,557822,69445,0,0,0,0,0,0,0,0,557822,69719,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,V4MatMulV,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,557822,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,147571,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.19248207492713,112.04290974540265,0.9695773555471956,0.12449212193933629,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,-FlashAttention-5,"FlashAttention(q=1x4096x16x128,k=1x4096x16x128,v=1x4096x16x128,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",FlashAttention5FlashAttention,MXU,32,Compute,1115644,1115644,69445,0,0,0,0,0,0,0,0,1115644,418314,0,0,67108864,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,16)]",18253611008,FlashAttention5FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,4227072,32768,16,4096,4096,128,278876,1115644,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,287027,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,16.36150152557626,56.02145487270133,0.1287719925336119,0.06224606096966814,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,-Attention_output-6,"XlaEinsum(a=1x4096x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",Attentionoutput6MatMulattnOutputattnAvgWo,MXU,32,Compute,557822,557822,86806,0,0,0,0,0,0,0,0,557822,69719,0,0,83886080,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",68719476736,Attentionoutput6MatMulattnOutputattnAvgWo,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 64], [16, 64, 2048], [1, 4096, 
2048]]",2,50331648,16384,1,4096,4096,2048,0,557822,83886080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,149601,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.19248207492713,140.05363718175332,0.9695773555471956,0.15561515242417034,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Reduce and split attention results,"ReduceScatter(['1', '4096', '4096']->['1', '4096', '2048'])",ReduceandsplitattentionresultsAllReduceReduceScatter7,VPU,32,ICI/NVLink,102301,2048,34723,102301,16777216,16777216,0,0,0,0,0,0,2048,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandsplitattentionresultsAllReduceReduceScatter7,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,2048,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4570,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.08199927664441208,305.47110976432293,0.021297185797356028,0.33941234418258104,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,-Attention_layernorm-8,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",Attentionlayernorm8YnormLayerNormy,VPU,32,Memory,34723,17430,34723,0,0,0,0,0,0,0,0,0,17430,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,Attentionlayernorm8YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,17430,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8415,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9326919908994038,899.9798404515739,0.5019666282879519,0.9999776005017488,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Gather results after 
Layernorm,AllGather(1x4096x2048->1x4096x4096),GatherresultsafterLayernormAllReduceAllGather8,ICINoCompute,32,ICI/NVLink,204601,0,69445,204601,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,GatherresultsafterLayernormAllReduceAllGather8,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,305.4726027732025,0.0,0.33941400308133607,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,All gather input of MLP before ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP9,ICINoCompute,32,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP9,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.010182708750417083,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,1952205,1952205,190973,0,0,0,0,0,0,0,0,1952205,244017,0,0,184549376,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 3584]]",2,81788928,57344,1,4096,7168,4096,0,1952205,184549376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,510371,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.2033360103063,88.04147105452553,0.9696627806460731,0.0978238567272506,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+11,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,1952205,1952205,190973,0,0,0,0,0,0,0,0,1952205,244017,0,0,184549376,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 3584]]",2,81788928,57344,1,4096,7168,4096,0,1952205,184549376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,510371,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.2033360103063,88.04147105452553,0.9696627806460731,0.0978238567272506,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +12,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x7168,b=1x4096x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,121528,7626,121528,0,0,0,0,0,0,0,0,0,7626,0,0,117440512,"DT_BFLOAT16:[1,4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",29360128,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,7626,117440512,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24159146863274308,899.9983542887236,0.06274711930496361,0.999998171431915,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,1952205,1952205,217014,0,0,0,0,0,0,0,0,1952205,244017,0,0,209715200,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1024], [1024, 2048], [1, 4096, 
2048]]",2,50331648,57344,1,4096,4096,7168,0,1952205,209715200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,513414,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.2033360103063,100.04712619832446,0.9696627806460731,0.11116347355369385,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '4096', '4096']->['1', '4096', '2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP14,VPU,32,ICI/NVLink,102301,2048,34723,102301,16777216,16777216,0,0,0,0,0,0,2048,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP14,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,2048,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4570,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.08199927664441208,305.47110976432293,0.021297185797356028,0.33941234418258104,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x2048,b=1x4096x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,34723,2179,34723,0,0,0,0,0,0,0,0,0,2179,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",8388608,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,2179,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4603,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24158649886242547,899.9798404515739,0.06274582853599399,0.9999776005017488,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_prefill.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_prefill.json new file mode 100644 index 0000000..6cb38ac --- /dev/null 
+++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3_prefill.json @@ -0,0 +1,97 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 321484096, + "overlapped_compute_time_non_pp_ns": 32393824, + "compute_only_time_non_pp_ns": 263680032, + "memory_only_time_non_pp_ns": 5793024, + "ici_bound_time_non_pp_ns": 19617216, + "total_execution_time_chip_ns": 321484096, + "overlapped_compute_time_chip_ns": 32393824, + "compute_only_time_chip_ns": 263680032, + "memory_only_time_chip_ns": 5793024, + "ici_bound_time_chip_ns": 19617216, + "bounded_by_pp_chip": false, + "throughput_tokens_per_sec": 12740.910206643628, + "TTFT_sec": 0.321484096, + "mem_footprint_GB": 10.75, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "3", + "num_sa": 4, + "num_vu": 4, + "num_vu_ports": 2, + "hbm_bw_GBps": 900.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 32, + "freq_GHz": 0.94, + "sa_dim": 128, + "hbm_size_GB": 32, + "ici_bw_GBps": 164.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 450.0, + "min_power_W": 175.0, + "avg_power_W": 220.0, + "max_power_W": 262.0, + "HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 40.478, + "ICI_topology": "TORUS_2D", + "embodied_carbon_kgCO2": 311.8333333, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 2.9866666675, + "static_power_W_per_vu": 0.74127482825, + "static_power_vmem_W": 12.93490069, + "static_power_ici_W": 8.96, + "static_power_hbm_mc_W": 4.032, + "static_power_hbm_phy_W": 6.048, + "static_power_other_W": 37.11333333, + "dynamic_power_W_per_SA": 30.28855467, + "dynamic_power_W_per_VU": 2.8491776, + "dynamic_power_vmem_W": 29.830784, + "dynamic_power_ici_W_per_GBps": 0.0247047779, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + 
"pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 2, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 2, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 2, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v3.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": true, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv new file mode 100644 index 0000000..7ba8b5a --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv @@ -0,0 +1,39 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output 
Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor 
+2,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,26042,15604,26042,0,0,0,0,0,0,0,0,0,15604,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,15604,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5717,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.576947392673374,1199.9846401966056,0.5991786162279981,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,Concatenate dims before attn,AllGather(1x4096x2048->1x4096x4096),ConcatenatedimsbeforeattnAllGatherMHA3,ICINoCompute,32,ICI/NVLink,299594,0,52084,299594,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,ConcatenatedimsbeforeattnAllGatherMHA3,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,208.61565985967675,0.0,0.17384638321639728,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-Q-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",Q4MatMulQ,MXU,32,Compute,249692,249692,52084,0,0,0,0,0,0,0,0,249692,62415,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,Q4MatMulQ,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,249692,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66056,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2169742562838,250.30837992406646,0.9844929539272973,0.20859031660338873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+4,-K-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",K4MatMulK,MXU,32,Compute,249692,249692,52084,0,0,0,0,0,0,0,0,249692,62415,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,K4MatMulK,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,249692,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66056,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2169742562838,250.30837992406646,0.9844929539272973,0.20859031660338873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-V-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",V4MatMulV,MXU,32,Compute,249692,249692,52084,0,0,0,0,0,0,0,0,249692,62415,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,V4MatMulV,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,249692,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66056,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2169742562838,250.30837992406646,0.9844929539272973,0.20859031660338873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,-FlashAttention-5,"FlashAttention(q=1x4096x16x128,k=1x4096x16x128,v=1x4096x16x128,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",FlashAttention5FlashAttention,MXU,32,Compute,499384,499384,52084,0,0,0,0,0,0,0,0,499384,374490,0,0,67108864,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,16)]",18253611008,FlashAttention5FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 
128]",,4227072,32768,16,4096,4096,128,249660,499384,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,128479,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,36.55225439341269,125.15418996203323,0.13075297044346917,0.10429515830169436,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,-Attention_output-6,"XlaEinsum(a=1x4096x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",Attentionoutput6MatMulattnOutputattnAvgWo,MXU,32,Compute,249692,249692,52084,0,0,0,0,0,0,0,0,249692,62415,0,0,67108864,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",68719476736,Attentionoutput6MatMulattnOutputattnAvgWo,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 128], [16, 128, 4096], [1, 4096, 4096]]",1,50331648,16384,1,4096,4096,2048,0,249692,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66056,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2169742562838,250.30837992406646,0.9844929539272973,0.20859031660338873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Reduce and split attention results,"ReduceScatter(['1', '4096', '4096']->['1', '4096', '2048'])",ReduceandsplitattentionresultsAllReduceReduceScatter7,VPU,32,ICI/NVLink,149797,2048,26042,149797,16777216,16777216,0,0,0,0,0,0,2048,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandsplitattentionresultsAllReduceReduceScatter7,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,2048,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.05599983978317322,208.61565985967675,0.013020796080536929,0.17384638321639728,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+8,-Attention_layernorm-8,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",Attentionlayernorm8YnormLayerNormy,VPU,32,Memory,26042,15604,26042,0,0,0,0,0,0,0,0,0,15604,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,Attentionlayernorm8YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,15604,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5717,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.576947392673374,1199.9846401966056,0.5991786162279981,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Gather results after Layernorm,AllGather(1x4096x2048->1x4096x4096),GatherresultsafterLayernormAllReduceAllGather8,ICINoCompute,32,ICI/NVLink,299594,0,52084,299594,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,GatherresultsafterLayernormAllReduceAllGather8,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,208.61565985967675,0.0,0.17384638321639728,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,All gather input of MLP before ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP9,ICINoCompute,32,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP9,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.007637031562812813,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+10,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,873844,873844,117188,0,0,0,0,0,0,0,0,873844,218453,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 2048], [2048, 7168], [1, 4096, 7168]]",1,81788928,57344,1,4096,7168,4096,0,873844,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,226635,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2415403390079,160.92689312966615,0.984580830539606,0.1341057442747218,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +11,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,873844,873844,117188,0,0,0,0,0,0,0,0,873844,218453,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 2048], [2048, 7168], [1, 4096, 7168]]",1,81788928,57344,1,4096,7168,4096,0,873844,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,226635,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2415403390079,160.92689312966615,0.984580830539606,0.1341057442747218,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+12,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x7168,b=1x4096x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,91146,6827,91146,0,0,0,0,0,0,0,0,0,6827,0,0,117440512,"DT_BFLOAT16:[1,4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",29360128,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,6827,117440512,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8064,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.3221219581769908,1199.9978057182982,0.07489814875767085,0.9999981714319152,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,873844,873844,117188,0,0,0,0,0,0,0,0,873844,218453,0,0,150994944,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 3584], [3584, 4096], [1, 4096, 4096]]",1,50331648,57344,1,4096,4096,7168,0,873844,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,226635,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2415403390079,160.92689312966615,0.984580830539606,0.1341057442747218,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '4096', '4096']->['1', '4096', 
'2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP14,VPU,32,ICI/NVLink,149797,2048,26042,149797,16777216,16777216,0,0,0,0,0,0,2048,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP14,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,2048,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.05599983978317322,208.61565985967675,0.013020796080536929,0.17384638321639728,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x2048,b=1x4096x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,26042,1951,26042,0,0,0,0,0,0,0,0,0,1951,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",8388608,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,1951,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2304,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32211842408417174,1199.9846401966056,0.07489732702849976,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +16,Attention-serving-decode-Input_layernorm,"LayerNorm(x=1x1x2048,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeInputlayernormXnormLayerNormX,VPU,16384,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,8192,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,2048)]",16384,AttentionservingdecodeInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,4,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.032768,15.2587890625,0.007619047619047619,0.012715657552083334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+17,AllGatherMHA-17,AllGather(1x1x2048->1x1x4096),AllGatherMHA17AllGather,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,8192,8192,0,0,0,0,0,0,0,0,0,16384,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,4096)]",0,AllGatherMHA17AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,4.582218937687688,0.0,0.0038185157814064064,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,MXU,16384,Memory,13031,7833,13031,0,0,0,0,0,0,0,0,7833,1950,0,0,16789504,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,1,16,128)]",16777216,AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 1, 4096], [4096, 16, 128], [1, 1, 16, 128]]",1,4200448,512,1,1,2048,4096,0,7833,16789504,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.287484920574016,1199.9419915430033,0.0046055292774654305,0.9999516596191694,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=2x4096x16x128,eq=BLM;TMND->BTLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,MXU,16384,Memory,26055,15635,26055,0,0,0,0,0,0,0,0,15635,3900,0,0,33570816,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[2,4096,16,128]","[DT_BFLOAT16:(1,2,1,16,128)]",33554432,AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,Einsum,33554432,[],Einsum,"BLM,TMND->BTLND","[[1, 1, 4096], [2, 4096, 16, 128], [1, 2, 1, 16, 
128]]",1,8398848,1024,1,1,4096,4096,0,15635,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5726,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2878308194204569,1199.9715520653426,0.004606766610220843,0.9999762933877855,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,MXU,16384,Memory,13126,7833,13126,0,0,0,0,0,0,0,0,7833,1950,0,0,16912384,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,1,4096,16)]",16777216,AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 16, 128], [1, 4096, 16, 128], [1, 1, 4096, 16]]",1,1057024,512,16,1,4096,128,0,7833,16912384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2873,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.278166692061557,1199.976002572423,0.004572196557569101,0.9999800021436859,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x16x128,b=1x512x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,MXU,16384,Memory,1644,1006,1644,0,0,0,0,0,0,0,0,1006,243,0,0,2117632,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[1,512,16,128]","[DT_BFLOAT16:(1,1,512,16)]",2097152,AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 16, 128], [1, 512, 16, 128], [1, 1, 512, 16]]",1,132352,64,16,1,512,128,0,1006,2117632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,366,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.275639902676399,1199.6341157713655,0.004563157847829381,0.9996950964761379,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+19,Attention-serving-decode-Softmax(Q*K)*V,"Softmax(x=1x1x4608x16,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,16384,Memory,500,69,500,0,0,0,0,0,0,0,0,0,69,0,0,294912,"DT_BFLOAT16:[1,1,4608,16]","[DT_BFLOAT16:(1,1,4608,16)]",294912,AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,69,294912,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.589824,549.31640625,0.13714285714285715,0.457763671875,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,MXU,16384,Memory,13126,7833,13126,0,0,0,0,0,0,0,0,7833,1950,0,0,16912384,"DT_BFLOAT16:[1,1,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,1,16,128)]",16777216,AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 4096, 16], [1, 4096, 16, 128], [1, 1, 16, 128]]",1,264448,512,16,1,128,4096,0,7833,16912384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2873,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.278166692061557,1199.976002572423,0.004572196557569101,0.9999800021436859,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x512x16,b=1x512x16x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,MXU,16384,Memory,1644,1006,1644,0,0,0,0,0,0,0,0,1006,243,0,0,2117632,"DT_BFLOAT16:[1,1,512,16],DT_BFLOAT16:[1,512,16,128]","[DT_BFLOAT16:(1,1,16,128)]",2097152,AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 512, 16], [1, 
512, 16, 128], [1, 1, 16, 128]]",1,132352,64,16,1,128,512,0,1006,2117632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,366,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.275639902676399,1199.6341157713655,0.004563157847829381,0.9996950964761379,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"Add(a=1x1x16x128,b=1x1x16x128,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,16,128]","[DT_BFLOAT16:(1,1,16,128)]",2048,AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,Add,0,[],Add,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,15.2587890625,0.0009523809523809524,0.012715657552083334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Attention-serving-decode-Attention_output,"XlaEinsum(a=1x1x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,16384,Memory,13031,7833,13031,0,0,0,0,0,0,0,0,7833,1950,0,0,16789504,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,1,4096)]",16777216,AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 1, 16, 128], [16, 128, 4096], [1, 1, 4096]]",1,8398848,512,1,1,4096,2048,0,7833,16789504,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.287484920574016,1199.9419915430033,0.0046055292774654305,0.9999516596191694,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Reduce attention and scatter before LN,"ReduceScatter(['1', '1', '4096']->['1', '1', 
'2048'])",ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter21,VPU,16384,ICI/NVLink,3330,1,500,3330,4096,4096,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,2048)]",2048,ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter21,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.000615015015015015,2.291109468843844,0.00014300014300014298,0.0019092578907032032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,Attention-serving-decode-Attention_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionlayernormYnormLayerNormy,VPU,16384,Memory,500,8,500,0,0,0,0,0,0,0,0,0,8,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,8,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,36,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.015238095238095238,0.025431315104166668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,AllGatherMHA-23,AllGather(1x1x2048->1x1x4096),AllGatherMHA23AllGather,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,8192,8192,0,0,0,0,0,0,0,0,0,16384,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,4096)]",0,AllGatherMHA23AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,4.582218937687688,0.0,0.0038185157814064064,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,All gather input of MLP before 
ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP24,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP24,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.007637031562812813,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +25,Fwd-FFN-serving-decoder-FFgate,"XlaEinsum(a=1x1x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,16384,Memory,45591,27338,45591,0,0,0,0,0,0,0,0,27338,6826,0,0,58742784,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,1,7168)]",58720256,FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 7168], [1, 1, 7168]]",1,14696448,1792,1,1,7168,4096,0,27338,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2879791186857055,1199.9842257235186,0.004607297099236298,0.9999868547695989,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,Fwd-FFN-serving-decoder-FFup,"XlaEinsum(a=1x1x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,MXU,16384,Memory,45591,27338,45591,0,0,0,0,0,0,0,0,27338,6826,0,0,58742784,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,1,7168)]",58720256,FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 7168], [1, 1, 
7168]]",1,14696448,1792,1,1,7168,4096,0,27338,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2879791186857055,1199.9842257235186,0.004607297099236298,0.9999868547695989,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,Fwd-FFN-serving-decoder-FFgate_up,"Mul(a=1x1x7168,b=1x1x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,28672,"DT_BFLOAT16:[1,1,7168]","[DT_BFLOAT16:(1,1,7168)]",7168,FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,2,28672,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.014336,53.40576171875,0.0033333333333333335,0.044504801432291664,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,Fwd-FFN-serving-decoder-FFoutput,"XlaEinsum(a=1x1x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,16384,Memory,45591,27338,45591,0,0,0,0,0,0,0,0,27338,6826,0,0,58742784,"DT_BFLOAT16:[1,1,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,1,4096)]",58720256,FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 1, 7168], [7168, 4096], [1, 1, 4096]]",1,8398848,1792,1,1,4096,7168,0,27338,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2879791186857055,1199.9842257235186,0.004607297099236298,0.9999868547695989,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +29,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '1', '4096']->['1', '1', 
'2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP29,VPU,16384,ICI/NVLink,3330,1,500,3330,4096,4096,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,2048)]",2048,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP29,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.000615015015015015,2.291109468843844,0.00014300014300014298,0.0019092578907032032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +30,FFN-serving-decoder-AttnPlusFFn,"Add(a=1x1x2048,b=1x1x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,2048)]",2048,FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,Add,0,[],Add,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,15.2587890625,0.0009523809523809524,0.012715657552083334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_decode.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_decode.csv new file mode 100644 index 0000000..06f39b7 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_decode.csv @@ -0,0 +1,22 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor 
Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor 
+16,Attention-serving-decode-Input_layernorm,"LayerNorm(x=1x1x2048,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeInputlayernormXnormLayerNormX,VPU,16384,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,8192,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,2048)]",16384,AttentionservingdecodeInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,4,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.032768,15.2587890625,0.007619047619047619,0.012715657552083334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,AllGatherMHA-17,AllGather(1x1x2048->1x1x4096),AllGatherMHA17AllGather,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,8192,8192,0,0,0,0,0,0,0,0,0,16384,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,4096)]",0,AllGatherMHA17AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,4.582218937687688,0.0,0.0038185157814064064,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,MXU,16384,Memory,13031,7833,13031,0,0,0,0,0,0,0,0,7833,1950,0,0,16789504,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,1,16,128)]",16777216,AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 1, 4096], [4096, 16, 128], [1, 1, 16, 128]]",1,4200448,512,1,1,2048,4096,0,7833,16789504,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.287484920574016,1199.9419915430033,0.0046055292774654305,0.9999516596191694,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+18,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=2x4096x16x128,eq=BLM;TMND->BTLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,MXU,16384,Memory,26055,15635,26055,0,0,0,0,0,0,0,0,15635,3900,0,0,33570816,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[2,4096,16,128]","[DT_BFLOAT16:(1,2,1,16,128)]",33554432,AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,Einsum,33554432,[],Einsum,"BLM,TMND->BTLND","[[1, 1, 4096], [2, 4096, 16, 128], [1, 2, 1, 16, 128]]",1,8398848,1024,1,1,4096,4096,0,15635,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5726,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2878308194204569,1199.9715520653426,0.004606766610220843,0.9999762933877855,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,MXU,16384,Memory,13126,7833,13126,0,0,0,0,0,0,0,0,7833,1950,0,0,16912384,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,1,4096,16)]",16777216,AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 16, 128], [1, 4096, 16, 128], [1, 1, 4096, 16]]",1,1057024,512,16,1,4096,128,0,7833,16912384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2873,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.278166692061557,1199.976002572423,0.004572196557569101,0.9999800021436859,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x16x128,b=1x512x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,MXU,16384,Memory,1644,1006,1644,0,0,0,0,0,0,0,0,1006,243,0,0,2117632,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[1,512,16,128]","[DT_BFLOAT16:(1,1,512,16)]",2097152,AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 16, 128], [1, 512, 16, 128], [1, 1, 512, 16]]",1,132352,64,16,1,512,128,0,1006,2117632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,366,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.275639902676399,1199.6341157713655,0.004563157847829381,0.9996950964761379,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"Softmax(x=1x1x4608x16,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,16384,Memory,500,69,500,0,0,0,0,0,0,0,0,0,69,0,0,294912,"DT_BFLOAT16:[1,1,4608,16]","[DT_BFLOAT16:(1,1,4608,16)]",294912,AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,69,294912,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,52,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.589824,549.31640625,0.13714285714285715,0.457763671875,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,MXU,16384,Memory,13126,7833,13126,0,0,0,0,0,0,0,0,7833,1950,0,0,16912384,"DT_BFLOAT16:[1,1,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,1,16,128)]",16777216,AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 4096, 16], [1, 4096, 16, 128], [1, 1, 16, 
128]]",1,264448,512,16,1,128,4096,0,7833,16912384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2873,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.278166692061557,1199.976002572423,0.004572196557569101,0.9999800021436859,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x512x16,b=1x512x16x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,MXU,16384,Memory,1644,1006,1644,0,0,0,0,0,0,0,0,1006,243,0,0,2117632,"DT_BFLOAT16:[1,1,512,16],DT_BFLOAT16:[1,512,16,128]","[DT_BFLOAT16:(1,1,16,128)]",2097152,AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 512, 16], [1, 512, 16, 128], [1, 1, 16, 128]]",1,132352,64,16,1,128,512,0,1006,2117632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,366,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.275639902676399,1199.6341157713655,0.004563157847829381,0.9996950964761379,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"Add(a=1x1x16x128,b=1x1x16x128,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,16,128]","[DT_BFLOAT16:(1,1,16,128)]",2048,AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,Add,0,[],Add,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,15.2587890625,0.0009523809523809524,0.012715657552083334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+20,Attention-serving-decode-Attention_output,"XlaEinsum(a=1x1x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,16384,Memory,13031,7833,13031,0,0,0,0,0,0,0,0,7833,1950,0,0,16789504,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,1,4096)]",16777216,AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 1, 16, 128], [16, 128, 4096], [1, 1, 4096]]",1,8398848,512,1,1,4096,2048,0,7833,16789504,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.287484920574016,1199.9419915430033,0.0046055292774654305,0.9999516596191694,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Reduce attention and scatter before LN,"ReduceScatter(['1', '1', '4096']->['1', '1', '2048'])",ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter21,VPU,16384,ICI/NVLink,3330,1,500,3330,4096,4096,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,2048)]",2048,ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter21,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.000615015015015015,2.291109468843844,0.00014300014300014298,0.0019092578907032032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+22,Attention-serving-decode-Attention_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionlayernormYnormLayerNormy,VPU,16384,Memory,500,8,500,0,0,0,0,0,0,0,0,0,8,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,8,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,36,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.015238095238095238,0.025431315104166668,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,AllGatherMHA-23,AllGather(1x1x2048->1x1x4096),AllGatherMHA23AllGather,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,8192,8192,0,0,0,0,0,0,0,0,0,16384,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,4096)]",0,AllGatherMHA23AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,4.582218937687688,0.0,0.0038185157814064064,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,All gather input of MLP before ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP24,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP24,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.007637031562812813,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+25,Fwd-FFN-serving-decoder-FFgate,"XlaEinsum(a=1x1x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,16384,Memory,45591,27338,45591,0,0,0,0,0,0,0,0,27338,6826,0,0,58742784,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,1,7168)]",58720256,FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 7168], [1, 1, 7168]]",1,14696448,1792,1,1,7168,4096,0,27338,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2879791186857055,1199.9842257235186,0.004607297099236298,0.9999868547695989,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,Fwd-FFN-serving-decoder-FFup,"XlaEinsum(a=1x1x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,MXU,16384,Memory,45591,27338,45591,0,0,0,0,0,0,0,0,27338,6826,0,0,58742784,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,1,7168)]",58720256,FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 7168], [1, 1, 7168]]",1,14696448,1792,1,1,7168,4096,0,27338,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2879791186857055,1199.9842257235186,0.004607297099236298,0.9999868547695989,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+27,Fwd-FFN-serving-decoder-FFgate_up,"Mul(a=1x1x7168,b=1x1x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,28672,"DT_BFLOAT16:[1,1,7168]","[DT_BFLOAT16:(1,1,7168)]",7168,FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,2,28672,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.014336,53.40576171875,0.0033333333333333335,0.044504801432291664,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,Fwd-FFN-serving-decoder-FFoutput,"XlaEinsum(a=1x1x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,16384,Memory,45591,27338,45591,0,0,0,0,0,0,0,0,27338,6826,0,0,58742784,"DT_BFLOAT16:[1,1,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,1,4096)]",58720256,FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 1, 7168], [7168, 4096], [1, 1, 4096]]",1,8398848,1792,1,1,4096,7168,0,27338,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,10014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2879791186857055,1199.9842257235186,0.004607297099236298,0.9999868547695989,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +29,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '1', '4096']->['1', '1', 
'2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP29,VPU,16384,ICI/NVLink,3330,1,500,3330,4096,4096,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,2048)]",2048,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP29,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.000615015015015015,2.291109468843844,0.00014300014300014298,0.0019092578907032032,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +30,FFN-serving-decoder-AttnPlusFFn,"Add(a=1x1x2048,b=1x1x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,2048)]",2048,FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,Add,0,[],Add,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,15.2587890625,0.0009523809523809524,0.012715657552083334,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_decode.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_decode.json new file mode 100644 index 0000000..aa629d0 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_decode.json @@ -0,0 +1,98 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 3900702720, + "overlapped_compute_time_non_pp_ns": 2147614720, + "compute_only_time_non_pp_ns": 0, + "memory_only_time_non_pp_ns": 1480327168, + "ici_bound_time_non_pp_ns": 272760832, + "total_execution_time_chip_ns": 3900702720, + 
"overlapped_compute_time_chip_ns": 2147614720, + "compute_only_time_chip_ns": 0, + "memory_only_time_chip_ns": 1480327168, + "ici_bound_time_chip_ns": 272760832, + "bounded_by_pp_chip": false, + "TPOT_ms_request": 7.61856, + "throughput_tokens_per_sec": 131.2584005376344, + "throughput_tokens_per_sec_request": 131.2584005376344, + "mem_footprint_GB": 11.28125, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "4", + "num_sa": 8, + "num_vu": 4, + "num_vu_ports": 4, + "hbm_bw_GBps": 1200.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 128, + "freq_GHz": 1.05, + "sa_dim": 128, + "hbm_size_GB": 32, + "ici_bw_GBps": 112.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 300.0, + "min_power_W": 121.0, + "avg_power_W": 170.0, + "max_power_W": 192.0, + "HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 366.0, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 1.222, + "static_power_W_per_vu": 0.427282, + "static_power_vmem_W": 21.777552, + "static_power_ici_W": 5.499, + "static_power_hbm_mc_W": 4.006409544, + "static_power_hbm_phy_W": 6.009614316, + "static_power_other_W": 41.22229614, + "dynamic_power_W_per_SA": 16.91648, + "dynamic_power_W_per_VU": 1.591296, + "dynamic_power_vmem_W": 30.110208, + "dynamic_power_ici_W_per_GBps": 0.01767315271, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 2, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 2, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 3, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 
1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": true, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_prefill.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_prefill.csv new file mode 100644 index 0000000..d713341 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_prefill.csv @@ -0,0 +1,18 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency 
(GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,26042,15604,26042,0,0,0,0,0,0,0,0,0,15604,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,15604,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5717,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.576947392673374,1199.9846401966056,0.5991786162279981,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,Concatenate dims before 
attn,AllGather(1x4096x2048->1x4096x4096),ConcatenatedimsbeforeattnAllGatherMHA3,ICINoCompute,32,ICI/NVLink,299594,0,52084,299594,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,ConcatenatedimsbeforeattnAllGatherMHA3,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,208.61565985967675,0.0,0.17384638321639728,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-Q-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",Q4MatMulQ,MXU,32,Compute,249692,249692,52084,0,0,0,0,0,0,0,0,249692,62415,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,Q4MatMulQ,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,249692,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66056,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2169742562838,250.30837992406646,0.9844929539272973,0.20859031660338873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-K-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",K4MatMulK,MXU,32,Compute,249692,249692,52084,0,0,0,0,0,0,0,0,249692,62415,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,K4MatMulK,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,249692,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66056,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2169742562838,250.30837992406646,0.9844929539272973,0.20859031660338873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+4,-V-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",V4MatMulV,MXU,32,Compute,249692,249692,52084,0,0,0,0,0,0,0,0,249692,62415,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,V4MatMulV,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,249692,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66056,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2169742562838,250.30837992406646,0.9844929539272973,0.20859031660338873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,-FlashAttention-5,"FlashAttention(q=1x4096x16x128,k=1x4096x16x128,v=1x4096x16x128,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",FlashAttention5FlashAttention,MXU,32,Compute,499384,499384,52084,0,0,0,0,0,0,0,0,499384,374490,0,0,67108864,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,16)]",18253611008,FlashAttention5FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,4227072,32768,16,4096,4096,128,249660,499384,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,128479,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,36.55225439341269,125.15418996203323,0.13075297044346917,0.10429515830169436,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,-Attention_output-6,"XlaEinsum(a=1x4096x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",Attentionoutput6MatMulattnOutputattnAvgWo,MXU,32,Compute,249692,249692,52084,0,0,0,0,0,0,0,0,249692,62415,0,0,67108864,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",68719476736,Attentionoutput6MatMulattnOutputattnAvgWo,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 128], [16, 128, 4096], [1, 4096, 
4096]]",1,50331648,16384,1,4096,4096,2048,0,249692,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66056,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2169742562838,250.30837992406646,0.9844929539272973,0.20859031660338873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Reduce and split attention results,"ReduceScatter(['1', '4096', '4096']->['1', '4096', '2048'])",ReduceandsplitattentionresultsAllReduceReduceScatter7,VPU,32,ICI/NVLink,149797,2048,26042,149797,16777216,16777216,0,0,0,0,0,0,2048,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandsplitattentionresultsAllReduceReduceScatter7,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,2048,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.05599983978317322,208.61565985967675,0.013020796080536929,0.17384638321639728,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,-Attention_layernorm-8,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",Attentionlayernorm8YnormLayerNormy,VPU,32,Memory,26042,15604,26042,0,0,0,0,0,0,0,0,0,15604,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,Attentionlayernorm8YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,15604,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5717,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.576947392673374,1199.9846401966056,0.5991786162279981,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Gather results after 
Layernorm,AllGather(1x4096x2048->1x4096x4096),GatherresultsafterLayernormAllReduceAllGather8,ICINoCompute,32,ICI/NVLink,299594,0,52084,299594,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,GatherresultsafterLayernormAllReduceAllGather8,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,208.61565985967675,0.0,0.17384638321639728,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,All gather input of MLP before ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP9,ICINoCompute,32,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP9,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.007637031562812813,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,873844,873844,117188,0,0,0,0,0,0,0,0,873844,218453,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 2048], [2048, 7168], [1, 4096, 7168]]",1,81788928,57344,1,4096,7168,4096,0,873844,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,226635,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2415403390079,160.92689312966615,0.984580830539606,0.1341057442747218,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+11,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,873844,873844,117188,0,0,0,0,0,0,0,0,873844,218453,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 2048], [2048, 7168], [1, 4096, 7168]]",1,81788928,57344,1,4096,7168,4096,0,873844,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,226635,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2415403390079,160.92689312966615,0.984580830539606,0.1341057442747218,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +12,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x7168,b=1x4096x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,91146,6827,91146,0,0,0,0,0,0,0,0,0,6827,0,0,117440512,"DT_BFLOAT16:[1,4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",29360128,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,6827,117440512,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8064,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.3221219581769908,1199.9978057182982,0.07489814875767085,0.9999981714319152,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,873844,873844,117188,0,0,0,0,0,0,0,0,873844,218453,0,0,150994944,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 3584], [3584, 4096], [1, 4096, 
4096]]",1,50331648,57344,1,4096,4096,7168,0,873844,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,226635,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2415403390079,160.92689312966615,0.984580830539606,0.1341057442747218,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '4096', '4096']->['1', '4096', '2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP14,VPU,32,ICI/NVLink,149797,2048,26042,149797,16777216,16777216,0,0,0,0,0,0,2048,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP14,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,2048,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.05599983978317322,208.61565985967675,0.013020796080536929,0.17384638321639728,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x2048,b=1x4096x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,26042,1951,26042,0,0,0,0,0,0,0,0,0,1951,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",8388608,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,1951,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2304,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32211842408417174,1199.9846401966056,0.07489732702849976,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_prefill.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_prefill.json new file mode 100644 index 0000000..12b85a6 --- /dev/null +++ 
b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4_prefill.json @@ -0,0 +1,97 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 166114176, + "overlapped_compute_time_non_pp_ns": 20994112, + "compute_only_time_non_pp_ns": 112246400, + "memory_only_time_non_pp_ns": 4137152, + "ici_bound_time_non_pp_ns": 28736512, + "total_execution_time_chip_ns": 166114176, + "overlapped_compute_time_chip_ns": 20994112, + "compute_only_time_chip_ns": 112246400, + "memory_only_time_chip_ns": 4137152, + "ici_bound_time_chip_ns": 28736512, + "bounded_by_pp_chip": false, + "throughput_tokens_per_sec": 24657.739024031278, + "TTFT_sec": 0.166114176, + "mem_footprint_GB": 10.75, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "4", + "num_sa": 8, + "num_vu": 4, + "num_vu_ports": 4, + "hbm_bw_GBps": 1200.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 128, + "freq_GHz": 1.05, + "sa_dim": 128, + "hbm_size_GB": 32, + "ici_bw_GBps": 112.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 300.0, + "min_power_W": 121.0, + "avg_power_W": 170.0, + "max_power_W": 192.0, + "HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 366.0, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 1.222, + "static_power_W_per_vu": 0.427282, + "static_power_vmem_W": 21.777552, + "static_power_ici_W": 5.499, + "static_power_hbm_mc_W": 4.006409544, + "static_power_hbm_phy_W": 6.009614316, + "static_power_other_W": 41.22229614, + "dynamic_power_W_per_SA": 16.91648, + "dynamic_power_W_per_VU": 1.591296, + "dynamic_power_vmem_W": 30.110208, + "dynamic_power_ici_W_per_GBps": 0.01767315271, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", 
+ "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 2, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 2, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 3, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v4.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": true, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv new file mode 100644 index 0000000..8709491 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv @@ -0,0 +1,39 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output 
Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor 
+2,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,11302,6426,11302,0,0,0,0,0,0,0,0,0,6426,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,6426,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2354,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.937786586444877,2764.997345602548,0.5684921287573602,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,Concatenate dims before attn,AllGather(1x4096x2048->1x4096x4096),ConcatenatedimsbeforeattnAllGatherMHA3,ICINoCompute,32,ICI/NVLink,167773,0,22604,167773,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,ConcatenatedimsbeforeattnAllGatherMHA3,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,372.5271646808485,0.0,0.13472953514678065,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-Q-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",Q4MatMulQ,MXU,32,Compute,154222,154222,22604,0,0,0,0,0,0,0,0,154222,25700,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,Q4MatMulQ,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,154222,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,27199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.5880272334686,405.259949942291,0.9769747594189139,0.14656779383084664,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+4,-K-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",K4MatMulK,MXU,32,Compute,154222,154222,22604,0,0,0,0,0,0,0,0,154222,25700,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,K4MatMulK,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,154222,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,27199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.5880272334686,405.259949942291,0.9769747594189139,0.14656779383084664,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-V-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",V4MatMulV,MXU,32,Compute,154222,154222,22604,0,0,0,0,0,0,0,0,154222,25700,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,V4MatMulV,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,154222,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,27199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.5880272334686,405.259949942291,0.9769747594189139,0.14656779383084664,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,-FlashAttention-5,"FlashAttention(q=1x4096x16x128,k=1x4096x16x128,v=1x4096x16x128,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",FlashAttention5FlashAttention,MXU,32,Compute,308444,308444,22604,0,0,0,0,0,0,0,0,308444,154201,0,0,67108864,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,16)]",18253611008,FlashAttention5FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 
128]",,4227072,32768,16,4096,4096,128,102801,308444,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,52903,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,59.17965986694505,202.6299749711455,0.1297544602353245,0.07328389691542332,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,-Attention_output-6,"XlaEinsum(a=1x4096x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",Attentionoutput6MatMulattnOutputattnAvgWo,MXU,32,Compute,154222,154222,22604,0,0,0,0,0,0,0,0,154222,25700,0,0,67108864,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",68719476736,Attentionoutput6MatMulattnOutputattnAvgWo,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 128], [16, 128, 4096], [1, 4096, 4096]]",1,50331648,16384,1,4096,4096,2048,0,154222,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,27199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.5880272334686,405.259949942291,0.9769747594189139,0.14656779383084664,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Reduce and split attention results,"ReduceScatter(['1', '4096', '4096']->['1', '4096', '2048'])",ReduceandsplitattentionresultsAllReduceReduceScatter7,VPU,32,ICI/NVLink,83887,1366,11302,83887,16777216,16777216,0,0,0,0,0,0,1366,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandsplitattentionresultsAllReduceReduceScatter7,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1366,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1089,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.09999890328656408,372.52494427026835,0.009574037155959336,0.13472873210497952,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+8,-Attention_layernorm-8,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",Attentionlayernorm8YnormLayerNormy,VPU,32,Memory,11302,6426,11302,0,0,0,0,0,0,0,0,0,6426,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,Attentionlayernorm8YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,6426,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2354,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.937786586444877,2764.997345602548,0.5684921287573602,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Gather results after Layernorm,AllGather(1x4096x2048->1x4096x4096),GatherresultsafterLayernormAllReduceAllGather8,ICINoCompute,32,ICI/NVLink,167773,0,22604,167773,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,GatherresultsafterLayernormAllReduceAllGather8,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,372.5271646808485,0.0,0.13472953514678065,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,All gather input of MLP before ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP9,ICINoCompute,32,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP9,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.003314444077893445,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+10,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,539728,539728,50859,0,0,0,0,0,0,0,0,539728,89951,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 2048], [2048, 7168], [1, 4096, 7168]]",1,81788928,57344,1,4096,7168,4096,0,539728,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93320,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.6284805976344,260.5479056117155,0.9770634555088177,0.0942307072736765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +11,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,539728,539728,50859,0,0,0,0,0,0,0,0,539728,89951,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 2048], [2048, 7168], [1, 4096, 7168]]",1,81788928,57344,1,4096,7168,4096,0,539728,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93320,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.6284805976344,260.5479056117155,0.9770634555088177,0.0942307072736765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+12,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x7168,b=1x4096x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,39557,2812,39557,0,0,0,0,0,0,0,0,0,2812,0,0,117440512,"DT_BFLOAT16:[1,4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",29360128,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,2812,117440512,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3320,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7422233233056096,2764.997345602548,0.07106151609467003,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,539728,539728,50859,0,0,0,0,0,0,0,0,539728,89951,0,0,150994944,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 3584], [3584, 4096], [1, 4096, 4096]]",1,50331648,57344,1,4096,4096,7168,0,539728,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93320,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.6284805976344,260.5479056117155,0.9770634555088177,0.0942307072736765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '4096', '4096']->['1', '4096', 
'2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP14,VPU,32,ICI/NVLink,83887,1366,11302,83887,16777216,16777216,0,0,0,0,0,0,1366,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP14,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1366,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1089,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.09999890328656408,372.52494427026835,0.009574037155959336,0.13472873210497952,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x2048,b=1x4096x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,11302,804,11302,0,0,0,0,0,0,0,0,0,804,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",8388608,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,804,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,948,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7422233233056096,2764.997345602548,0.07106151609467003,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +16,Attention-serving-decode-Input_layernorm,"LayerNorm(x=1x1x2048,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeInputlayernormXnormLayerNormX,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,8192,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,2048)]",16384,AttentionservingdecodeInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.032768,15.2587890625,0.0031372549019607837,0.005518549389692586,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+17,AllGatherMHA-17,AllGather(1x1x2048->1x1x4096),AllGatherMHA17AllGather,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,8192,8192,0,0,0,0,0,0,0,0,0,16384,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,4096)]",0,AllGatherMHA17AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,4.582218937687688,0.0,0.0016572220389467224,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,MXU,16384,Memory,5656,4838,5656,0,0,0,0,0,0,0,0,4838,803,0,0,16789504,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,1,16,128)]",16777216,AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 1, 4096], [4096, 16, 128], [1, 1, 16, 128]]",1,4200448,512,1,1,2048,4096,0,4838,16789504,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.96626874115983,2764.576395296477,0.006503697390073859,0.9998467975755795,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=2x4096x16x128,eq=BLM;TMND->BTLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,MXU,16384,Memory,11308,9657,11308,0,0,0,0,0,0,0,0,9657,1606,0,0,33570816,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[2,4096,16,128]","[DT_BFLOAT16:(1,2,1,16,128)]",33554432,AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,Einsum,33554432,[],Einsum,"BLM,TMND->BTLND","[[1, 1, 4096], [2, 4096, 16, 128], [1, 2, 1, 16, 
128]]",1,8398848,1024,1,1,4096,4096,0,9657,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9673180049522463,2764.8796240769807,0.006505997955121639,0.9999564644039713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,MXU,16384,Memory,5697,4838,5697,0,0,0,0,0,0,0,0,4838,803,0,0,16912384,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,1,4096,16)]",16777216,AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 16, 128], [1, 4096, 16, 128], [1, 1, 4096, 16]]",1,1057024,512,16,1,4096,128,0,4838,16912384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1183,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.944921186589433,2764.768300818962,0.006456891774312402,0.9999162028278343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x16x128,b=1x512x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,MXU,16384,Memory,714,622,714,0,0,0,0,0,0,0,0,622,100,0,0,2117632,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[1,512,16,128]","[DT_BFLOAT16:(1,1,512,16)]",2097152,AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 16, 128], [1, 512, 16, 128], [1, 1, 512, 16]]",1,132352,64,16,1,512,128,0,622,2117632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,150,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9371876750700276,2762.1827539609594,0.006439935650955487,0.9989811045066761,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+19,Attention-serving-decode-Softmax(Q*K)*V,"Softmax(x=1x1x4608x16,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,16384,Memory,500,29,500,0,0,0,0,0,0,0,0,0,29,0,0,294912,"DT_BFLOAT16:[1,1,4608,16]","[DT_BFLOAT16:(1,1,4608,16)]",294912,AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,29,294912,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,40,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.589824,549.31640625,0.05647058823529411,0.1986677780289331,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,MXU,16384,Memory,5697,4838,5697,0,0,0,0,0,0,0,0,4838,803,0,0,16912384,"DT_BFLOAT16:[1,1,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,1,16,128)]",16777216,AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 4096, 16], [1, 4096, 16, 128], [1, 1, 16, 128]]",1,264448,512,16,1,128,4096,0,4838,16912384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1183,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.944921186589433,2764.768300818962,0.006456891774312402,0.9999162028278343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x512x16,b=1x512x16x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,MXU,16384,Memory,714,622,714,0,0,0,0,0,0,0,0,622,100,0,0,2117632,"DT_BFLOAT16:[1,1,512,16],DT_BFLOAT16:[1,512,16,128]","[DT_BFLOAT16:(1,1,16,128)]",2097152,AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 512, 16], [1, 
512, 16, 128], [1, 1, 16, 128]]",1,132352,64,16,1,128,512,0,622,2117632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,150,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9371876750700276,2762.1827539609594,0.006439935650955487,0.9989811045066761,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"Add(a=1x1x16x128,b=1x1x16x128,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,16,128]","[DT_BFLOAT16:(1,1,16,128)]",2048,AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,Add,0,[],Add,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,15.2587890625,0.00039215686274509797,0.005518549389692586,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Attention-serving-decode-Attention_output,"XlaEinsum(a=1x1x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,16384,Memory,5656,4838,5656,0,0,0,0,0,0,0,0,4838,803,0,0,16789504,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,1,4096)]",16777216,AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 1, 16, 128], [16, 128, 4096], [1, 1, 4096]]",1,8398848,512,1,1,4096,2048,0,4838,16789504,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.96626874115983,2764.576395296477,0.006503697390073859,0.9998467975755795,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Reduce attention and scatter before LN,"ReduceScatter(['1', '1', '4096']->['1', '1', 
'2048'])",ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter21,VPU,16384,ICI/NVLink,3330,1,500,3330,4096,4096,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,2048)]",2048,ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter21,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.000615015015015015,2.291109468843844,5.888241182358828e-05,0.0008286110194733612,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,Attention-serving-decode-Attention_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionlayernormYnormLayerNormy,VPU,16384,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,4,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,34,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.0062745098039215675,0.011037098779385171,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,AllGatherMHA-23,AllGather(1x1x2048->1x1x4096),AllGatherMHA23AllGather,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,8192,8192,0,0,0,0,0,0,0,0,0,16384,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,4096)]",0,AllGatherMHA23AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,4.582218937687688,0.0,0.0016572220389467224,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,All gather input of MLP before 
ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP24,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP24,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.003314444077893445,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +25,Fwd-FFN-serving-decoder-FFgate,"XlaEinsum(a=1x1x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,16384,Memory,19787,16885,19787,0,0,0,0,0,0,0,0,16885,2811,0,0,58742784,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,1,7168)]",58720256,FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 7168], [1, 1, 7168]]",1,14696448,1792,1,1,7168,4096,0,16885,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4123,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9676179309647748,2764.8699062496053,0.00650665555839198,0.9999529498190254,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,Fwd-FFN-serving-decoder-FFup,"XlaEinsum(a=1x1x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,MXU,16384,Memory,19787,16885,19787,0,0,0,0,0,0,0,0,16885,2811,0,0,58742784,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,1,7168)]",58720256,FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 7168], [1, 1, 
7168]]",1,14696448,1792,1,1,7168,4096,0,16885,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4123,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9676179309647748,2764.8699062496053,0.00650665555839198,0.9999529498190254,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,Fwd-FFN-serving-decoder-FFgate_up,"Mul(a=1x1x7168,b=1x1x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,28672,"DT_BFLOAT16:[1,1,7168]","[DT_BFLOAT16:(1,1,7168)]",7168,FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,2,28672,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.014336,53.40576171875,0.001372549019607843,0.019314922863924052,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,Fwd-FFN-serving-decoder-FFoutput,"XlaEinsum(a=1x1x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,16384,Memory,19787,16885,19787,0,0,0,0,0,0,0,0,16885,2811,0,0,58742784,"DT_BFLOAT16:[1,1,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,1,4096)]",58720256,FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 1, 7168], [7168, 4096], [1, 1, 4096]]",1,8398848,1792,1,1,4096,7168,0,16885,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4123,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9676179309647748,2764.8699062496053,0.00650665555839198,0.9999529498190254,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +29,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '1', '4096']->['1', '1', 
'2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP29,VPU,16384,ICI/NVLink,3330,1,500,3330,4096,4096,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,2048)]",2048,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP29,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.000615015015015015,2.291109468843844,5.888241182358828e-05,0.0008286110194733612,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +30,FFN-serving-decoder-AttnPlusFFn,"Add(a=1x1x2048,b=1x1x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,2048)]",2048,FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,Add,0,[],Add,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,15.2587890625,0.00039215686274509797,0.005518549389692586,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_decode.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_decode.csv new file mode 100644 index 0000000..8540e35 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_decode.csv @@ -0,0 +1,22 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor 
Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor 
+16,Attention-serving-decode-Input_layernorm,"LayerNorm(x=1x1x2048,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeInputlayernormXnormLayerNormX,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,8192,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,2048)]",16384,AttentionservingdecodeInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.032768,15.2587890625,0.0031372549019607837,0.005518549389692586,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,AllGatherMHA-17,AllGather(1x1x2048->1x1x4096),AllGatherMHA17AllGather,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,8192,8192,0,0,0,0,0,0,0,0,0,16384,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,4096)]",0,AllGatherMHA17AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,4.582218937687688,0.0,0.0016572220389467224,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,MXU,16384,Memory,5656,4838,5656,0,0,0,0,0,0,0,0,4838,803,0,0,16789504,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,1,16,128)]",16777216,AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 1, 4096], [4096, 16, 128], [1, 1, 16, 128]]",1,4200448,512,1,1,2048,4096,0,4838,16789504,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.96626874115983,2764.576395296477,0.006503697390073859,0.9998467975755795,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+18,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=2x4096x16x128,eq=BLM;TMND->BTLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,MXU,16384,Memory,11308,9657,11308,0,0,0,0,0,0,0,0,9657,1606,0,0,33570816,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[2,4096,16,128]","[DT_BFLOAT16:(1,2,1,16,128)]",33554432,AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,Einsum,33554432,[],Einsum,"BLM,TMND->BTLND","[[1, 1, 4096], [2, 4096, 16, 128], [1, 2, 1, 16, 128]]",1,8398848,1024,1,1,4096,4096,0,9657,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2357,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9673180049522463,2764.8796240769807,0.006505997955121639,0.9999564644039713,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,MXU,16384,Memory,5697,4838,5697,0,0,0,0,0,0,0,0,4838,803,0,0,16912384,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,1,4096,16)]",16777216,AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 16, 128], [1, 4096, 16, 128], [1, 1, 4096, 16]]",1,1057024,512,16,1,4096,128,0,4838,16912384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1183,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.944921186589433,2764.768300818962,0.006456891774312402,0.9999162028278343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x16x128,b=1x512x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,MXU,16384,Memory,714,622,714,0,0,0,0,0,0,0,0,622,100,0,0,2117632,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[1,512,16,128]","[DT_BFLOAT16:(1,1,512,16)]",2097152,AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 16, 128], [1, 512, 16, 128], [1, 1, 512, 16]]",1,132352,64,16,1,512,128,0,622,2117632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,150,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9371876750700276,2762.1827539609594,0.006439935650955487,0.9989811045066761,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"Softmax(x=1x1x4608x16,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,16384,Memory,500,29,500,0,0,0,0,0,0,0,0,0,29,0,0,294912,"DT_BFLOAT16:[1,1,4608,16]","[DT_BFLOAT16:(1,1,4608,16)]",294912,AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,29,294912,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,40,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.589824,549.31640625,0.05647058823529411,0.1986677780289331,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,MXU,16384,Memory,5697,4838,5697,0,0,0,0,0,0,0,0,4838,803,0,0,16912384,"DT_BFLOAT16:[1,1,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,1,16,128)]",16777216,AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 4096, 16], [1, 4096, 16, 128], [1, 1, 16, 
128]]",1,264448,512,16,1,128,4096,0,4838,16912384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1183,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.944921186589433,2764.768300818962,0.006456891774312402,0.9999162028278343,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x512x16,b=1x512x16x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,MXU,16384,Memory,714,622,714,0,0,0,0,0,0,0,0,622,100,0,0,2117632,"DT_BFLOAT16:[1,1,512,16],DT_BFLOAT16:[1,512,16,128]","[DT_BFLOAT16:(1,1,16,128)]",2097152,AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 512, 16], [1, 512, 16, 128], [1, 1, 16, 128]]",1,132352,64,16,1,128,512,0,622,2117632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,150,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9371876750700276,2762.1827539609594,0.006439935650955487,0.9989811045066761,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"Add(a=1x1x16x128,b=1x1x16x128,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,16,128]","[DT_BFLOAT16:(1,1,16,128)]",2048,AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,Add,0,[],Add,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,15.2587890625,0.00039215686274509797,0.005518549389692586,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+20,Attention-serving-decode-Attention_output,"XlaEinsum(a=1x1x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,16384,Memory,5656,4838,5656,0,0,0,0,0,0,0,0,4838,803,0,0,16789504,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,1,4096)]",16777216,AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 1, 16, 128], [16, 128, 4096], [1, 1, 4096]]",1,8398848,512,1,1,4096,2048,0,4838,16789504,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1180,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.96626874115983,2764.576395296477,0.006503697390073859,0.9998467975755795,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Reduce attention and scatter before LN,"ReduceScatter(['1', '1', '4096']->['1', '1', '2048'])",ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter21,VPU,16384,ICI/NVLink,3330,1,500,3330,4096,4096,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,2048)]",2048,ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter21,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.000615015015015015,2.291109468843844,5.888241182358828e-05,0.0008286110194733612,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+22,Attention-serving-decode-Attention_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionlayernormYnormLayerNormy,VPU,16384,Memory,500,4,500,0,0,0,0,0,0,0,0,0,4,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,4,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,34,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.0062745098039215675,0.011037098779385171,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,AllGatherMHA-23,AllGather(1x1x2048->1x1x4096),AllGatherMHA23AllGather,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,8192,8192,0,0,0,0,0,0,0,0,0,16384,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,4096)]",0,AllGatherMHA23AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,4.582218937687688,0.0,0.0016572220389467224,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,All gather input of MLP before ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP24,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP24,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.003314444077893445,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+25,Fwd-FFN-serving-decoder-FFgate,"XlaEinsum(a=1x1x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,16384,Memory,19787,16885,19787,0,0,0,0,0,0,0,0,16885,2811,0,0,58742784,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,1,7168)]",58720256,FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 7168], [1, 1, 7168]]",1,14696448,1792,1,1,7168,4096,0,16885,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4123,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9676179309647748,2764.8699062496053,0.00650665555839198,0.9999529498190254,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,Fwd-FFN-serving-decoder-FFup,"XlaEinsum(a=1x1x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,MXU,16384,Memory,19787,16885,19787,0,0,0,0,0,0,0,0,16885,2811,0,0,58742784,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,1,7168)]",58720256,FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 7168], [1, 1, 7168]]",1,14696448,1792,1,1,7168,4096,0,16885,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4123,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9676179309647748,2764.8699062496053,0.00650665555839198,0.9999529498190254,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+27,Fwd-FFN-serving-decoder-FFgate_up,"Mul(a=1x1x7168,b=1x1x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,28672,"DT_BFLOAT16:[1,1,7168]","[DT_BFLOAT16:(1,1,7168)]",7168,FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,2,28672,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.014336,53.40576171875,0.001372549019607843,0.019314922863924052,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,Fwd-FFN-serving-decoder-FFoutput,"XlaEinsum(a=1x1x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,16384,Memory,19787,16885,19787,0,0,0,0,0,0,0,0,16885,2811,0,0,58742784,"DT_BFLOAT16:[1,1,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,1,4096)]",58720256,FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 1, 7168], [7168, 4096], [1, 1, 4096]]",1,8398848,1792,1,1,4096,7168,0,16885,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4123,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9676179309647748,2764.8699062496053,0.00650665555839198,0.9999529498190254,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +29,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '1', '4096']->['1', '1', 
'2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP29,VPU,16384,ICI/NVLink,3330,1,500,3330,4096,4096,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,2048)]",2048,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP29,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.000615015015015015,2.291109468843844,5.888241182358828e-05,0.0008286110194733612,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +30,FFN-serving-decoder-AttnPlusFFn,"Add(a=1x1x2048,b=1x1x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,2048)]",2048,FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,Add,0,[],Add,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,15.2587890625,0.00039215686274509797,0.005518549389692586,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_decode.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_decode.json new file mode 100644 index 0000000..dd7c151 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_decode.json @@ -0,0 +1,98 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 1875197952, + "overlapped_compute_time_non_pp_ns": 1326268416, + "compute_only_time_non_pp_ns": 0, + "memory_only_time_non_pp_ns": 276168704, + "ici_bound_time_non_pp_ns": 272760832, + "total_execution_time_chip_ns": 1875197952, + 
"overlapped_compute_time_chip_ns": 1326268416, + "compute_only_time_chip_ns": 0, + "memory_only_time_chip_ns": 276168704, + "ici_bound_time_chip_ns": 272760832, + "bounded_by_pp_chip": false, + "TPOT_ms_request": 3.662496, + "throughput_tokens_per_sec": 273.03784086044055, + "throughput_tokens_per_sec_request": 273.03784086044055, + "mem_footprint_GB": 11.28125, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "5p", + "num_sa": 8, + "num_vu": 6, + "num_vu_ports": 6, + "hbm_bw_GBps": 2765.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 128, + "freq_GHz": 1.7, + "sa_dim": 128, + "hbm_size_GB": 95, + "ici_bw_GBps": 200.0, + "dcn_bw_GBps": 25.0, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 350.0, + "min_power_W": 1.0, + "avg_power_W": 1.0, + "max_power_W": 331.0, + "HBM_GBps_per_W": 123.5, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 585.0, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 1.35868996, + "static_power_W_per_vu": 0.475076728, + "static_power_vmem_W": 24.21353615, + "static_power_ici_W": 6.114104803, + "static_power_hbm_mc_W": 10.264041296, + "static_power_hbm_phy_W": 15.396061944, + "static_power_other_W": 44.82811018, + "dynamic_power_W_per_SA": 28.19413333, + "dynamic_power_W_per_VU": 2.65216, + "dynamic_power_vmem_W": 50.18368, + "dynamic_power_ici_W_per_GBps": 0.01767315271, + "dynamic_power_hbm_W_per_GBps": 0.01261538462, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 2, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 2, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 3, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + 
"pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": true, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_prefill.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_prefill.csv new file mode 100644 index 0000000..bdcc81f --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_prefill.csv @@ -0,0 +1,18 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI 
Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,11302,6426,11302,0,0,0,0,0,0,0,0,0,6426,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,6426,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2354,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.937786586444877,2764.997345602548,0.5684921287573602,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,Concatenate dims before 
attn,AllGather(1x4096x2048->1x4096x4096),ConcatenatedimsbeforeattnAllGatherMHA3,ICINoCompute,32,ICI/NVLink,167773,0,22604,167773,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,ConcatenatedimsbeforeattnAllGatherMHA3,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,372.5271646808485,0.0,0.13472953514678065,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-Q-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",Q4MatMulQ,MXU,32,Compute,154222,154222,22604,0,0,0,0,0,0,0,0,154222,25700,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,Q4MatMulQ,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,154222,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,27199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.5880272334686,405.259949942291,0.9769747594189139,0.14656779383084664,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-K-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",K4MatMulK,MXU,32,Compute,154222,154222,22604,0,0,0,0,0,0,0,0,154222,25700,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,K4MatMulK,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,154222,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,27199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.5880272334686,405.259949942291,0.9769747594189139,0.14656779383084664,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+4,-V-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",V4MatMulV,MXU,32,Compute,154222,154222,22604,0,0,0,0,0,0,0,0,154222,25700,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,V4MatMulV,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,154222,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,27199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.5880272334686,405.259949942291,0.9769747594189139,0.14656779383084664,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,-FlashAttention-5,"FlashAttention(q=1x4096x16x128,k=1x4096x16x128,v=1x4096x16x128,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",FlashAttention5FlashAttention,MXU,32,Compute,308444,308444,22604,0,0,0,0,0,0,0,0,308444,154201,0,0,67108864,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,16)]",18253611008,FlashAttention5FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 128]",,4227072,32768,16,4096,4096,128,102801,308444,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,52903,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,59.17965986694505,202.6299749711455,0.1297544602353245,0.07328389691542332,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,-Attention_output-6,"XlaEinsum(a=1x4096x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",Attentionoutput6MatMulattnOutputattnAvgWo,MXU,32,Compute,154222,154222,22604,0,0,0,0,0,0,0,0,154222,25700,0,0,67108864,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",68719476736,Attentionoutput6MatMulattnOutputattnAvgWo,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 128], [16, 128, 4096], [1, 4096, 
4096]]",1,50331648,16384,1,4096,4096,2048,0,154222,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,27199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.5880272334686,405.259949942291,0.9769747594189139,0.14656779383084664,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Reduce and split attention results,"ReduceScatter(['1', '4096', '4096']->['1', '4096', '2048'])",ReduceandsplitattentionresultsAllReduceReduceScatter7,VPU,32,ICI/NVLink,83887,1366,11302,83887,16777216,16777216,0,0,0,0,0,0,1366,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandsplitattentionresultsAllReduceReduceScatter7,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1366,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1089,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.09999890328656408,372.52494427026835,0.009574037155959336,0.13472873210497952,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,-Attention_layernorm-8,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",Attentionlayernorm8YnormLayerNormy,VPU,32,Memory,11302,6426,11302,0,0,0,0,0,0,0,0,0,6426,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,Attentionlayernorm8YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,6426,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2354,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.937786586444877,2764.997345602548,0.5684921287573602,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Gather results after 
Layernorm,AllGather(1x4096x2048->1x4096x4096),GatherresultsafterLayernormAllReduceAllGather8,ICINoCompute,32,ICI/NVLink,167773,0,22604,167773,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,GatherresultsafterLayernormAllReduceAllGather8,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,372.5271646808485,0.0,0.13472953514678065,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,All gather input of MLP before ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP9,ICINoCompute,32,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP9,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.003314444077893445,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,539728,539728,50859,0,0,0,0,0,0,0,0,539728,89951,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 2048], [2048, 7168], [1, 4096, 7168]]",1,81788928,57344,1,4096,7168,4096,0,539728,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93320,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.6284805976344,260.5479056117155,0.9770634555088177,0.0942307072736765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+11,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,539728,539728,50859,0,0,0,0,0,0,0,0,539728,89951,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 2048], [2048, 7168], [1, 4096, 7168]]",1,81788928,57344,1,4096,7168,4096,0,539728,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93320,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.6284805976344,260.5479056117155,0.9770634555088177,0.0942307072736765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +12,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x7168,b=1x4096x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,39557,2812,39557,0,0,0,0,0,0,0,0,0,2812,0,0,117440512,"DT_BFLOAT16:[1,4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",29360128,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,2812,117440512,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3320,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7422233233056096,2764.997345602548,0.07106151609467003,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,539728,539728,50859,0,0,0,0,0,0,0,0,539728,89951,0,0,150994944,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 3584], [3584, 4096], [1, 4096, 
4096]]",1,50331648,57344,1,4096,4096,7168,0,539728,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93320,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.6284805976344,260.5479056117155,0.9770634555088177,0.0942307072736765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '4096', '4096']->['1', '4096', '2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP14,VPU,32,ICI/NVLink,83887,1366,11302,83887,16777216,16777216,0,0,0,0,0,0,1366,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP14,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1366,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1089,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.09999890328656408,372.52494427026835,0.009574037155959336,0.13472873210497952,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x2048,b=1x4096x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,11302,804,11302,0,0,0,0,0,0,0,0,0,804,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",8388608,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,804,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,948,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7422233233056096,2764.997345602548,0.07106151609467003,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_prefill.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_prefill.json new file mode 100644 index 0000000..39f1cdc --- /dev/null +++ 
b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_prefill.json @@ -0,0 +1,97 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 99988128, + "overlapped_compute_time_non_pp_ns": 9113504, + "compute_only_time_non_pp_ns": 72925408, + "memory_only_time_non_pp_ns": 1823840, + "ici_bound_time_non_pp_ns": 16125376, + "total_execution_time_chip_ns": 99988128, + "overlapped_compute_time_chip_ns": 9113504, + "compute_only_time_chip_ns": 72925408, + "memory_only_time_chip_ns": 1823840, + "ici_bound_time_chip_ns": 16125376, + "bounded_by_pp_chip": false, + "throughput_tokens_per_sec": 40964.86334857674, + "TTFT_sec": 0.099988128, + "mem_footprint_GB": 10.75, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "5p", + "num_sa": 8, + "num_vu": 6, + "num_vu_ports": 6, + "hbm_bw_GBps": 2765.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 128, + "freq_GHz": 1.7, + "sa_dim": 128, + "hbm_size_GB": 95, + "ici_bw_GBps": 200.0, + "dcn_bw_GBps": 25.0, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 350.0, + "min_power_W": 1.0, + "avg_power_W": 1.0, + "max_power_W": 331.0, + "HBM_GBps_per_W": 123.5, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 585.0, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 1.35868996, + "static_power_W_per_vu": 0.475076728, + "static_power_vmem_W": 24.21353615, + "static_power_ici_W": 6.114104803, + "static_power_hbm_mc_W": 10.264041296, + "static_power_hbm_phy_W": 15.396061944, + "static_power_other_W": 44.82811018, + "dynamic_power_W_per_SA": 28.19413333, + "dynamic_power_W_per_VU": 2.65216, + "dynamic_power_vmem_W": 50.18368, + "dynamic_power_ici_W_per_GBps": 0.01767315271, + "dynamic_power_hbm_W_per_GBps": 0.01261538462, + "dynamic_power_other_W": 0.0, + 
"pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 2, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 2, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 3, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": true, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv new file mode 100644 index 0000000..794f9d8 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv @@ -0,0 +1,39 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output 
Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor 
+2,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,4223,4096,4223,0,0,0,0,0,0,0,0,0,4096,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,4096,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1500,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.891277291025338,7399.952640303102,0.9699265924698082,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,Concatenate dims before attn,AllGather(1x4096x2048->1x4096x4096),ConcatenatedimsbeforeattnAllGatherMHA3,ICINoCompute,32,ICI/NVLink,111849,0,8446,111849,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,ConcatenatedimsbeforeattnAllGatherMHA3,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,558.7890817083747,0.0,0.07551203806869929,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-Q-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",Q4MatMulQ,MXU,32,Compute,32800,32800,8446,0,0,0,0,0,0,0,0,32800,4096,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,Q4MatMulQ,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,41943040,2048,1,4096,2048,4096,0,32800,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9153,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2095.1059980487803,1905.4878048780488,0.9912800151257325,0.25749835201054716,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+4,-K-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",K4MatMulK,MXU,32,Compute,32800,32800,8446,0,0,0,0,0,0,0,0,32800,4096,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,K4MatMulK,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,41943040,2048,1,4096,2048,4096,0,32800,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9153,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2095.1059980487803,1905.4878048780488,0.9912800151257325,0.25749835201054716,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-V-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",V4MatMulV,MXU,32,Compute,32800,32800,8446,0,0,0,0,0,0,0,0,32800,4096,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,V4MatMulV,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,41943040,2048,1,4096,2048,4096,0,32800,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9153,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2095.1059980487803,1905.4878048780488,0.9912800151257325,0.25749835201054716,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,-FlashAttention-5,"FlashAttention(q=1x4096x16x128,k=1x4096x16x128,v=1x4096x16x128,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",FlashAttention5FlashAttention,MXU,32,Compute,131136,131136,8446,0,0,0,0,0,0,0,0,131136,81920,0,0,67108864,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,16)]",18253611008,FlashAttention5FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 
256]",,6356992,8192,16,4096,4096,128,65536,131136,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33737,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,139.19603318692046,476.6044411908248,0.06585931499956493,0.06440600556632767,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,-Attention_output-6,"XlaEinsum(a=1x4096x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",Attentionoutput6MatMulattnOutputattnAvgWo,MXU,32,Compute,32800,32800,8446,0,0,0,0,0,0,0,0,32800,4096,0,0,67108864,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",68719476736,Attentionoutput6MatMulattnOutputattnAvgWo,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 128], [16, 128, 4096], [1, 4096, 4096]]",1,67108864,2048,1,4096,4096,2048,0,32800,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9153,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2095.1059980487803,1905.4878048780488,0.9912800151257325,0.25749835201054716,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Reduce and split attention results,"ReduceScatter(['1', '4096', '4096']->['1', '4096', '2048'])",ReduceandsplitattentionresultsAllReduceReduceScatter7,VPU,32,ICI/NVLink,55925,1024,4223,55925,16777216,16777216,0,0,0,0,0,0,1024,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandsplitattentionresultsAllReduceReduceScatter7,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1024,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,732,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.14999746088511398,558.7840858292356,0.009155118462226195,0.0755113629498967,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+8,-Attention_layernorm-8,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",Attentionlayernorm8YnormLayerNormy,VPU,32,Memory,4223,4096,4223,0,0,0,0,0,0,0,0,0,4096,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,Attentionlayernorm8YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,4096,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1500,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.891277291025338,7399.952640303102,0.9699265924698082,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Gather results after Layernorm,AllGather(1x4096x2048->1x4096x4096),GatherresultsafterLayernormAllReduceAllGather8,ICINoCompute,32,ICI/NVLink,111849,0,8446,111849,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,GatherresultsafterLayernormAllReduceAllGather8,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,558.7890817083747,0.0,0.07551203806869929,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,All gather input of MLP before ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP9,ICINoCompute,32,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP9,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.001238437550726402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+10,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,114720,114720,19004,0,0,0,0,0,0,0,0,114720,14336,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 4096], [4096, 7168], [1, 4096, 7168]]",1,104857600,7168,1,4096,7168,4096,0,114720,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,30825,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.567020362622,1225.810669456067,0.9919712843134074,0.16565009046703608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +11,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,114720,114720,19004,0,0,0,0,0,0,0,0,114720,14336,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 4096], [4096, 7168], [1, 4096, 7168]]",1,104857600,7168,1,4096,7168,4096,0,114720,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,30825,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.567020362622,1225.810669456067,0.9919712843134074,0.16565009046703608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+12,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x7168,b=1x4096x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,14781,1792,14781,0,0,0,0,0,0,0,0,0,1792,0,0,117440512,"DT_BFLOAT16:[1,4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",29360128,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,1792,117440512,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2116,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9863424666801974,7399.702320546648,0.12123672281983626,0.9999597730468442,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,114720,114720,19004,0,0,0,0,0,0,0,0,114720,14336,0,0,150994944,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 7168], [7168, 4096], [1, 4096, 4096]]",1,67108864,7168,1,4096,4096,7168,0,114720,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,30825,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.567020362622,1225.810669456067,0.9919712843134074,0.16565009046703608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '4096', '4096']->['1', '4096', 
'2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP14,VPU,32,ICI/NVLink,55925,1024,4223,55925,16777216,16777216,0,0,0,0,0,0,1024,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP14,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1024,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,732,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.14999746088511398,558.7840858292356,0.009155118462226195,0.0755113629498967,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x2048,b=1x4096x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,4223,512,4223,0,0,0,0,0,0,0,0,0,512,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",8388608,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,512,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,604,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9864096613781672,7399.952640303102,0.12124082405872602,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +16,Attention-serving-decode-Input_layernorm,"LayerNorm(x=1x1x2048,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeInputlayernormXnormLayerNormX,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,2048)]",16384,AttentionservingdecodeInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.032768,15.2587890625,0.002,0.0020619985219594594,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+17,AllGatherMHA-17,AllGather(1x1x2048->1x1x4096),AllGatherMHA17AllGather,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,8192,8192,0,0,0,0,0,0,0,0,0,16384,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,4096)]",0,AllGatherMHA17AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,4.582218937687688,0.0,0.000619218775363201,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,MXU,16384,Memory,2114,2080,2114,0,0,0,0,0,0,0,0,2080,256,0,0,16789504,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,1,16,128)]",16777216,AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 1, 4096], [4096, 16, 128], [1, 1, 16, 128]]",1,8396800,128,1,1,2048,4096,0,2080,16789504,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,758,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.936242194891201,7396.6149913892505,0.00375495955351184,0.9995425664039528,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=2x4096x16x128,eq=BLM;TMND->BTLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,MXU,16384,Memory,4226,4128,4226,0,0,0,0,0,0,0,0,4128,512,0,0,33570816,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[2,4096,16,128]","[DT_BFLOAT16:(1,2,1,16,128)]",33554432,AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,Einsum,33554432,[],Einsum,"BLM,TMND->BTLND","[[1, 1, 4096], [2, 4096, 16, 128], [1, 2, 1, 16, 
128]]",1,16789504,256,1,1,4096,4096,0,4128,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1509,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.939998106956933,7398.310172518339,0.0037567366285489972,0.9997716449349107,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,MXU,16384,Memory,2129,1024,2129,0,0,0,0,0,0,0,0,0,1024,0,0,16912384,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,1,4096,16)]",16777216,AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 16, 128], [1, 4096, 16, 128], [1, 1, 4096, 16]]",1,1057024,256,16,1,4096,128,0,1024,16912384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,240,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.8803269140441525,7398.255053905883,0.003728503755812133,0.999764196473768,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x16x128,b=1x512x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,MXU,16384,Memory,500,128,500,0,0,0,0,0,0,0,0,0,128,0,0,2117632,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[1,512,16,128]","[DT_BFLOAT16:(1,1,512,16)]",2097152,AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 16, 128], [1, 512, 16, 128], [1, 1, 512, 16]]",1,132352,32,16,1,512,128,0,128,2117632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.194304,3944.39697265625,0.0019844961240310078,0.5330266179265203,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+19,Attention-serving-decode-Softmax(Q*K)*V,"Softmax(x=1x1x4608x16,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,16384,Memory,500,18,500,0,0,0,0,0,0,0,0,0,18,0,0,294912,"DT_BFLOAT16:[1,1,4608,16]","[DT_BFLOAT16:(1,1,4608,16)]",294912,AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,18,294912,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.589824,549.31640625,0.036,0.07423194679054054,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,MXU,16384,Memory,2129,1024,2129,0,0,0,0,0,0,0,0,0,1024,0,0,16912384,"DT_BFLOAT16:[1,1,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,1,16,128)]",16777216,AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 4096, 16], [1, 4096, 16, 128], [1, 1, 16, 128]]",1,528640,256,16,1,128,4096,0,1024,16912384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,240,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.8803269140441525,7398.255053905883,0.003728503755812133,0.999764196473768,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x512x16,b=1x512x16x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,MXU,16384,Memory,500,128,500,0,0,0,0,0,0,0,0,0,128,0,0,2117632,"DT_BFLOAT16:[1,1,512,16],DT_BFLOAT16:[1,512,16,128]","[DT_BFLOAT16:(1,1,16,128)]",2097152,AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 512, 16], [1, 512, 16, 128], [1, 
1, 16, 128]]",1,132352,32,16,1,128,512,0,128,2117632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.194304,3944.39697265625,0.0019844961240310078,0.5330266179265203,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"Add(a=1x1x16x128,b=1x1x16x128,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,16,128]","[DT_BFLOAT16:(1,1,16,128)]",2048,AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,Add,0,[],Add,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,15.2587890625,0.00025,0.0020619985219594594,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Attention-serving-decode-Attention_output,"XlaEinsum(a=1x1x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,16384,Memory,2114,2080,2114,0,0,0,0,0,0,0,0,2080,256,0,0,16789504,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,1,4096)]",16777216,AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 1, 16, 128], [16, 128, 4096], [1, 1, 4096]]",1,16789504,128,1,1,4096,2048,0,2080,16789504,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,758,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.936242194891201,7396.6149913892505,0.00375495955351184,0.9995425664039528,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Reduce attention and scatter before LN,"ReduceScatter(['1', '1', '4096']->['1', '1', 
'2048'])",ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter21,VPU,16384,ICI/NVLink,3330,1,500,3330,4096,4096,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,2048)]",2048,ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter21,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.000615015015015015,2.291109468843844,3.7537537537537536e-05,0.0003096093876816005,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,Attention-serving-decode-Attention_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionlayernormYnormLayerNormy,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.004,0.004123997043918919,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,AllGatherMHA-23,AllGather(1x1x2048->1x1x4096),AllGatherMHA23AllGather,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,8192,8192,0,0,0,0,0,0,0,0,0,16384,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,4096)]",0,AllGatherMHA23AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,4.582218937687688,0.0,0.000619218775363201,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,All gather input of MLP before 
ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP24,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP24,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.001238437550726402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +25,Fwd-FFN-serving-decoder-FFgate,"XlaEinsum(a=1x1x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,16384,Memory,7394,7200,7394,0,0,0,0,0,0,0,0,7200,896,0,0,58742784,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,1,7168)]",58720256,FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 7168], [1, 1, 7168]]",1,29378560,448,1,1,7168,4096,0,7200,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2634,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.941608872058426,7399.037169997422,0.003757498747150948,0.9998698878374894,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,Fwd-FFN-serving-decoder-FFup,"XlaEinsum(a=1x1x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,MXU,16384,Memory,7394,7200,7394,0,0,0,0,0,0,0,0,7200,896,0,0,58742784,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,1,7168)]",58720256,FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 7168], [1, 1, 
7168]]",1,29378560,448,1,1,7168,4096,0,7200,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2634,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.941608872058426,7399.037169997422,0.003757498747150948,0.9998698878374894,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,Fwd-FFN-serving-decoder-FFgate_up,"Mul(a=1x1x7168,b=1x1x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,28672,"DT_BFLOAT16:[1,1,7168]","[DT_BFLOAT16:(1,1,7168)]",7168,FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,1,28672,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.014336,53.40576171875,0.000875,0.007216994826858108,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,Fwd-FFN-serving-decoder-FFoutput,"XlaEinsum(a=1x1x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,16384,Memory,7394,7200,7394,0,0,0,0,0,0,0,0,7200,896,0,0,58742784,"DT_BFLOAT16:[1,1,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,1,4096)]",58720256,FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 1, 7168], [7168, 4096], [1, 1, 4096]]",1,16789504,448,1,1,4096,7168,0,7200,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2634,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.941608872058426,7399.037169997422,0.003757498747150948,0.9998698878374894,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +29,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '1', '4096']->['1', '1', 
'2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP29,VPU,16384,ICI/NVLink,3330,1,500,3330,4096,4096,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,2048)]",2048,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP29,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.000615015015015015,2.291109468843844,3.7537537537537536e-05,0.0003096093876816005,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +30,FFN-serving-decoder-AttnPlusFFn,"Add(a=1x1x2048,b=1x1x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,2048)]",2048,FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,Add,0,[],Add,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,15.2587890625,0.00025,0.0020619985219594594,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_decode.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_decode.csv new file mode 100644 index 0000000..6b91033 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_decode.csv @@ -0,0 +1,22 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP 
Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor 
+16,Attention-serving-decode-Input_layernorm,"LayerNorm(x=1x1x2048,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeInputlayernormXnormLayerNormX,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,2048)]",16384,AttentionservingdecodeInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.032768,15.2587890625,0.002,0.0020619985219594594,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,AllGatherMHA-17,AllGather(1x1x2048->1x1x4096),AllGatherMHA17AllGather,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,8192,8192,0,0,0,0,0,0,0,0,0,16384,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,4096)]",0,AllGatherMHA17AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,4.582218937687688,0.0,0.000619218775363201,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +18,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,MXU,16384,Memory,2114,2080,2114,0,0,0,0,0,0,0,0,2080,256,0,0,16789504,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,1,16,128)]",16777216,AttentionservingdecodeQKVMatMulQ2xnormallgathered2Wq1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 1, 4096], [4096, 16, 128], [1, 1, 16, 128]]",1,8396800,128,1,1,2048,4096,0,2080,16789504,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,758,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.936242194891201,7396.6149913892505,0.00375495955351184,0.9995425664039528,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+18,Attention-serving-decode-Q/K/V,"XlaEinsum(a=1x1x4096,b=2x4096x16x128,eq=BLM;TMND->BTLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,MXU,16384,Memory,4226,4128,4226,0,0,0,0,0,0,0,0,4128,512,0,0,33570816,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[2,4096,16,128]","[DT_BFLOAT16:(1,2,1,16,128)]",33554432,AttentionservingdecodeQKVMatMulKV2xnorm2Wkv1,Einsum,33554432,[],Einsum,"BLM,TMND->BTLND","[[1, 1, 4096], [2, 4096, 16, 128], [1, 2, 1, 16, 128]]",1,16789504,256,1,1,4096,4096,0,4128,33570816,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1509,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.939998106956933,7398.310172518339,0.0037567366285489972,0.9997716449349107,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,MXU,16384,Memory,2129,1024,2129,0,0,0,0,0,0,0,0,0,1024,0,0,16912384,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,1,4096,16)]",16777216,AttentionservingdecodeSoftmaxQKVMatMulQKprefix2Q2Kcache2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 16, 128], [1, 4096, 16, 128], [1, 1, 4096, 16]]",1,1057024,256,16,1,4096,128,0,1024,16912384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,240,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.8803269140441525,7398.255053905883,0.003728503755812133,0.999764196473768,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x16x128,b=1x512x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,MXU,16384,Memory,500,128,500,0,0,0,0,0,0,0,0,0,128,0,0,2117632,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[1,512,16,128]","[DT_BFLOAT16:(1,1,512,16)]",2097152,AttentionservingdecodeSoftmaxQKVMatMulQKsuffix2Q2Ksuffix2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 1, 16, 128], [1, 512, 16, 128], [1, 1, 512, 16]]",1,132352,32,16,1,512,128,0,128,2117632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.194304,3944.39697265625,0.0019844961240310078,0.5330266179265203,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"Softmax(x=1x1x4608x16,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,16384,Memory,500,18,500,0,0,0,0,0,0,0,0,0,18,0,0,294912,"DT_BFLOAT16:[1,1,4608,16]","[DT_BFLOAT16:(1,1,4608,16)]",294912,AttentionservingdecodeSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,18,294912,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.589824,549.31640625,0.036,0.07423194679054054,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,MXU,16384,Memory,2129,1024,2129,0,0,0,0,0,0,0,0,0,1024,0,0,16912384,"DT_BFLOAT16:[1,1,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,1,16,128)]",16777216,AttentionservingdecodeSoftmaxQKVMatMulattnavgprefix2QKprefix2Vcache2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 4096, 16], [1, 4096, 16, 128], [1, 1, 16, 
128]]",1,528640,256,16,1,128,4096,0,1024,16912384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,240,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.8803269140441525,7398.255053905883,0.003728503755812133,0.999764196473768,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"XlaEinsum(a=1x1x512x16,b=1x512x16x128,eq=BLSN;BSND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,MXU,16384,Memory,500,128,500,0,0,0,0,0,0,0,0,0,128,0,0,2117632,"DT_BFLOAT16:[1,1,512,16],DT_BFLOAT16:[1,512,16,128]","[DT_BFLOAT16:(1,1,16,128)]",2097152,AttentionservingdecodeSoftmaxQKVMatMulattnavgsuffix2QKsuffix2Vsuffix2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 1, 512, 16], [1, 512, 16, 128], [1, 1, 16, 128]]",1,132352,32,16,1,128,512,0,128,2117632,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,4.194304,3944.39697265625,0.0019844961240310078,0.5330266179265203,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Attention-serving-decode-Softmax(Q*K)*V,"Add(a=1x1x16x128,b=1x1x16x128,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,16,128]","[DT_BFLOAT16:(1,1,16,128)]",2048,AttentionservingdecodeSoftmaxQKVattnavgAddBdbeamSNhDBdbeamSNhD,Add,0,[],Add,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,15.2587890625,0.00025,0.0020619985219594594,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+20,Attention-serving-decode-Attention_output,"XlaEinsum(a=1x1x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,16384,Memory,2114,2080,2114,0,0,0,0,0,0,0,0,2080,256,0,0,16789504,"DT_BFLOAT16:[1,1,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,1,4096)]",16777216,AttentionservingdecodeAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 1, 16, 128], [16, 128, 4096], [1, 1, 4096]]",1,16789504,128,1,1,4096,2048,0,2080,16789504,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,758,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.936242194891201,7396.6149913892505,0.00375495955351184,0.9995425664039528,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Reduce attention and scatter before LN,"ReduceScatter(['1', '1', '4096']->['1', '1', '2048'])",ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter21,VPU,16384,ICI/NVLink,3330,1,500,3330,4096,4096,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,2048)]",2048,ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter21,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.000615015015015015,2.291109468843844,3.7537537537537536e-05,0.0003096093876816005,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+22,Attention-serving-decode-Attention_layernorm,"LayerNorm(x=1x1x4096,memory_placements=0_0,type=DT_BFLOAT16)",AttentionservingdecodeAttentionlayernormYnormLayerNormy,VPU,16384,Memory,500,2,500,0,0,0,0,0,0,0,0,0,2,0,0,16384,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,4096)]",32768,AttentionservingdecodeAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,2,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.065536,30.517578125,0.004,0.004123997043918919,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,AllGatherMHA-23,AllGather(1x1x2048->1x1x4096),AllGatherMHA23AllGather,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,8192,8192,0,0,0,0,0,0,0,0,0,16384,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,4096)]",0,AllGatherMHA23AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,16384,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,4.582218937687688,0.0,0.000619218775363201,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,All gather input of MLP before ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP24,ICINoCompute,16384,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP24,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.001238437550726402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+25,Fwd-FFN-serving-decoder-FFgate,"XlaEinsum(a=1x1x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,16384,Memory,7394,7200,7394,0,0,0,0,0,0,0,0,7200,896,0,0,58742784,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,1,7168)]",58720256,FwdFFNservingdecoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 7168], [1, 1, 7168]]",1,29378560,448,1,1,7168,4096,0,7200,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2634,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.941608872058426,7399.037169997422,0.003757498747150948,0.9998698878374894,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,Fwd-FFN-serving-decoder-FFup,"XlaEinsum(a=1x1x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,MXU,16384,Memory,7394,7200,7394,0,0,0,0,0,0,0,0,7200,896,0,0,58742784,"DT_BFLOAT16:[1,1,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,1,7168)]",58720256,FwdFFNservingdecoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 1, 4096], [4096, 7168], [1, 1, 7168]]",1,29378560,448,1,1,7168,4096,0,7200,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2634,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.941608872058426,7399.037169997422,0.003757498747150948,0.9998698878374894,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+27,Fwd-FFN-serving-decoder-FFgate_up,"Mul(a=1x1x7168,b=1x1x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,28672,"DT_BFLOAT16:[1,1,7168]","[DT_BFLOAT16:(1,1,7168)]",7168,FwdFFNservingdecoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,1,28672,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.014336,53.40576171875,0.000875,0.007216994826858108,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,Fwd-FFN-serving-decoder-FFoutput,"XlaEinsum(a=1x1x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,16384,Memory,7394,7200,7394,0,0,0,0,0,0,0,0,7200,896,0,0,58742784,"DT_BFLOAT16:[1,1,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,1,4096)]",58720256,FwdFFNservingdecoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 1, 7168], [7168, 4096], [1, 1, 4096]]",1,16789504,448,1,1,4096,7168,0,7200,58742784,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2634,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.941608872058426,7399.037169997422,0.003757498747150948,0.9998698878374894,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +29,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '1', '4096']->['1', '1', 
'2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP29,VPU,16384,ICI/NVLink,3330,1,500,3330,4096,4096,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,4096]","[DT_BFLOAT16:(1,1,2048)]",2048,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP29,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.000615015015015015,2.291109468843844,3.7537537537537536e-05,0.0003096093876816005,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +30,FFN-serving-decoder-AttnPlusFFn,"Add(a=1x1x2048,b=1x1x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,VPU,16384,Memory,500,1,500,0,0,0,0,0,0,0,0,0,1,0,0,8192,"DT_BFLOAT16:[1,1,2048]","[DT_BFLOAT16:(1,1,2048)]",2048,FFNservingdecoderAttnPlusFFnattnPlusFFnAddBdbeamSMmhBdbeamSMmh,Add,0,[],Add,,,,,0,,,,,0,1,8192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.004096,15.2587890625,0.00025,0.0020619985219594594,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_decode.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_decode.json new file mode 100644 index 0000000..4812f9e --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_decode.json @@ -0,0 +1,98 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 910032896, + "overlapped_compute_time_non_pp_ns": 527859712, + "compute_only_time_non_pp_ns": 0, + "memory_only_time_non_pp_ns": 109412352, + "ici_bound_time_non_pp_ns": 272760832, + "total_execution_time_chip_ns": 910032896, + 
"overlapped_compute_time_chip_ns": 527859712, + "compute_only_time_chip_ns": 0, + "memory_only_time_chip_ns": 109412352, + "ici_bound_time_chip_ns": 272760832, + "bounded_by_pp_chip": false, + "TPOT_ms_request": 1.777408, + "throughput_tokens_per_sec": 562.6170243410629, + "throughput_tokens_per_sec_request": 562.6170243410629, + "mem_footprint_GB": 11.28125, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "6p", + "num_sa": 8, + "num_vu": 8, + "num_vu_ports": 8, + "hbm_bw_GBps": 7400.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 256, + "freq_GHz": 2.0, + "sa_dim": 256, + "hbm_size_GB": 192, + "ici_bw_GBps": 300.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 350.0, + "min_power_W": 1.0, + "avg_power_W": 1.0, + "max_power_W": 331.0, + "HBM_GBps_per_W": 123.5, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 1384.0, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 1.777942858, + "static_power_W_per_vu": 0.1554179582, + "static_power_vmem_W": 37.07309859, + "static_power_ici_W": 3.000278571, + "static_power_hbm_mc_W": 7.10422264, + "static_power_hbm_phy_W": 10.65633396, + "static_power_other_W": 41.27610279, + "dynamic_power_W_per_SA": 31.57742933, + "dynamic_power_W_per_VU": 0.7426048, + "dynamic_power_vmem_W": 28.1028608, + "dynamic_power_ici_W_per_GBps": 0.01262060716, + "dynamic_power_hbm_W_per_GBps": 0.008830769231, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 2, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 2, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 3, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + 
"pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": true, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_prefill.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_prefill.csv new file mode 100644 index 0000000..99628bd --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_prefill.csv @@ -0,0 +1,18 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI 
Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +2,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,4223,4096,4223,0,0,0,0,0,0,0,0,0,4096,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,4096,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1500,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.891277291025338,7399.952640303102,0.9699265924698082,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +3,Concatenate dims before 
attn,AllGather(1x4096x2048->1x4096x4096),ConcatenatedimsbeforeattnAllGatherMHA3,ICINoCompute,32,ICI/NVLink,111849,0,8446,111849,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,ConcatenatedimsbeforeattnAllGatherMHA3,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,558.7890817083747,0.0,0.07551203806869929,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-Q-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",Q4MatMulQ,MXU,32,Compute,32800,32800,8446,0,0,0,0,0,0,0,0,32800,4096,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,Q4MatMulQ,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,41943040,2048,1,4096,2048,4096,0,32800,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9153,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2095.1059980487803,1905.4878048780488,0.9912800151257325,0.25749835201054716,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,-K-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",K4MatMulK,MXU,32,Compute,32800,32800,8446,0,0,0,0,0,0,0,0,32800,4096,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,K4MatMulK,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,41943040,2048,1,4096,2048,4096,0,32800,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9153,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2095.1059980487803,1905.4878048780488,0.9912800151257325,0.25749835201054716,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+4,-V-4,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",V4MatMulV,MXU,32,Compute,32800,32800,8446,0,0,0,0,0,0,0,0,32800,4096,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,V4MatMulV,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,41943040,2048,1,4096,2048,4096,0,32800,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9153,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2095.1059980487803,1905.4878048780488,0.9912800151257325,0.25749835201054716,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,-FlashAttention-5,"FlashAttention(q=1x4096x16x128,k=1x4096x16x128,v=1x4096x16x128,memory_placements=[0, 0, 0],type=DT_BFLOAT16)",FlashAttention5FlashAttention,MXU,32,Compute,131136,131136,8446,0,0,0,0,0,0,0,0,131136,81920,0,0,67108864,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,16)]",18253611008,FlashAttention5FlashAttention,FlashAttention,0,[],FlashAttention,,"[4096, 256]",,6356992,8192,16,4096,4096,128,65536,131136,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33737,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,139.19603318692046,476.6044411908248,0.06585931499956493,0.06440600556632767,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,-Attention_output-6,"XlaEinsum(a=1x4096x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",Attentionoutput6MatMulattnOutputattnAvgWo,MXU,32,Compute,32800,32800,8446,0,0,0,0,0,0,0,0,32800,4096,0,0,67108864,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",68719476736,Attentionoutput6MatMulattnOutputattnAvgWo,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 128], [16, 128, 4096], [1, 4096, 
4096]]",1,67108864,2048,1,4096,4096,2048,0,32800,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9153,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2095.1059980487803,1905.4878048780488,0.9912800151257325,0.25749835201054716,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Reduce and split attention results,"ReduceScatter(['1', '4096', '4096']->['1', '4096', '2048'])",ReduceandsplitattentionresultsAllReduceReduceScatter7,VPU,32,ICI/NVLink,55925,1024,4223,55925,16777216,16777216,0,0,0,0,0,0,1024,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandsplitattentionresultsAllReduceReduceScatter7,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1024,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,732,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.14999746088511398,558.7840858292356,0.009155118462226195,0.0755113629498967,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,-Attention_layernorm-8,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",Attentionlayernorm8YnormLayerNormy,VPU,32,Memory,4223,4096,4223,0,0,0,0,0,0,0,0,0,4096,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,Attentionlayernorm8YnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,4096,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1500,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.891277291025338,7399.952640303102,0.9699265924698082,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Gather results after 
Layernorm,AllGather(1x4096x2048->1x4096x4096),GatherresultsafterLayernormAllReduceAllGather8,ICINoCompute,32,ICI/NVLink,111849,0,8446,111849,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,GatherresultsafterLayernormAllReduceAllGather8,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,558.7890817083747,0.0,0.07551203806869929,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,All gather input of MLP before ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP9,ICINoCompute,32,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP9,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.001238437550726402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,114720,114720,19004,0,0,0,0,0,0,0,0,114720,14336,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 4096], [4096, 7168], [1, 4096, 7168]]",1,104857600,7168,1,4096,7168,4096,0,114720,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,30825,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.567020362622,1225.810669456067,0.9919712843134074,0.16565009046703608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+11,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,114720,114720,19004,0,0,0,0,0,0,0,0,114720,14336,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 4096], [4096, 7168], [1, 4096, 7168]]",1,104857600,7168,1,4096,7168,4096,0,114720,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,30825,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.567020362622,1225.810669456067,0.9919712843134074,0.16565009046703608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +12,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x7168,b=1x4096x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,14781,1792,14781,0,0,0,0,0,0,0,0,0,1792,0,0,117440512,"DT_BFLOAT16:[1,4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",29360128,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,1792,117440512,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2116,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9863424666801974,7399.702320546648,0.12123672281983626,0.9999597730468442,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,114720,114720,19004,0,0,0,0,0,0,0,0,114720,14336,0,0,150994944,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 7168], [7168, 4096], [1, 4096, 
4096]]",1,67108864,7168,1,4096,4096,7168,0,114720,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,30825,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.567020362622,1225.810669456067,0.9919712843134074,0.16565009046703608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '4096', '4096']->['1', '4096', '2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP14,VPU,32,ICI/NVLink,55925,1024,4223,55925,16777216,16777216,0,0,0,0,0,0,1024,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP14,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1024,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,732,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.14999746088511398,558.7840858292356,0.009155118462226195,0.0755113629498967,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x2048,b=1x4096x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,4223,512,4223,0,0,0,0,0,0,0,0,0,512,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",8388608,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,512,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,604,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9864096613781672,7399.952640303102,0.12124082405872602,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_prefill.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_prefill.json new file mode 100644 index 0000000..cc4607e --- /dev/null +++ 
b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p_prefill.json @@ -0,0 +1,97 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 31130368, + "overlapped_compute_time_non_pp_ns": 3577152, + "compute_only_time_non_pp_ns": 16232128, + "memory_only_time_non_pp_ns": 542528, + "ici_bound_time_non_pp_ns": 10778560, + "total_execution_time_chip_ns": 31130368, + "overlapped_compute_time_chip_ns": 3577152, + "compute_only_time_chip_ns": 16232128, + "memory_only_time_chip_ns": 542528, + "ici_bound_time_chip_ns": 10778560, + "bounded_by_pp_chip": false, + "throughput_tokens_per_sec": 131575.7012573703, + "TTFT_sec": 0.031130368, + "mem_footprint_GB": 10.75, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "6p", + "num_sa": 8, + "num_vu": 8, + "num_vu_ports": 8, + "hbm_bw_GBps": 7400.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 256, + "freq_GHz": 2.0, + "sa_dim": 256, + "hbm_size_GB": 192, + "ici_bw_GBps": 300.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 350.0, + "min_power_W": 1.0, + "avg_power_W": 1.0, + "max_power_W": 331.0, + "HBM_GBps_per_W": 123.5, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 1384.0, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 1.777942858, + "static_power_W_per_vu": 0.1554179582, + "static_power_vmem_W": 37.07309859, + "static_power_ici_W": 3.000278571, + "static_power_hbm_mc_W": 7.10422264, + "static_power_hbm_phy_W": 10.65633396, + "static_power_other_W": 41.27610279, + "dynamic_power_W_per_SA": 31.57742933, + "dynamic_power_W_per_VU": 0.7426048, + "dynamic_power_vmem_W": 28.1028608, + "dynamic_power_ici_W_per_GBps": 0.01262060716, + "dynamic_power_hbm_W_per_GBps": 0.008830769231, + "dynamic_power_other_W": 0.0, + 
"pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 2, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 2, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 3, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v6p.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": true, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v3.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v3.csv new file mode 100644 index 0000000..84c7326 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v3.csv @@ -0,0 +1,54 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output 
Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor 
+3,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,34723,17430,34723,0,0,0,0,0,0,0,0,0,17430,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,17430,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8415,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9326919908994038,899.9798404515739,0.5019666282879519,0.9999776005017488,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +4,AllGatherMHA-4,AllGather(1x4096x2048->1x4096x4096),AllGatherMHA4AllGather,ICINoCompute,32,ICI/NVLink,204601,0,69445,204601,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,AllGatherMHA4AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,305.4726027732025,0.0,0.33941400308133607,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,Fwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1,MXU,32,Compute,557822,557822,69445,0,0,0,0,0,0,0,0,557822,69719,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,FwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 16, 128], [1, 4096, 16, 
128]]",1,29360128,16384,1,4096,2048,4096,0,557822,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,147571,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.19248207492713,112.04290974540265,0.9695773555471956,0.12449212193933629,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,Fwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1,MXU,32,Compute,557822,557822,69445,0,0,0,0,0,0,0,0,557822,69719,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,FwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,557822,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,147571,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.19248207492713,112.04290974540265,0.9695773555471956,0.12449212193933629,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,Fwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1,MXU,32,Compute,557822,557822,69445,0,0,0,0,0,0,0,0,557822,69719,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,FwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,557822,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,147571,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.19248207492713,112.04290974540265,0.9695773555471956,0.12449212193933629,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+6,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_1,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2,MXU,32,Memory,607639,557822,607639,0,0,0,0,0,0,0,0,557822,69719,0,0,587202560,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,4096,16)]",68719476736,FwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 4096, 1, 1], [1, 2048, 1, 1], [1, 4096, 2048, 1]]",32,35651584,16384,16,4096,4096,128,0,557822,587202560,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,210473,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,113.09260389145528,899.9998354286015,0.890087008282957,0.9999998171428905,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"Softmax(x=1x4096x4096x16,memory_placements=1_1,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,32,Memory,1111112,278877,1111112,0,0,0,0,0,0,0,0,0,278877,0,0,1073741824,"DT_BFLOAT16:[1,4096,4096,16]","[DT_BFLOAT16:(1,4096,4096,16)]",1073741824,FwdAttentionencoderAttentionSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,278877,1073741824,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,199581,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9663668685065052,899.999280000576,0.2509887353792245,0.99999920000064,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+6,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=1_0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2,MXU,32,Memory,590278,557822,590278,0,0,0,0,0,0,0,0,557822,69719,0,0,570425344,"DT_BFLOAT16:[1,4096,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,FwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 4096, 2048, 1], [1, 2048, 1, 128], [1, 4096, 1, 128]]",16,9699328,16384,16,4096,128,4096,0,557822,570425344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208444,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,116.41883440683881,899.9996611765981,0.9162658605369804,0.9999996235295535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"Add(a=1x4096x16x128,b=1x4096x16x128,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVattnavgAddBdLlMmhBdLlMmh,VPU,32,Memory,34723,2179,34723,0,0,0,0,0,0,0,0,0,2179,0,0,33554432,"DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",8388608,FwdAttentionencoderAttentionSoftmaxQKVattnavgAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,2179,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4603,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24158649886242547,899.9798404515739,0.06274582853599399,0.9999776005017488,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+7,Fwd-Attention_encoder-Attention_output,"XlaEinsum(a=1x4096x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,32,Compute,557822,557822,86806,0,0,0,0,0,0,0,0,557822,69719,0,0,83886080,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",68719476736,FwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 64], [16, 64, 2048], [1, 4096, 2048]]",2,50331648,16384,1,4096,4096,2048,0,557822,83886080,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,149601,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.19248207492713,140.05363718175332,0.9695773555471956,0.15561515242417034,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Reduce attention and scatter before LN,"ReduceScatter(['1', '4096', '4096']->['1', '4096', '2048'])",ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter8,VPU,32,ICI/NVLink,102301,2048,34723,102301,16777216,16777216,0,0,0,0,0,0,2048,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter8,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,2048,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4570,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.08199927664441208,305.47110976432293,0.021297185797356028,0.33941234418258104,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+9,Fwd-Attention_encoder-Attention_layernorm,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionlayernormYnormLayerNormy,VPU,32,Memory,69445,34860,69445,0,0,0,0,0,0,0,0,0,34860,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,FwdAttentionencoderAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,34860,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16831,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9327198214414285,899.9928000575995,0.5019738565495732,0.9999920000639995,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,AllGatherMHA-10,AllGather(1x4096x2048->1x4096x4096),AllGatherMHA10AllGather,ICINoCompute,32,ICI/NVLink,204601,0,69445,204601,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,AllGatherMHA10AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,305.4726027732025,0.0,0.33941400308133607,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +11,All gather input of MLP before ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP11,ICINoCompute,32,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP11,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.010182708750417083,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+12,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,1952205,1952205,190973,0,0,0,0,0,0,0,0,1952205,244017,0,0,184549376,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 3584]]",2,81788928,57344,1,4096,7168,4096,0,1952205,184549376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,510371,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.2033360103063,88.04147105452553,0.9696627806460731,0.0978238567272506,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,1952205,1952205,190973,0,0,0,0,0,0,0,0,1952205,244017,0,0,184549376,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 3584]]",2,81788928,57344,1,4096,7168,4096,0,1952205,184549376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,510371,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.2033360103063,88.04147105452553,0.9696627806460731,0.0978238567272506,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+14,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x7168,b=1x4096x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,121528,7626,121528,0,0,0,0,0,0,0,0,0,7626,0,0,117440512,"DT_BFLOAT16:[1,4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",29360128,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,7626,117440512,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24159146863274308,899.9983542887236,0.06274711930496361,0.999998171431915,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,1952205,1952205,217014,0,0,0,0,0,0,0,0,1952205,244017,0,0,209715200,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1024], [1024, 2048], [1, 4096, 2048]]",2,50331648,57344,1,4096,4096,7168,0,1952205,209715200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,513414,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.2033360103063,100.04712619832446,0.9696627806460731,0.11116347355369385,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +16,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '4096', '4096']->['1', '4096', 
'2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP16,VPU,32,ICI/NVLink,102301,2048,34723,102301,16777216,16777216,0,0,0,0,0,0,2048,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP16,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,2048,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4570,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.08199927664441208,305.47110976432293,0.021297185797356028,0.33941234418258104,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x2048,b=1x4096x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,34723,2179,34723,0,0,0,0,0,0,0,0,0,2179,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",8388608,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,2179,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4603,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24158649886242547,899.9798404515739,0.06274582853599399,0.9999776005017488,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Bwd-FFN-encoder-FFdown,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFdowneinsumXGrad,MXU,32,Compute,1952205,1952205,217014,0,0,0,0,0,0,0,0,1952205,244017,0,0,209715200,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,BwdFFNencoderFFdowneinsumXGrad,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1024], [1024, 2048], [1, 4096, 
2048]]",2,50331648,57344,1,4096,4096,7168,0,1952205,209715200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,513414,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.2033360103063,100.04712619832446,0.9696627806460731,0.11116347355369385,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-FFN-encoder-FFdown,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFdowneinsumYGrad,MXU,32,Compute,1952205,1952205,217014,0,0,0,0,0,0,0,0,1952205,244017,0,0,209715200,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,BwdFFNencoderFFdowneinsumYGrad,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 1024], [1024, 2048], [1, 4096, 2048]]",2,50331648,57344,1,4096,4096,7168,0,1952205,209715200,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,513414,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.2033360103063,100.04712619832446,0.9696627806460731,0.11116347355369385,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Bwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x7168,b=1x4096x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,121528,7626,121528,0,0,0,0,0,0,0,0,0,7626,0,0,117440512,"DT_BFLOAT16:[1,4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",29360128,BwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,7626,117440512,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.24159146863274308,899.9983542887236,0.06274711930496361,0.999998171431915,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+20,Bwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFgateeinsumXGrad,MXU,32,Compute,1952205,1952205,190973,0,0,0,0,0,0,0,0,1952205,244017,0,0,184549376,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,BwdFFNencoderFFgateeinsumXGrad,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 3584]]",2,81788928,57344,1,4096,7168,4096,0,1952205,184549376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,510371,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.2033360103063,88.04147105452553,0.9696627806460731,0.0978238567272506,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Bwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFgateeinsumYGrad,MXU,32,Compute,1952205,1952205,190973,0,0,0,0,0,0,0,0,1952205,244017,0,0,184549376,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,BwdFFNencoderFFgateeinsumYGrad,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 3584]]",2,81788928,57344,1,4096,7168,4096,0,1952205,184549376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,510371,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.2033360103063,88.04147105452553,0.9696627806460731,0.0978238567272506,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFupeinsumXGrad,MXU,32,Compute,1952205,1952205,190973,0,0,0,0,0,0,0,0,1952205,244017,0,0,184549376,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,BwdFFNencoderFFupeinsumXGrad,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 
3584]]",2,81788928,57344,1,4096,7168,4096,0,1952205,184549376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,510371,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.2033360103063,88.04147105452553,0.9696627806460731,0.0978238567272506,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Bwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFupeinsumYGrad,MXU,32,Compute,1952205,1952205,190973,0,0,0,0,0,0,0,0,1952205,244017,0,0,184549376,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,BwdFFNencoderFFupeinsumYGrad,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 256], [256, 3584], [1, 4096, 3584]]",2,81788928,57344,1,4096,7168,4096,0,1952205,184549376,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,510371,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.2033360103063,88.04147105452553,0.9696627806460731,0.0978238567272506,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-FFN-encoder-,AllReduce(1x4096x4096->1x4096x4096),BwdFFNencoderAllReduce,VPU,1,ICI/NVLink,409201,0,138889,409201,67108864,67108864,0,0,0,0,0,0,0,0,0,134217728,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",0,BwdFFNencoderAllReduce,AllReduce,0,[],AllReduce,,,,,0,,,,,0,0,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16232,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,305.47334928311517,0.0,0.3394148325367946,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,All reduce results of MLP gradient 
calculation,AllReduce(1x4096x4096->1x4096x4096),AllreduceresultsofMLPgradientcalculationAllReduceMLP19,VPU,32,ICI/NVLink,409201,0,138889,409201,67108864,67108864,0,0,0,0,0,0,0,0,0,134217728,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",0,AllreduceresultsofMLPgradientcalculationAllReduceMLP19,AllReduce,0,[],AllReduce,,,,,0,,,,,0,0,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,16232,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,305.47334928311517,0.0,0.3394148325367946,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Softmax,"Softmax(x=1x4096x2048,memory_placements=1_1,type=DT_BFLOAT16)",SoftmaxSoftmaxBackprop,VPU,1,Memory,34723,8715,34723,0,0,0,0,0,0,0,0,0,8715,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",33554432,SoftmaxSoftmaxBackprop,Softmax,0,[],Softmax,,,,,0,,,,,0,8715,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6237,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9663459954497019,899.9798404515739,0.25098331414397596,0.9999776005017488,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,softmax-bwd-dl/dy-20,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLV;BLV->BV,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwddldy20MatMuldldy,MXU,1,Compute,34859,34859,34727,0,0,0,0,0,0,0,0,0,34859,0,0,33558528,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,2048)]",16777216,softmaxbwddldy20MatMuldldy,Einsum,16777216,[],Einsum,"BLV,BLV->BV","[[1, 4096, 1], [1, 4096, 1], [1, 1]]",2048,4098,65536,2048,1,1,4096,0,34859,33558528,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4058,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.48128793138070514,896.5780629755766,0.0037879412112263853,0.9961978477506407,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+20,softmax-bwd-e1-20,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde120MatMule1,MXU,1,Compute,34859,34859,34731,0,0,0,0,0,0,0,0,0,34859,0,0,33562624,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096)]",16777216,softmaxbwde120MatMule1,Einsum,16777216,[],Einsum,"BLN,BLN->BL","[[1, 1, 2048], [1, 1, 2048], [1, 1]]",4096,4098,65536,4096,1,1,2048,0,34859,33562624,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4059,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.48128793138070514,896.6874951814812,0.0037879412112263853,0.9963194390905347,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,softmax-bwd-e2-20,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde220MatMule2,MXU,1,Compute,34859,34859,34731,0,0,0,0,0,0,0,0,0,34859,0,0,33562624,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096)]",16777216,softmaxbwde220MatMule2,Einsum,16777216,[],Einsum,"BLN,BLN->BL","[[1, 1, 2048], [1, 1, 2048], [1, 1]]",4096,4098,65536,4096,1,1,2048,0,34859,33562624,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4059,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.48128793138070514,896.6874951814812,0.0037879412112263853,0.9963194390905347,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Bwd-Attention_encoder-Attention_output,"XlaEinsum(a=1x4096x16x128,b=16x128x2048,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1XGrad,MXU,32,Compute,278945,278945,43403,0,0,0,0,0,0,0,0,278945,34859,0,0,41943040,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,2048]","[DT_BFLOAT16:(1,4096,2048)]",34359738368,BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1XGrad,Einsum,8388608,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 64], [16, 64, 2048], [1, 4096, 
2048]]",1,29360128,8192,1,4096,2048,2048,0,278945,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,74809,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.17746641094122,140.03656634820484,0.9694591758698806,0.15559618483133872,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,Bwd-Attention_encoder-Attention_output,"XlaEinsum(a=1x4096x16x128,b=16x128x2048,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1YGrad,MXU,32,Compute,278945,278945,43403,0,0,0,0,0,0,0,0,278945,34859,0,0,41943040,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,2048]","[DT_BFLOAT16:(1,4096,2048)]",34359738368,BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1YGrad,Einsum,8388608,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 64], [16, 64, 2048], [1, 4096, 2048]]",1,29360128,8192,1,4096,2048,2048,0,278945,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,74809,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.17746641094122,140.03656634820484,0.9694591758698806,0.15559618483133872,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=1_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2XGrad,MXU,32,Memory,590278,557822,590278,0,0,0,0,0,0,0,0,557822,69719,0,0,570425344,"DT_BFLOAT16:[1,4096,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2XGrad,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 4096, 2048, 1], [1, 2048, 1, 128], [1, 4096, 1, 
128]]",16,9699328,16384,16,4096,128,4096,0,557822,570425344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208444,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,116.41883440683881,899.9996611765981,0.9162658605369804,0.9999996235295535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=1_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2YGrad,MXU,32,Memory,590278,557822,590278,0,0,0,0,0,0,0,0,557822,69719,0,0,570425344,"DT_BFLOAT16:[1,4096,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2YGrad,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 4096, 2048, 1], [1, 2048, 1, 128], [1, 4096, 1, 128]]",16,9699328,16384,16,4096,128,4096,0,557822,570425344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,208444,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,116.41883440683881,899.9996611765981,0.9162658605369804,0.9999996235295535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"Pointwise Mul.(x=1x4096x4096x16,memory_placements=1_1,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVSoftmaxBackprop,VPU,32,Memory,1111112,69720,1111112,0,0,0,0,0,0,0,0,0,69720,0,0,1073741824,"DT_BFLOAT16:[1,4096,4096,16]","[DT_BFLOAT16:(1,4096,4096,16)]",268435456,BwdAttentionencoderAttentionSoftmaxQKVSoftmaxBackprop,Pointwise Mul.,0,[],Pointwise Mul.,,,,,0,,,,,0,69720,1073741824,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,147292,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.2415917171266263,899.999280000576,0.06274718384480613,0.99999920000064,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+23,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_1,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2XGrad,MXU,32,Memory,607639,557822,607639,0,0,0,0,0,0,0,0,557822,69719,0,0,587202560,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,4096,16)]",68719476736,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2XGrad,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 4096, 1, 1], [1, 2048, 1, 1], [1, 4096, 2048, 1]]",32,35651584,16384,16,4096,4096,128,0,557822,587202560,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,210473,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,113.09260389145528,899.9998354286015,0.890087008282957,0.9999998171428905,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_1,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2YGrad,MXU,32,Memory,607639,557822,607639,0,0,0,0,0,0,0,0,557822,69719,0,0,587202560,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,4096,16)]",68719476736,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2YGrad,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 4096, 1, 1], [1, 2048, 1, 1], [1, 4096, 2048, 1]]",32,35651584,16384,16,4096,4096,128,0,557822,587202560,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,210473,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,113.09260389145528,899.9998354286015,0.890087008282957,0.9999998171428905,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+23,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1XGrad,MXU,32,Compute,278945,278945,43403,0,0,0,0,0,0,0,0,278945,34859,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1XGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 16, 128], [1, 4096, 16, 128]]",1,29360128,8192,1,4096,2048,2048,0,278945,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,74809,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.17746641094122,140.03656634820484,0.9694591758698806,0.15559618483133872,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1YGrad,MXU,32,Compute,278945,278945,43403,0,0,0,0,0,0,0,0,278945,34859,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1YGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 16, 128], [1, 4096, 16, 128]]",1,29360128,8192,1,4096,2048,2048,0,278945,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,74809,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.17746641094122,140.03656634820484,0.9694591758698806,0.15559618483133872,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+23,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1XGrad,MXU,32,Compute,278945,278945,43403,0,0,0,0,0,0,0,0,278945,34859,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1XGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 16, 128], [1, 4096, 16, 128]]",1,29360128,8192,1,4096,2048,2048,0,278945,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,74809,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.17746641094122,140.03656634820484,0.9694591758698806,0.15559618483133872,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1YGrad,MXU,32,Compute,278945,278945,43403,0,0,0,0,0,0,0,0,278945,34859,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1YGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 16, 128], [1, 4096, 16, 128]]",1,29360128,8192,1,4096,2048,2048,0,278945,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,74809,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.17746641094122,140.03656634820484,0.9694591758698806,0.15559618483133872,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+23,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1XGrad,MXU,32,Compute,278945,278945,43403,0,0,0,0,0,0,0,0,278945,34859,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1XGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 16, 128], [1, 4096, 16, 128]]",1,29360128,8192,1,4096,2048,2048,0,278945,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,74809,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.17746641094122,140.03656634820484,0.9694591758698806,0.15559618483133872,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1YGrad,MXU,32,Compute,278945,278945,43403,0,0,0,0,0,0,0,0,278945,34859,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1YGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 1024], [1024, 16, 128], [1, 4096, 16, 128]]",1,29360128,8192,1,4096,2048,2048,0,278945,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,74809,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,123.17746641094122,140.03656634820484,0.9694591758698806,0.15559618483133872,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+25,Bwd-Attention_encoder-,AllReduce(1x4096x2048->1x4096x2048),BwdAttentionencoderAllReduce,VPU,1,ICI/NVLink,204601,0,69445,204601,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",0,BwdAttentionencoderAllReduce,AllReduce,0,[],AllReduce,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8116,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,305.4726027732025,0.0,0.33941400308133607,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,Softmax,"Softmax(x=1x4096x2048,memory_placements=1_1,type=DT_BFLOAT16)",SoftmaxSoftmaxBackprop,VPU,1,Memory,34723,8715,34723,0,0,0,0,0,0,0,0,0,8715,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",33554432,SoftmaxSoftmaxBackprop,Softmax,0,[],Softmax,,,,,0,,,,,0,8715,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,6237,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.9663459954497019,899.9798404515739,0.25098331414397596,0.9999776005017488,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,softmax-bwd-dl/dy-26,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLV;BLV->BV,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwddldy26MatMuldldy,MXU,1,Compute,34859,34859,34727,0,0,0,0,0,0,0,0,0,34859,0,0,33558528,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,2048)]",16777216,softmaxbwddldy26MatMuldldy,Einsum,16777216,[],Einsum,"BLV,BLV->BV","[[1, 4096, 1], [1, 4096, 1], [1, 1]]",2048,4098,65536,2048,1,1,4096,0,34859,33558528,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4058,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.48128793138070514,896.5780629755766,0.0037879412112263853,0.9961978477506407,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+26,softmax-bwd-e1-26,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde126MatMule1,MXU,1,Compute,34859,34859,34731,0,0,0,0,0,0,0,0,0,34859,0,0,33562624,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096)]",16777216,softmaxbwde126MatMule1,Einsum,16777216,[],Einsum,"BLN,BLN->BL","[[1, 1, 2048], [1, 1, 2048], [1, 1]]",4096,4098,65536,4096,1,1,2048,0,34859,33562624,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4059,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.48128793138070514,896.6874951814812,0.0037879412112263853,0.9963194390905347,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,softmax-bwd-e2-26,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde226MatMule2,MXU,1,Compute,34859,34859,34731,0,0,0,0,0,0,0,0,0,34859,0,0,33562624,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096)]",16777216,softmaxbwde226MatMule2,Einsum,16777216,[],Einsum,"BLN,BLN->BL","[[1, 1, 2048], [1, 1, 2048], [1, 1]]",4096,4098,65536,4096,1,1,2048,0,34859,33562624,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4059,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.48128793138070514,896.6874951814812,0.0037879412112263853,0.9963194390905347,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,WeightUpdateOptimizerStates: Load optimizer states from 
HBM,"Abs(x=27044872192,type=DT_BFLOAT16)",WeightUpdateOptimizerStatesLoadoptimizerstatesfromHBMinput,VPU,1,Memory,111944445,7024205,111944445,0,0,0,0,0,0,0,0,0,7024205,0,0,108179488768,DT_BFLOAT16:[27044872192],[DT_BFLOAT16:(27044872192)],0,WeightUpdateOptimizerStatesLoadoptimizerstatesfromHBMinput,Input,0,[],Abs,,,,,0,,,,,0,7024205,108179488768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,14839651,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,899.9999955334988,0.0,0.9999999950372209,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,WeightUpdateWriteHBM,"Abs(x=6761218048,type=DT_BFLOAT16)",WeightUpdateWriteHBMoutput,VPU,1,Memory,27986112,1756052,27986112,0,0,0,0,0,0,0,0,0,1756052,0,0,27044872192,DT_BFLOAT16:[6761218048],[DT_BFLOAT16:(6761218048)],0,WeightUpdateWriteHBMoutput,Output,0,[],Abs,,,,,0,,,,,0,1756052,27044872192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3709913,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,899.9999714143929,0.0,0.9999999682382144,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v3.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v3.json new file mode 100644 index 0000000..45faeb8 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v3.json @@ -0,0 +1,101 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 1078156495, + "overlapped_compute_time_non_pp_ns": 207750705, + "compute_only_time_non_pp_ns": 626991112, + "memory_only_time_non_pp_ns": 210089228, + "ici_bound_time_non_pp_ns": 33325450, + "total_execution_time_chip_ns": 1078156495, + "overlapped_compute_time_chip_ns": 207750705, + "compute_only_time_chip_ns": 626991112, + 
"memory_only_time_chip_ns": 210089228, + "ici_bound_time_chip_ns": 33325450, + "bounded_by_pp_chip": false, + "total_execution_time_pod_ns": 1078156495, + "compute_only_time_pod_ns": 626991112, + "memory_only_time_pod_ns": 210089228, + "ici_bound_time_pod_ns": 33325450, + "bounded_by_pp_dcn": false, + "total_execution_time_ns": 1078156495, + "mem_footprint_GB": 46.296875, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "3", + "num_sa": 4, + "num_vu": 4, + "num_vu_ports": 2, + "hbm_bw_GBps": 900.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 32, + "freq_GHz": 0.94, + "sa_dim": 128, + "hbm_size_GB": 32, + "ici_bw_GBps": 164.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 450.0, + "min_power_W": 175.0, + "avg_power_W": 220.0, + "max_power_W": 262.0, + "HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 40.478, + "ICI_topology": "TORUS_2D", + "embodied_carbon_kgCO2": 311.8333333, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 2.9866666675, + "static_power_W_per_vu": 0.74127482825, + "static_power_vmem_W": 12.93490069, + "static_power_ici_W": 8.96, + "static_power_hbm_mc_W": 4.032, + "static_power_hbm_phy_W": 6.048, + "static_power_other_W": 37.11333333, + "dynamic_power_W_per_SA": 30.28855467, + "dynamic_power_W_per_VU": 2.8491776, + "dynamic_power_vmem_W": 29.830784, + "dynamic_power_ici_W_per_GBps": 0.0247047779, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 2, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 2, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 2, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + 
"pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v3.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": false, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v4.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v4.csv new file mode 100644 index 0000000..aa831f2 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v4.csv @@ -0,0 +1,54 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency 
(GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +3,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,26042,15604,26042,0,0,0,0,0,0,0,0,0,15604,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,15604,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,5717,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.576947392673374,1199.9846401966056,0.5991786162279981,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+4,AllGatherMHA-4,AllGather(1x4096x2048->1x4096x4096),AllGatherMHA4AllGather,ICINoCompute,32,ICI/NVLink,299594,0,52084,299594,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,AllGatherMHA4AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,208.61565985967675,0.0,0.17384638321639728,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,Fwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1,MXU,32,Compute,249692,249692,52084,0,0,0,0,0,0,0,0,249692,62415,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,FwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,249692,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66056,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2169742562838,250.30837992406646,0.9844929539272973,0.20859031660338873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,Fwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1,MXU,32,Compute,249692,249692,52084,0,0,0,0,0,0,0,0,249692,62415,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,FwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 
128]]",1,29360128,16384,1,4096,2048,4096,0,249692,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66056,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2169742562838,250.30837992406646,0.9844929539272973,0.20859031660338873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,Fwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1,MXU,32,Compute,249692,249692,52084,0,0,0,0,0,0,0,0,249692,62415,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,FwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,249692,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66056,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2169742562838,250.30837992406646,0.9844929539272973,0.20859031660338873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_1,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2,MXU,32,Memory,442709,249692,442709,0,0,0,0,0,0,0,0,249692,62415,0,0,570425344,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,4096,16)]",68719476736,FwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 4096, 1, 1], [1, 4096, 1, 1], [1, 4096, 4096, 
1]]",16,35651584,16384,16,4096,4096,128,0,249692,570425344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,155.22493722964748,1199.9981929438977,0.555263196935266,0.9999984941199148,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"Softmax(x=1x4096x4096x16,memory_placements=1_1,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,32,Memory,833334,249661,833334,0,0,0,0,0,0,0,0,0,249661,0,0,1073741824,"DT_BFLOAT16:[1,4096,4096,16]","[DT_BFLOAT16:(1,4096,4096,16)]",1073741824,FwdAttentionencoderAttentionSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,249661,1073741824,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,120544,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.2884891580086735,1199.999040000768,0.2995929031828203,0.99999920000064,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=1_0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2,MXU,32,Memory,442709,249692,442709,0,0,0,0,0,0,0,0,249692,62415,0,0,570425344,"DT_BFLOAT16:[1,4096,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,FwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 4096, 4096, 1], [1, 4096, 1, 128], [1, 4096, 1, 128]]",16,9699328,16384,16,4096,128,4096,0,249692,570425344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,155.22493722964748,1199.9981929438977,0.555263196935266,0.9999984941199148,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+6,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"Add(a=1x4096x16x128,b=1x4096x16x128,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVattnavgAddBdLlMmhBdLlMmh,VPU,32,Memory,26042,1951,26042,0,0,0,0,0,0,0,0,0,1951,0,0,33554432,"DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",8388608,FwdAttentionencoderAttentionSoftmaxQKVattnavgAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,1951,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2304,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32211842408417174,1199.9846401966056,0.07489732702849976,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Fwd-Attention_encoder-Attention_output,"XlaEinsum(a=1x4096x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,32,Compute,249692,249692,52084,0,0,0,0,0,0,0,0,249692,62415,0,0,67108864,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",68719476736,FwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 128], [16, 128, 4096], [1, 4096, 4096]]",1,50331648,16384,1,4096,4096,2048,0,249692,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,66056,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2169742562838,250.30837992406646,0.9844929539272973,0.20859031660338873,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Reduce attention and scatter before LN,"ReduceScatter(['1', '4096', '4096']->['1', '4096', 
'2048'])",ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter8,VPU,32,ICI/NVLink,149797,2048,26042,149797,16777216,16777216,0,0,0,0,0,0,2048,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter8,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,2048,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.05599983978317322,208.61565985967675,0.013020796080536929,0.17384638321639728,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,Fwd-Attention_encoder-Attention_layernorm,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionlayernormYnormLayerNormy,VPU,32,Memory,52084,31208,52084,0,0,0,0,0,0,0,0,0,31208,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,FwdAttentionencoderAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,31208,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,11435,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.576947392673374,1199.9846401966056,0.5991786162279981,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,AllGatherMHA-10,AllGather(1x4096x2048->1x4096x4096),AllGatherMHA10AllGather,ICINoCompute,32,ICI/NVLink,299594,0,52084,299594,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,AllGatherMHA10AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,208.61565985967675,0.0,0.17384638321639728,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +11,All gather input of MLP before 
ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP11,ICINoCompute,32,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP11,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.007637031562812813,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +12,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,873844,873844,117188,0,0,0,0,0,0,0,0,873844,218453,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 2048], [2048, 7168], [1, 4096, 7168]]",1,81788928,57344,1,4096,7168,4096,0,873844,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,226635,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2415403390079,160.92689312966615,0.984580830539606,0.1341057442747218,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,873844,873844,117188,0,0,0,0,0,0,0,0,873844,218453,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 2048], [2048, 7168], [1, 4096, 
7168]]",1,81788928,57344,1,4096,7168,4096,0,873844,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,226635,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2415403390079,160.92689312966615,0.984580830539606,0.1341057442747218,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x7168,b=1x4096x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,91146,6827,91146,0,0,0,0,0,0,0,0,0,6827,0,0,117440512,"DT_BFLOAT16:[1,4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",29360128,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,6827,117440512,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8064,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.3221219581769908,1199.9978057182982,0.07489814875767085,0.9999981714319152,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,873844,873844,117188,0,0,0,0,0,0,0,0,873844,218453,0,0,150994944,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 3584], [3584, 4096], [1, 4096, 4096]]",1,50331648,57344,1,4096,4096,7168,0,873844,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,226635,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2415403390079,160.92689312966615,0.984580830539606,0.1341057442747218,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +16,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '4096', '4096']->['1', '4096', 
'2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP16,VPU,32,ICI/NVLink,149797,2048,26042,149797,16777216,16777216,0,0,0,0,0,0,2048,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP16,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,2048,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2328,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.05599983978317322,208.61565985967675,0.013020796080536929,0.17384638321639728,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x2048,b=1x4096x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,26042,1951,26042,0,0,0,0,0,0,0,0,0,1951,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",8388608,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,1951,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2304,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32211842408417174,1199.9846401966056,0.07489732702849976,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Bwd-FFN-encoder-FFdown,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFdowneinsumXGrad,MXU,32,Compute,873844,873844,117188,0,0,0,0,0,0,0,0,873844,218453,0,0,150994944,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,BwdFFNencoderFFdowneinsumXGrad,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 3584], [3584, 4096], [1, 4096, 
4096]]",1,50331648,57344,1,4096,4096,7168,0,873844,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,226635,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2415403390079,160.92689312966615,0.984580830539606,0.1341057442747218,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-FFN-encoder-FFdown,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFdowneinsumYGrad,MXU,32,Compute,873844,873844,117188,0,0,0,0,0,0,0,0,873844,218453,0,0,150994944,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,BwdFFNencoderFFdowneinsumYGrad,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 3584], [3584, 4096], [1, 4096, 4096]]",1,50331648,57344,1,4096,4096,7168,0,873844,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,226635,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2415403390079,160.92689312966615,0.984580830539606,0.1341057442747218,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Bwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x7168,b=1x4096x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,91146,6827,91146,0,0,0,0,0,0,0,0,0,6827,0,0,117440512,"DT_BFLOAT16:[1,4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",29360128,BwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,6827,117440512,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,8064,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.3221219581769908,1199.9978057182982,0.07489814875767085,0.9999981714319152,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+20,Bwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFgateeinsumXGrad,MXU,32,Compute,873844,873844,117188,0,0,0,0,0,0,0,0,873844,218453,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,BwdFFNencoderFFgateeinsumXGrad,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 2048], [2048, 7168], [1, 4096, 7168]]",1,81788928,57344,1,4096,7168,4096,0,873844,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,226635,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2415403390079,160.92689312966615,0.984580830539606,0.1341057442747218,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Bwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFgateeinsumYGrad,MXU,32,Compute,873844,873844,117188,0,0,0,0,0,0,0,0,873844,218453,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,BwdFFNencoderFFgateeinsumYGrad,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 2048], [2048, 7168], [1, 4096, 7168]]",1,81788928,57344,1,4096,7168,4096,0,873844,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,226635,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2415403390079,160.92689312966615,0.984580830539606,0.1341057442747218,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFupeinsumXGrad,MXU,32,Compute,873844,873844,117188,0,0,0,0,0,0,0,0,873844,218453,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,BwdFFNencoderFFupeinsumXGrad,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 2048], [2048, 7168], [1, 4096, 
7168]]",1,81788928,57344,1,4096,7168,4096,0,873844,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,226635,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2415403390079,160.92689312966615,0.984580830539606,0.1341057442747218,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Bwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFupeinsumYGrad,MXU,32,Compute,873844,873844,117188,0,0,0,0,0,0,0,0,873844,218453,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,BwdFFNencoderFFupeinsumYGrad,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 2048], [2048, 7168], [1, 4096, 7168]]",1,81788928,57344,1,4096,7168,4096,0,873844,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,226635,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.2415403390079,160.92689312966615,0.984580830539606,0.1341057442747218,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-FFN-encoder-,AllReduce(1x4096x4096->1x4096x4096),BwdFFNencoderAllReduce,VPU,1,ICI/NVLink,599187,0,104167,599187,67108864,67108864,0,0,0,0,0,0,0,0,0,134217728,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",0,BwdFFNencoderAllReduce,AllReduce,0,[],AllReduce,,,,,0,,,,,0,0,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7266,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,208.61600802420614,0.0,0.17384667335350512,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,All reduce results of MLP gradient 
calculation,AllReduce(1x4096x4096->1x4096x4096),AllreduceresultsofMLPgradientcalculationAllReduceMLP19,VPU,32,ICI/NVLink,599187,0,104167,599187,67108864,67108864,0,0,0,0,0,0,0,0,0,134217728,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",0,AllreduceresultsofMLPgradientcalculationAllReduceMLP19,AllReduce,0,[],AllReduce,,,,,0,,,,,0,0,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7266,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,208.61600802420614,0.0,0.17384667335350512,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Softmax,"Softmax(x=1x4096x2048,memory_placements=1_1,type=DT_BFLOAT16)",SoftmaxSoftmaxBackprop,VPU,1,Memory,26042,7802,26042,0,0,0,0,0,0,0,0,0,7802,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",33554432,SoftmaxSoftmaxBackprop,Softmax,0,[],Softmax,,,,,0,,,,,0,7802,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3767,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.288473696336687,1199.9846401966056,0.29958930811399903,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,softmax-bwd-dl/dy-20,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLV;BLV->BV,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwddldy20MatMuldldy,MXU,1,Compute,31207,31207,26045,0,0,0,0,0,0,0,0,0,31207,0,0,33558528,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,2048)]",16777216,softmaxbwddldy20MatMuldldy,Einsum,16777216,[],Einsum,"BLV,BLV->BV","[[1, 4096, 2048], [1, 4096, 2048], [1, 2048]]",1,4098,65536,2048,1,1,4096,0,31207,33558528,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1816,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.537610664274041,1001.5001344975686,0.001923115070806294,0.8345834454146406,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+20,softmax-bwd-e1-20,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde120MatMule1,MXU,1,Compute,31207,31207,26049,0,0,0,0,0,0,0,0,0,31207,0,0,33562624,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096)]",16777216,softmaxbwde120MatMule1,Einsum,16777216,[],Einsum,"BLN,BLN->BL","[[1, 4096, 2048], [1, 4096, 2048], [1, 4096]]",1,4098,65536,4096,1,1,2048,0,31207,33562624,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1817,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.537610664274041,1001.6223730102621,0.001923115070806294,0.8346853108418851,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,softmax-bwd-e2-20,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde220MatMule2,MXU,1,Compute,31207,31207,26049,0,0,0,0,0,0,0,0,0,31207,0,0,33562624,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096)]",16777216,softmaxbwde220MatMule2,Einsum,16777216,[],Einsum,"BLN,BLN->BL","[[1, 4096, 2048], [1, 4096, 2048], [1, 4096]]",1,4098,65536,4096,1,1,2048,0,31207,33562624,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1817,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.537610664274041,1001.6223730102621,0.001923115070806294,0.8346853108418851,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Bwd-Attention_encoder-Attention_output,"XlaEinsum(a=1x4096x16x128,b=16x128x2048,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1XGrad,MXU,32,Compute,124861,124861,32553,0,0,0,0,0,0,0,0,124861,31207,0,0,41943040,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,2048]","[DT_BFLOAT16:(1,4096,2048)]",34359738368,BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1XGrad,Einsum,8388608,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 128], [16, 128, 2048], [1, 4096, 
2048]]",1,29360128,8192,1,4096,2048,2048,0,124861,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33485,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.1839114535363,312.8478868501774,0.9843746832558392,0.2607065723751478,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,Bwd-Attention_encoder-Attention_output,"XlaEinsum(a=1x4096x16x128,b=16x128x2048,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1YGrad,MXU,32,Compute,124861,124861,32553,0,0,0,0,0,0,0,0,124861,31207,0,0,41943040,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,2048]","[DT_BFLOAT16:(1,4096,2048)]",34359738368,BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1YGrad,Einsum,8388608,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 128], [16, 128, 2048], [1, 4096, 2048]]",1,29360128,8192,1,4096,2048,2048,0,124861,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33485,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.1839114535363,312.8478868501774,0.9843746832558392,0.2607065723751478,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=1_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2XGrad,MXU,32,Memory,442709,249692,442709,0,0,0,0,0,0,0,0,249692,62415,0,0,570425344,"DT_BFLOAT16:[1,4096,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2XGrad,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 4096, 4096, 1], [1, 4096, 1, 128], [1, 4096, 1, 
128]]",16,9699328,16384,16,4096,128,4096,0,249692,570425344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,155.22493722964748,1199.9981929438977,0.555263196935266,0.9999984941199148,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=1_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2YGrad,MXU,32,Memory,442709,249692,442709,0,0,0,0,0,0,0,0,249692,62415,0,0,570425344,"DT_BFLOAT16:[1,4096,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2YGrad,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 4096, 4096, 1], [1, 4096, 1, 128], [1, 4096, 1, 128]]",16,9699328,16384,16,4096,128,4096,0,249692,570425344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,155.22493722964748,1199.9981929438977,0.555263196935266,0.9999984941199148,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"Pointwise Mul.(x=1x4096x4096x16,memory_placements=1_1,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVSoftmaxBackprop,VPU,32,Memory,833334,62416,833334,0,0,0,0,0,0,0,0,0,62416,0,0,1073741824,"DT_BFLOAT16:[1,4096,4096,16]","[DT_BFLOAT16:(1,4096,4096,16)]",268435456,BwdAttentionencoderAttentionSoftmaxQKVSoftmaxBackprop,Pointwise Mul.,0,[],Pointwise Mul.,,,,,0,,,,,0,62416,1073741824,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,73732,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.32212228950216837,1199.999040000768,0.07489822579570507,0.99999920000064,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+23,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_1,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2XGrad,MXU,32,Memory,442709,249692,442709,0,0,0,0,0,0,0,0,249692,62415,0,0,570425344,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,4096,16)]",68719476736,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2XGrad,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 4096, 1, 1], [1, 4096, 1, 1], [1, 4096, 4096, 1]]",16,35651584,16384,16,4096,4096,128,0,249692,570425344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,155.22493722964748,1199.9981929438977,0.555263196935266,0.9999984941199148,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_1,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2YGrad,MXU,32,Memory,442709,249692,442709,0,0,0,0,0,0,0,0,249692,62415,0,0,570425344,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,4096,16)]",68719476736,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2YGrad,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 4096, 1, 1], [1, 4096, 1, 1], [1, 4096, 4096, 1]]",16,35651584,16384,16,4096,4096,128,0,249692,570425344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93303,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,155.22493722964748,1199.9981929438977,0.555263196935266,0.9999984941199148,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+23,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1XGrad,MXU,32,Compute,124861,124861,32553,0,0,0,0,0,0,0,0,124861,31207,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1XGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 2048], [2048, 16, 128], [1, 4096, 16, 128]]",1,29360128,8192,1,4096,2048,2048,0,124861,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33485,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.1839114535363,312.8478868501774,0.9843746832558392,0.2607065723751478,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1YGrad,MXU,32,Compute,124861,124861,32553,0,0,0,0,0,0,0,0,124861,31207,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1YGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 2048], [2048, 16, 128], [1, 4096, 16, 128]]",1,29360128,8192,1,4096,2048,2048,0,124861,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33485,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.1839114535363,312.8478868501774,0.9843746832558392,0.2607065723751478,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+23,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1XGrad,MXU,32,Compute,124861,124861,32553,0,0,0,0,0,0,0,0,124861,31207,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1XGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 2048], [2048, 16, 128], [1, 4096, 16, 128]]",1,29360128,8192,1,4096,2048,2048,0,124861,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33485,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.1839114535363,312.8478868501774,0.9843746832558392,0.2607065723751478,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1YGrad,MXU,32,Compute,124861,124861,32553,0,0,0,0,0,0,0,0,124861,31207,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1YGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 2048], [2048, 16, 128], [1, 4096, 16, 128]]",1,29360128,8192,1,4096,2048,2048,0,124861,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33485,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.1839114535363,312.8478868501774,0.9843746832558392,0.2607065723751478,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+23,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1XGrad,MXU,32,Compute,124861,124861,32553,0,0,0,0,0,0,0,0,124861,31207,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1XGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 2048], [2048, 16, 128], [1, 4096, 16, 128]]",1,29360128,8192,1,4096,2048,2048,0,124861,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33485,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.1839114535363,312.8478868501774,0.9843746832558392,0.2607065723751478,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1YGrad,MXU,32,Compute,124861,124861,32553,0,0,0,0,0,0,0,0,124861,31207,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1YGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 2048], [2048, 16, 128], [1, 4096, 16, 128]]",1,29360128,8192,1,4096,2048,2048,0,124861,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,33485,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,275.1839114535363,312.8478868501774,0.9843746832558392,0.2607065723751478,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+25,Bwd-Attention_encoder-,AllReduce(1x4096x2048->1x4096x2048),BwdAttentionencoderAllReduce,VPU,1,ICI/NVLink,299594,0,52084,299594,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",0,BwdAttentionencoderAllReduce,AllReduce,0,[],AllReduce,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3633,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,208.61565985967675,0.0,0.17384638321639728,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,Softmax,"Softmax(x=1x4096x2048,memory_placements=1_1,type=DT_BFLOAT16)",SoftmaxSoftmaxBackprop,VPU,1,Memory,26042,7802,26042,0,0,0,0,0,0,0,0,0,7802,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",33554432,SoftmaxSoftmaxBackprop,Softmax,0,[],Softmax,,,,,0,,,,,0,7802,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3767,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.288473696336687,1199.9846401966056,0.29958930811399903,0.999987200163838,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,softmax-bwd-dl/dy-26,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLV;BLV->BV,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwddldy26MatMuldldy,MXU,1,Compute,31207,31207,26045,0,0,0,0,0,0,0,0,0,31207,0,0,33558528,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,2048)]",16777216,softmaxbwddldy26MatMuldldy,Einsum,16777216,[],Einsum,"BLV,BLV->BV","[[1, 4096, 2048], [1, 4096, 2048], [1, 2048]]",1,4098,65536,2048,1,1,4096,0,31207,33558528,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1816,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.537610664274041,1001.5001344975686,0.001923115070806294,0.8345834454146406,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+26,softmax-bwd-e1-26,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde126MatMule1,MXU,1,Compute,31207,31207,26049,0,0,0,0,0,0,0,0,0,31207,0,0,33562624,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096)]",16777216,softmaxbwde126MatMule1,Einsum,16777216,[],Einsum,"BLN,BLN->BL","[[1, 4096, 2048], [1, 4096, 2048], [1, 4096]]",1,4098,65536,4096,1,1,2048,0,31207,33562624,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1817,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.537610664274041,1001.6223730102621,0.001923115070806294,0.8346853108418851,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,softmax-bwd-e2-26,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde226MatMule2,MXU,1,Compute,31207,31207,26049,0,0,0,0,0,0,0,0,0,31207,0,0,33562624,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096)]",16777216,softmaxbwde226MatMule2,Einsum,16777216,[],Einsum,"BLN,BLN->BL","[[1, 4096, 2048], [1, 4096, 2048], [1, 4096]]",1,4098,65536,4096,1,1,2048,0,31207,33562624,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1817,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.537610664274041,1001.6223730102621,0.001923115070806294,0.8346853108418851,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,WeightUpdateOptimizerStates: Load optimizer states from 
HBM,"Abs(x=27044872192,type=DT_BFLOAT16)",WeightUpdateOptimizerStatesLoadoptimizerstatesfromHBMinput,VPU,1,Memory,83958334,6288336,83958334,0,0,0,0,0,0,0,0,0,6288336,0,0,108179488768,DT_BFLOAT16:[27044872192],[DT_BFLOAT16:(27044872192)],0,WeightUpdateOptimizerStatesLoadoptimizerstatesfromHBMinput,Input,0,[],Abs,,,,,0,,,,,0,6288336,108179488768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,7428552,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,1199.9999904714641,0.0,0.9999999920595535,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,WeightUpdateWriteHBM,"Abs(x=6761218048,type=DT_BFLOAT16)",WeightUpdateWriteHBMoutput,VPU,1,Memory,20989584,1572084,20989584,0,0,0,0,0,0,0,0,0,1572084,0,0,27044872192,DT_BFLOAT16:[6761218048],[DT_BFLOAT16:(6761218048)],0,WeightUpdateWriteHBMoutput,Output,0,[],Abs,,,,,0,,,,,0,1572084,27044872192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1857138,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,1199.9999618858574,0.0,0.9999999682382145,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v4.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v4.json new file mode 100644 index 0000000..757f40e --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v4.json @@ -0,0 +1,101 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 618053225, + "overlapped_compute_time_non_pp_ns": 116900950, + "compute_only_time_non_pp_ns": 266872556, + "memory_only_time_non_pp_ns": 185470442, + "ici_bound_time_non_pp_ns": 48809277, + "total_execution_time_chip_ns": 618053225, + "overlapped_compute_time_chip_ns": 116900950, + "compute_only_time_chip_ns": 266872556, + 
"memory_only_time_chip_ns": 185470442, + "ici_bound_time_chip_ns": 48809277, + "bounded_by_pp_chip": false, + "total_execution_time_pod_ns": 618053225, + "compute_only_time_pod_ns": 266872556, + "memory_only_time_pod_ns": 185470442, + "ici_bound_time_pod_ns": 48809277, + "bounded_by_pp_dcn": false, + "total_execution_time_ns": 618053225, + "mem_footprint_GB": 46.296875, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "4", + "num_sa": 8, + "num_vu": 4, + "num_vu_ports": 4, + "hbm_bw_GBps": 1200.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 128, + "freq_GHz": 1.05, + "sa_dim": 128, + "hbm_size_GB": 32, + "ici_bw_GBps": 112.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 300.0, + "min_power_W": 121.0, + "avg_power_W": 170.0, + "max_power_W": 192.0, + "HBM_GBps_per_W": 65.0, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 366.0, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 1.222, + "static_power_W_per_vu": 0.427282, + "static_power_vmem_W": 21.777552, + "static_power_ici_W": 5.499, + "static_power_hbm_mc_W": 4.006409544, + "static_power_hbm_phy_W": 6.009614316, + "static_power_other_W": 41.22229614, + "dynamic_power_W_per_SA": 16.91648, + "dynamic_power_W_per_VU": 1.591296, + "dynamic_power_vmem_W": 30.110208, + "dynamic_power_ici_W_per_GBps": 0.01767315271, + "dynamic_power_hbm_W_per_GBps": 0.01538461538, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 2, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 2, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 3, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + 
"pipeline_parallel_degree_dcn": 1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v4.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": false, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v5p.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v5p.csv new file mode 100644 index 0000000..f2c7526 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v5p.csv @@ -0,0 +1,54 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency 
(GHz),Vmem time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +3,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,11302,6426,11302,0,0,0,0,0,0,0,0,0,6426,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,6426,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2354,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.937786586444877,2764.997345602548,0.5684921287573602,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+4,AllGatherMHA-4,AllGather(1x4096x2048->1x4096x4096),AllGatherMHA4AllGather,ICINoCompute,32,ICI/NVLink,167773,0,22604,167773,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,AllGatherMHA4AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,372.5271646808485,0.0,0.13472953514678065,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,Fwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1,MXU,32,Compute,154222,154222,22604,0,0,0,0,0,0,0,0,154222,25700,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,FwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,154222,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,27199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.5880272334686,405.259949942291,0.9769747594189139,0.14656779383084664,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,Fwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1,MXU,32,Compute,154222,154222,22604,0,0,0,0,0,0,0,0,154222,25700,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,FwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 
128]]",1,29360128,16384,1,4096,2048,4096,0,154222,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,27199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.5880272334686,405.259949942291,0.9769747594189139,0.14656779383084664,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,Fwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1,MXU,32,Compute,154222,154222,22604,0,0,0,0,0,0,0,0,154222,25700,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,FwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,29360128,16384,1,4096,2048,4096,0,154222,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,27199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.5880272334686,405.259949942291,0.9769747594189139,0.14656779383084664,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_1,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2,MXU,32,Memory,192134,154222,192134,0,0,0,0,0,0,0,0,154222,25700,0,0,570425344,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,4096,16)]",68719476736,FwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 4096, 1, 1], [1, 4096, 1, 1], [1, 4096, 4096, 
1]]",16,35651584,16384,16,4096,4096,128,0,154222,570425344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,38419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,357.6643214423267,2764.997345602548,0.7841974941816843,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"Softmax(x=1x4096x4096x16,memory_placements=1_1,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,32,Memory,361664,102802,361664,0,0,0,0,0,0,0,0,0,102802,0,0,1073741824,"DT_BFLOAT16:[1,4096,4096,16]","[DT_BFLOAT16:(1,4096,4096,16)]",1073741824,FwdAttentionencoderAttentionSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,102802,1073741824,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,49635,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9688932932224383,2764.997345602548,0.2842460643786801,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=1_0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2,MXU,32,Memory,192134,154222,192134,0,0,0,0,0,0,0,0,154222,25700,0,0,570425344,"DT_BFLOAT16:[1,4096,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,FwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 4096, 4096, 1], [1, 4096, 1, 128], [1, 4096, 1, 128]]",16,9699328,16384,16,4096,128,4096,0,154222,570425344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,38419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,357.6643214423267,2764.997345602548,0.7841974941816843,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+6,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"Add(a=1x4096x16x128,b=1x4096x16x128,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVattnavgAddBdLlMmhBdLlMmh,VPU,32,Memory,11302,804,11302,0,0,0,0,0,0,0,0,0,804,0,0,33554432,"DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",8388608,FwdAttentionencoderAttentionSoftmaxQKVattnavgAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,804,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,948,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7422233233056096,2764.997345602548,0.07106151609467003,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Fwd-Attention_encoder-Attention_output,"XlaEinsum(a=1x4096x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,32,Compute,154222,154222,22604,0,0,0,0,0,0,0,0,154222,25700,0,0,67108864,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",68719476736,FwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 128], [16, 128, 4096], [1, 4096, 4096]]",1,50331648,16384,1,4096,4096,2048,0,154222,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,27199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.5880272334686,405.259949942291,0.9769747594189139,0.14656779383084664,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Reduce attention and scatter before LN,"ReduceScatter(['1', '4096', '4096']->['1', '4096', 
'2048'])",ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter8,VPU,32,ICI/NVLink,83887,1366,11302,83887,16777216,16777216,0,0,0,0,0,0,1366,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter8,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1366,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1089,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.09999890328656408,372.52494427026835,0.009574037155959336,0.13472873210497952,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,Fwd-Attention_encoder-Attention_layernorm,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionlayernormYnormLayerNormy,VPU,32,Memory,22604,12851,22604,0,0,0,0,0,0,0,0,0,12851,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,FwdAttentionencoderAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,12851,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4708,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,5.937786586444877,2764.997345602548,0.5684921287573602,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,AllGatherMHA-10,AllGather(1x4096x2048->1x4096x4096),AllGatherMHA10AllGather,ICINoCompute,32,ICI/NVLink,167773,0,22604,167773,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,AllGatherMHA10AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,372.5271646808485,0.0,0.13472953514678065,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +11,All gather input of MLP before 
ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP11,ICINoCompute,32,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP11,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.003314444077893445,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +12,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,539728,539728,50859,0,0,0,0,0,0,0,0,539728,89951,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 2048], [2048, 7168], [1, 4096, 7168]]",1,81788928,57344,1,4096,7168,4096,0,539728,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93320,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.6284805976344,260.5479056117155,0.9770634555088177,0.0942307072736765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,539728,539728,50859,0,0,0,0,0,0,0,0,539728,89951,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 2048], [2048, 7168], [1, 4096, 
7168]]",1,81788928,57344,1,4096,7168,4096,0,539728,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93320,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.6284805976344,260.5479056117155,0.9770634555088177,0.0942307072736765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x7168,b=1x4096x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,39557,2812,39557,0,0,0,0,0,0,0,0,0,2812,0,0,117440512,"DT_BFLOAT16:[1,4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",29360128,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,2812,117440512,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3320,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7422233233056096,2764.997345602548,0.07106151609467003,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,539728,539728,50859,0,0,0,0,0,0,0,0,539728,89951,0,0,150994944,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 3584], [3584, 4096], [1, 4096, 4096]]",1,50331648,57344,1,4096,4096,7168,0,539728,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93320,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.6284805976344,260.5479056117155,0.9770634555088177,0.0942307072736765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +16,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '4096', '4096']->['1', '4096', 
'2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP16,VPU,32,ICI/NVLink,83887,1366,11302,83887,16777216,16777216,0,0,0,0,0,0,1366,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP16,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1366,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1089,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.09999890328656408,372.52494427026835,0.009574037155959336,0.13472873210497952,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x2048,b=1x4096x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,11302,804,11302,0,0,0,0,0,0,0,0,0,804,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",8388608,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,804,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,948,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7422233233056096,2764.997345602548,0.07106151609467003,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Bwd-FFN-encoder-FFdown,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFdowneinsumXGrad,MXU,32,Compute,539728,539728,50859,0,0,0,0,0,0,0,0,539728,89951,0,0,150994944,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,BwdFFNencoderFFdowneinsumXGrad,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 3584], [3584, 4096], [1, 4096, 
4096]]",1,50331648,57344,1,4096,4096,7168,0,539728,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93320,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.6284805976344,260.5479056117155,0.9770634555088177,0.0942307072736765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-FFN-encoder-FFdown,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFdowneinsumYGrad,MXU,32,Compute,539728,539728,50859,0,0,0,0,0,0,0,0,539728,89951,0,0,150994944,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,BwdFFNencoderFFdowneinsumYGrad,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 3584], [3584, 4096], [1, 4096, 4096]]",1,50331648,57344,1,4096,4096,7168,0,539728,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93320,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.6284805976344,260.5479056117155,0.9770634555088177,0.0942307072736765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Bwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x7168,b=1x4096x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,39557,2812,39557,0,0,0,0,0,0,0,0,0,2812,0,0,117440512,"DT_BFLOAT16:[1,4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",29360128,BwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,2812,117440512,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3320,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7422233233056096,2764.997345602548,0.07106151609467003,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+20,Bwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFgateeinsumXGrad,MXU,32,Compute,539728,539728,50859,0,0,0,0,0,0,0,0,539728,89951,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,BwdFFNencoderFFgateeinsumXGrad,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 2048], [2048, 7168], [1, 4096, 7168]]",1,81788928,57344,1,4096,7168,4096,0,539728,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93320,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.6284805976344,260.5479056117155,0.9770634555088177,0.0942307072736765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Bwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFgateeinsumYGrad,MXU,32,Compute,539728,539728,50859,0,0,0,0,0,0,0,0,539728,89951,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,BwdFFNencoderFFgateeinsumYGrad,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 2048], [2048, 7168], [1, 4096, 7168]]",1,81788928,57344,1,4096,7168,4096,0,539728,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93320,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.6284805976344,260.5479056117155,0.9770634555088177,0.0942307072736765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFupeinsumXGrad,MXU,32,Compute,539728,539728,50859,0,0,0,0,0,0,0,0,539728,89951,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,BwdFFNencoderFFupeinsumXGrad,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 2048], [2048, 7168], [1, 4096, 
7168]]",1,81788928,57344,1,4096,7168,4096,0,539728,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93320,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.6284805976344,260.5479056117155,0.9770634555088177,0.0942307072736765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Bwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFupeinsumYGrad,MXU,32,Compute,539728,539728,50859,0,0,0,0,0,0,0,0,539728,89951,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,BwdFFNencoderFFupeinsumYGrad,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 2048], [2048, 7168], [1, 4096, 7168]]",1,81788928,57344,1,4096,7168,4096,0,539728,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,93320,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.6284805976344,260.5479056117155,0.9770634555088177,0.0942307072736765,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-FFN-encoder-,AllReduce(1x4096x4096->1x4096x4096),BwdFFNencoderAllReduce,VPU,1,ICI/NVLink,335545,0,45208,335545,67108864,67108864,0,0,0,0,0,0,0,0,0,134217728,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",0,BwdFFNencoderAllReduce,AllReduce,0,[],AllReduce,,,,,0,,,,,0,0,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2991,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,372.5282748960646,0.0,0.13472993667127112,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,All reduce results of MLP gradient 
calculation,AllReduce(1x4096x4096->1x4096x4096),AllreduceresultsofMLPgradientcalculationAllReduceMLP19,VPU,32,ICI/NVLink,335545,0,45208,335545,67108864,67108864,0,0,0,0,0,0,0,0,0,134217728,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",0,AllreduceresultsofMLPgradientcalculationAllReduceMLP19,AllReduce,0,[],AllReduce,,,,,0,,,,,0,0,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2991,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,372.5282748960646,0.0,0.13472993667127112,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Softmax,"Softmax(x=1x4096x2048,memory_placements=1_1,type=DT_BFLOAT16)",SoftmaxSoftmaxBackprop,VPU,1,Memory,11302,3213,11302,0,0,0,0,0,0,0,0,0,3213,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",33554432,SoftmaxSoftmaxBackprop,Softmax,0,[],Softmax,,,,,0,,,,,0,3213,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1551,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9688932932224383,2764.997345602548,0.2842460643786801,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,softmax-bwd-dl/dy-20,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLV;BLV->BV,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwddldy20MatMuldldy,MXU,1,Compute,12850,12850,11304,0,0,0,0,0,0,0,0,0,12850,0,0,33558528,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,2048)]",16777216,softmaxbwddldy20MatMuldldy,Einsum,16777216,[],Einsum,"BLV,BLV->BV","[[1, 4096, 2048], [1, 4096, 2048], [1, 2048]]",1,4098,65536,2048,1,1,4096,0,12850,33558528,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,748,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.3056199221789884,2432.20347838643,0.0028626391002535214,0.8796395943531393,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+20,softmax-bwd-e1-20,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde120MatMule1,MXU,1,Compute,12850,12850,11305,0,0,0,0,0,0,0,0,0,12850,0,0,33562624,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096)]",16777216,softmaxbwde120MatMule1,Einsum,16777216,[],Einsum,"BLN,BLN->BL","[[1, 4096, 2048], [1, 4096, 2048], [1, 4096]]",1,4098,65536,4096,1,1,2048,0,12850,33562624,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,748,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.3056199221789884,2432.5003419868676,0.0028626391002535214,0.8797469591272578,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,softmax-bwd-e2-20,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde220MatMule2,MXU,1,Compute,12850,12850,11305,0,0,0,0,0,0,0,0,0,12850,0,0,33562624,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096)]",16777216,softmaxbwde220MatMule2,Einsum,16777216,[],Einsum,"BLN,BLN->BL","[[1, 4096, 2048], [1, 4096, 2048], [1, 4096]]",1,4098,65536,4096,1,1,2048,0,12850,33562624,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,748,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.3056199221789884,2432.5003419868676,0.0028626391002535214,0.8797469591272578,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Bwd-Attention_encoder-Attention_output,"XlaEinsum(a=1x4096x16x128,b=16x128x2048,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1XGrad,MXU,32,Compute,77120,77120,14128,0,0,0,0,0,0,0,0,77120,12850,0,0,41943040,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,2048]","[DT_BFLOAT16:(1,4096,2048)]",34359738368,BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1XGrad,Einsum,8388608,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 128], [16, 128, 2048], [1, 4096, 
2048]]",1,29360128,8192,1,4096,2048,2048,0,77120,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13788,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.5360265560166,506.5158195020747,0.9768607452483387,0.18318836148357132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,Bwd-Attention_encoder-Attention_output,"XlaEinsum(a=1x4096x16x128,b=16x128x2048,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1YGrad,MXU,32,Compute,77120,77120,14128,0,0,0,0,0,0,0,0,77120,12850,0,0,41943040,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,2048]","[DT_BFLOAT16:(1,4096,2048)]",34359738368,BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1YGrad,Einsum,8388608,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 128], [16, 128, 2048], [1, 4096, 2048]]",1,29360128,8192,1,4096,2048,2048,0,77120,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13788,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.5360265560166,506.5158195020747,0.9768607452483387,0.18318836148357132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=1_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2XGrad,MXU,32,Memory,192134,154222,192134,0,0,0,0,0,0,0,0,154222,25700,0,0,570425344,"DT_BFLOAT16:[1,4096,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2XGrad,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 4096, 4096, 1], [1, 4096, 1, 128], [1, 4096, 1, 
128]]",16,9699328,16384,16,4096,128,4096,0,154222,570425344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,38419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,357.6643214423267,2764.997345602548,0.7841974941816843,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=1_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2YGrad,MXU,32,Memory,192134,154222,192134,0,0,0,0,0,0,0,0,154222,25700,0,0,570425344,"DT_BFLOAT16:[1,4096,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2YGrad,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 4096, 4096, 1], [1, 4096, 1, 128], [1, 4096, 1, 128]]",16,9699328,16384,16,4096,128,4096,0,154222,570425344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,38419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,357.6643214423267,2764.997345602548,0.7841974941816843,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"Pointwise Mul.(x=1x4096x4096x16,memory_placements=1_1,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVSoftmaxBackprop,VPU,32,Memory,361664,25701,361664,0,0,0,0,0,0,0,0,0,25701,0,0,1073741824,"DT_BFLOAT16:[1,4096,4096,16]","[DT_BFLOAT16:(1,4096,4096,16)]",268435456,BwdAttentionencoderAttentionSoftmaxQKVSoftmaxBackprop,Pointwise Mul.,0,[],Pointwise Mul.,,,,,0,,,,,0,25701,1073741824,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,30360,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.7422233233056096,2764.997345602548,0.07106151609467003,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+23,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_1,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2XGrad,MXU,32,Memory,192134,154222,192134,0,0,0,0,0,0,0,0,154222,25700,0,0,570425344,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,4096,16)]",68719476736,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2XGrad,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 4096, 1, 1], [1, 4096, 1, 1], [1, 4096, 4096, 1]]",16,35651584,16384,16,4096,4096,128,0,154222,570425344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,38419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,357.6643214423267,2764.997345602548,0.7841974941816843,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_1,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2YGrad,MXU,32,Memory,192134,154222,192134,0,0,0,0,0,0,0,0,154222,25700,0,0,570425344,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,4096,16)]",68719476736,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2YGrad,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 4096, 1, 1], [1, 4096, 1, 1], [1, 4096, 4096, 1]]",16,35651584,16384,16,4096,4096,128,0,154222,570425344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,38419,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,357.6643214423267,2764.997345602548,0.7841974941816843,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+23,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1XGrad,MXU,32,Compute,77120,77120,14128,0,0,0,0,0,0,0,0,77120,12850,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1XGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 2048], [2048, 16, 128], [1, 4096, 16, 128]]",1,29360128,8192,1,4096,2048,2048,0,77120,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13788,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.5360265560166,506.5158195020747,0.9768607452483387,0.18318836148357132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1YGrad,MXU,32,Compute,77120,77120,14128,0,0,0,0,0,0,0,0,77120,12850,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1YGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 2048], [2048, 16, 128], [1, 4096, 16, 128]]",1,29360128,8192,1,4096,2048,2048,0,77120,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13788,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.5360265560166,506.5158195020747,0.9768607452483387,0.18318836148357132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+23,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1XGrad,MXU,32,Compute,77120,77120,14128,0,0,0,0,0,0,0,0,77120,12850,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1XGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 2048], [2048, 16, 128], [1, 4096, 16, 128]]",1,29360128,8192,1,4096,2048,2048,0,77120,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13788,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.5360265560166,506.5158195020747,0.9768607452483387,0.18318836148357132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1YGrad,MXU,32,Compute,77120,77120,14128,0,0,0,0,0,0,0,0,77120,12850,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1YGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 2048], [2048, 16, 128], [1, 4096, 16, 128]]",1,29360128,8192,1,4096,2048,2048,0,77120,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13788,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.5360265560166,506.5158195020747,0.9768607452483387,0.18318836148357132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+23,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1XGrad,MXU,32,Compute,77120,77120,14128,0,0,0,0,0,0,0,0,77120,12850,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1XGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 2048], [2048, 16, 128], [1, 4096, 16, 128]]",1,29360128,8192,1,4096,2048,2048,0,77120,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13788,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.5360265560166,506.5158195020747,0.9768607452483387,0.18318836148357132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1YGrad,MXU,32,Compute,77120,77120,14128,0,0,0,0,0,0,0,0,77120,12850,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1YGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 2048], [2048, 16, 128], [1, 4096, 16, 128]]",1,29360128,8192,1,4096,2048,2048,0,77120,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,13788,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,445.5360265560166,506.5158195020747,0.9768607452483387,0.18318836148357132,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+25,Bwd-Attention_encoder-,AllReduce(1x4096x2048->1x4096x2048),BwdAttentionencoderAllReduce,VPU,1,ICI/NVLink,167773,0,22604,167773,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",0,BwdAttentionencoderAllReduce,AllReduce,0,[],AllReduce,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1495,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,372.5271646808485,0.0,0.13472953514678065,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,Softmax,"Softmax(x=1x4096x2048,memory_placements=1_1,type=DT_BFLOAT16)",SoftmaxSoftmaxBackprop,VPU,1,Memory,11302,3213,11302,0,0,0,0,0,0,0,0,0,3213,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",33554432,SoftmaxSoftmaxBackprop,Softmax,0,[],Softmax,,,,,0,,,,,0,3213,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1551,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.9688932932224383,2764.997345602548,0.2842460643786801,0.9999990400009215,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,softmax-bwd-dl/dy-26,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLV;BLV->BV,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwddldy26MatMuldldy,MXU,1,Compute,12850,12850,11304,0,0,0,0,0,0,0,0,0,12850,0,0,33558528,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,2048)]",16777216,softmaxbwddldy26MatMuldldy,Einsum,16777216,[],Einsum,"BLV,BLV->BV","[[1, 4096, 2048], [1, 4096, 2048], [1, 2048]]",1,4098,65536,2048,1,1,4096,0,12850,33558528,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,748,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.3056199221789884,2432.20347838643,0.0028626391002535214,0.8796395943531393,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+26,softmax-bwd-e1-26,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde126MatMule1,MXU,1,Compute,12850,12850,11305,0,0,0,0,0,0,0,0,0,12850,0,0,33562624,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096)]",16777216,softmaxbwde126MatMule1,Einsum,16777216,[],Einsum,"BLN,BLN->BL","[[1, 4096, 2048], [1, 4096, 2048], [1, 4096]]",1,4098,65536,4096,1,1,2048,0,12850,33562624,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,748,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.3056199221789884,2432.5003419868676,0.0028626391002535214,0.8797469591272578,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,softmax-bwd-e2-26,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde226MatMule2,MXU,1,Compute,12850,12850,11305,0,0,0,0,0,0,0,0,0,12850,0,0,33562624,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096)]",16777216,softmaxbwde226MatMule2,Einsum,16777216,[],Einsum,"BLN,BLN->BL","[[1, 4096, 2048], [1, 4096, 2048], [1, 4096]]",1,4098,65536,4096,1,1,2048,0,12850,33562624,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,748,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.3056199221789884,2432.5003419868676,0.0028626391002535214,0.8797469591272578,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,WeightUpdateOptimizerStates: Load optimizer states from 
HBM,"Abs(x=27044872192,type=DT_BFLOAT16)",WeightUpdateOptimizerStatesLoadoptimizerstatesfromHBMinput,VPU,1,Memory,36437614,2589315,36437614,0,0,0,0,0,0,0,0,0,2589315,0,0,108179488768,DT_BFLOAT16:[27044872192],[DT_BFLOAT16:(27044872192)],0,WeightUpdateOptimizerStatesLoadoptimizerstatesfromHBMinput,Input,0,[],Abs,,,,,0,,,,,0,2589315,108179488768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3058815,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,2764.999925626305,0.0,0.9999999731017378,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,WeightUpdateWriteHBM,"Abs(x=6761218048,type=DT_BFLOAT16)",WeightUpdateWriteHBMoutput,VPU,1,Memory,9109404,647329,9109404,0,0,0,0,0,0,0,0,0,647329,0,0,27044872192,DT_BFLOAT16:[6761218048],[DT_BFLOAT16:(6761218048)],0,WeightUpdateWriteHBMoutput,Output,0,[],Abs,,,,,0,,,,,0,647329,27044872192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,764704,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,2764.999773860068,0.0,0.9999999182134063,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v5p.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v5p.json new file mode 100644 index 0000000..0d82e5e --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v5p.json @@ -0,0 +1,101 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 332401272, + "overlapped_compute_time_non_pp_ns": 59126802, + "compute_only_time_non_pp_ns": 173776600, + "memory_only_time_non_pp_ns": 72131736, + "ici_bound_time_non_pp_ns": 27366134, + "total_execution_time_chip_ns": 332401272, + "overlapped_compute_time_chip_ns": 59126802, + "compute_only_time_chip_ns": 173776600, + "memory_only_time_chip_ns": 
72131736, + "ici_bound_time_chip_ns": 27366134, + "bounded_by_pp_chip": false, + "total_execution_time_pod_ns": 332401272, + "compute_only_time_pod_ns": 173776600, + "memory_only_time_pod_ns": 72131736, + "ici_bound_time_pod_ns": 27366134, + "bounded_by_pp_dcn": false, + "total_execution_time_ns": 332401272, + "mem_footprint_GB": 46.296875, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "5p", + "num_sa": 8, + "num_vu": 6, + "num_vu_ports": 6, + "hbm_bw_GBps": 2765.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 128, + "freq_GHz": 1.7, + "sa_dim": 128, + "hbm_size_GB": 95, + "ici_bw_GBps": 200.0, + "dcn_bw_GBps": 25.0, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 350.0, + "min_power_W": 1.0, + "avg_power_W": 1.0, + "max_power_W": 331.0, + "HBM_GBps_per_W": 123.5, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 585.0, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 1.35868996, + "static_power_W_per_vu": 0.475076728, + "static_power_vmem_W": 24.21353615, + "static_power_ici_W": 6.114104803, + "static_power_hbm_mc_W": 10.264041296, + "static_power_hbm_phy_W": 15.396061944, + "static_power_other_W": 44.82811018, + "dynamic_power_W_per_SA": 28.19413333, + "dynamic_power_W_per_VU": 2.65216, + "dynamic_power_vmem_W": 50.18368, + "dynamic_power_ici_W_per_GBps": 0.01767315271, + "dynamic_power_hbm_W_per_GBps": 0.01261538462, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 2, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 2, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 3, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 1, + 
"microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v5p.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": false, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v6p.csv b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v6p.csv new file mode 100644 index 0000000..466d655 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v6p.csv @@ -0,0 +1,54 @@ +Fusion index,Description,Config,Name,OpType,Count,Bounded-by,Execution time,Compute time,Memory time,ICI/NVLink time,ICI/NVLink outbound traffic,ICI/NVLink inbound traffic,Aggregated DCN time,DCN 0 time,PCIe time,Temporary memory size,Persistent memory size,MXU time,VPU time,Transpose time,Permute time,Bytes accessed,Input Tensor Shapes,Output Tensor Shapes,FLOP Count,Op Name,Op Code,Weight Size,Output Shape,parsed_op_type,dim_labels,tile_shapes,num_tiles,max_vmem_demand_bytes,num_mxu_ops,einsum_B_size,einsum_M_size,einsum_N_size,einsum_K_size,vu_softmax_time_ns,Compute Time (ns),Bytes Accessed,DVFS SA Policy,DVFS SA Voltage (V),DVFS SA Frequency (GHz),DVFS VU Policy,DVFS VU Voltage (V),DVFS VU Frequency (GHz),DVFS SRAM Policy,DVFS SRAM Voltage (V),DVFS SRAM Frequency (GHz),DVFS HBM Policy,DVFS HBM Voltage (V),DVFS HBM Frequency (GHz),DVFS ICI Policy,DVFS ICI Voltage (V),DVFS ICI Frequency (GHz),Vmem 
time,static_energy_sa_J,static_energy_vu_J,static_energy_sram_J,static_energy_hbm_J,static_energy_ici_J,static_energy_other_J,dynamic_energy_sa_J,dynamic_energy_vu_J,dynamic_energy_sram_J,dynamic_energy_hbm_J,dynamic_energy_ici_J,dynamic_energy_other_J,static_energy_J,dynamic_energy_J,total_energy_J,static_power_W,dynamic_power_W,total_power_W,num_setpm_sa,num_setpm_vu,num_setpm_sram,num_setpm_hbm,num_setpm_ici,tflops_per_sec,hbm_bw_GBps,flops_util,hbm_bw_util,DVFS SA Scaling Time (ns),DVFS SA Power Efficiency (%),DVFS VU Scaling Time (ns),DVFS VU Power Efficiency (%),DVFS SRAM Scaling Time (ns),DVFS SRAM Power Efficiency (%),DVFS HBM Scaling Time (ns),DVFS HBM Power Efficiency (%),DVFS ICI Scaling Time (ns),DVFS ICI Power Efficiency (%),DVFS SA Activity Factor,DVFS VU Activity Factor,DVFS SRAM Activity Factor,DVFS HBM Activity Factor,DVFS ICI Activity Factor +3,Fwd-Attention-encoder-Input_layernorm,"LayerNorm(x=1x4096x2048,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderInputlayernormXnormLayerNormX,VPU,32,Memory,4223,4096,4223,0,0,0,0,0,0,0,0,0,4096,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",67108864,FwdAttentionencoderInputlayernormXnormLayerNormX,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,4096,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1500,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.891277291025338,7399.952640303102,0.9699265924698082,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+4,AllGatherMHA-4,AllGather(1x4096x2048->1x4096x4096),AllGatherMHA4AllGather,ICINoCompute,32,ICI/NVLink,111849,0,8446,111849,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,AllGatherMHA4AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,558.7890817083747,0.0,0.07551203806869929,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,Fwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1,MXU,32,Compute,32800,32800,8446,0,0,0,0,0,0,0,0,32800,4096,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,FwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,41943040,2048,1,4096,2048,4096,0,32800,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9153,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2095.1059980487803,1905.4878048780488,0.9912800151257325,0.25749835201054716,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,Fwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1,MXU,32,Compute,32800,32800,8446,0,0,0,0,0,0,0,0,32800,4096,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,FwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 
128]]",1,41943040,2048,1,4096,2048,4096,0,32800,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9153,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2095.1059980487803,1905.4878048780488,0.9912800151257325,0.25749835201054716,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +5,Fwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x4096,b=4096x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1,MXU,32,Compute,32800,32800,8446,0,0,0,0,0,0,0,0,32800,4096,0,0,67108864,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,FwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1,Einsum,16777216,[],Einsum,"BLM,MND->BLND","[[1, 4096, 4096], [4096, 16, 128], [1, 4096, 16, 128]]",1,41943040,2048,1,4096,2048,4096,0,32800,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9153,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2095.1059980487803,1905.4878048780488,0.9912800151257325,0.25749835201054716,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_1,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2,MXU,32,Memory,71791,65568,71791,0,0,0,0,0,0,0,0,65568,8192,0,0,570425344,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,4096,16)]",68719476736,FwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 4096, 1, 1], [1, 4096, 1, 1], [1, 4096, 4096, 
1]]",16,35651584,4096,16,4096,4096,128,0,65568,570425344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,24498,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,957.2157615299967,7399.952640303102,0.4528977796119852,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"Softmax(x=1x4096x4096x16,memory_placements=1_1,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVattnWeightsSoftmaxattnWeights,VPU,32,Memory,135136,65536,135136,0,0,0,0,0,0,0,0,0,65536,0,0,1073741824,"DT_BFLOAT16:[1,4096,4096,16]","[DT_BFLOAT16:(1,4096,4096,16)]",1073741824,FwdAttentionencoderAttentionSoftmaxQKVattnWeightsSoftmaxattnWeights,Softmax,0,[],Softmax,,,,,0,,,,,0,65536,1073741824,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,31642,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.945638645512669,7399.952640303102,0.4849632962349041,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +6,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=1_0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2,MXU,32,Memory,71791,65568,71791,0,0,0,0,0,0,0,0,65568,8192,0,0,570425344,"DT_BFLOAT16:[1,4096,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,FwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 4096, 4096, 1], [1, 4096, 1, 128], [1, 4096, 1, 128]]",16,18350080,4096,16,4096,128,4096,0,65568,570425344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,24498,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,957.2157615299967,7399.952640303102,0.4528977796119852,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+6,Fwd-Attention_encoder-Attention_Softmax(Q*K)*V,"Add(a=1x4096x16x128,b=1x4096x16x128,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionSoftmaxQKVattnavgAddBdLlMmhBdLlMmh,VPU,32,Memory,4223,512,4223,0,0,0,0,0,0,0,0,0,512,0,0,33554432,"DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",8388608,FwdAttentionencoderAttentionSoftmaxQKVattnavgAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,512,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,604,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9864096613781672,7399.952640303102,0.12124082405872602,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +7,Fwd-Attention_encoder-Attention_output,"XlaEinsum(a=1x4096x16x128,b=16x128x4096,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1,MXU,32,Compute,32800,32800,8446,0,0,0,0,0,0,0,0,32800,4096,0,0,67108864,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,4096]","[DT_BFLOAT16:(1,4096,4096)]",68719476736,FwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1,Einsum,16777216,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 128], [16, 128, 4096], [1, 4096, 4096]]",1,67108864,2048,1,4096,4096,2048,0,32800,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,9153,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2095.1059980487803,1905.4878048780488,0.9912800151257325,0.25749835201054716,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +8,Reduce attention and scatter before LN,"ReduceScatter(['1', '4096', '4096']->['1', '4096', 
'2048'])",ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter8,VPU,32,ICI/NVLink,55925,1024,4223,55925,16777216,16777216,0,0,0,0,0,0,1024,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceattentionandscatterbeforeLNAllReduceMHAReduceScatter8,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1024,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,732,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.14999746088511398,558.7840858292356,0.009155118462226195,0.0755113629498967,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +9,Fwd-Attention_encoder-Attention_layernorm,"LayerNorm(x=1x4096x4096,memory_placements=0_0,type=DT_BFLOAT16)",FwdAttentionencoderAttentionlayernormYnormLayerNormy,VPU,32,Memory,8446,8192,8446,0,0,0,0,0,0,0,0,0,8192,0,0,67108864,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",134217728,FwdAttentionencoderAttentionlayernormYnormLayerNormy,LayerNorm,0,[],LayerNorm,,,,,0,,,,,0,8192,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,3001,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,15.891277291025338,7399.952640303102,0.9699265924698082,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +10,AllGatherMHA-10,AllGather(1x4096x2048->1x4096x4096),AllGatherMHA10AllGather,ICINoCompute,32,ICI/NVLink,111849,0,8446,111849,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,4096)]",0,AllGatherMHA10AllGather,AllGather,0,[],AllGather,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,558.7890817083747,0.0,0.07551203806869929,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +11,All gather input of MLP before 
ffn,AllGather(1x4096->1x8192),AllgatherinputofMLPbeforeffnAllGatherMLP11,ICINoCompute,32,ICI/NVLink,3330,0,500,3330,16384,16384,0,0,0,0,0,0,0,0,0,32768,"DT_BFLOAT16:[1,4096]","[DT_BFLOAT16:(1,8192)]",0,AllgatherinputofMLPbeforeffnAllGatherMLP11,AllGather,0,[],AllGather,,,,,0,,,,,0,0,32768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,9.164437875375375,0.0,0.001238437550726402,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +12,Fwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,MXU,32,Compute,114720,114720,19004,0,0,0,0,0,0,0,0,114720,14336,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFgateMatMulhgate2ynorm2WFFgate1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 4096], [4096, 7168], [1, 4096, 7168]]",1,104857600,7168,1,4096,7168,4096,0,114720,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,30825,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.567020362622,1225.810669456067,0.9919712843134074,0.16565009046703608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +13,Fwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,MXU,32,Compute,114720,114720,19004,0,0,0,0,0,0,0,0,114720,14336,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,FwdFFNencoderFFupMatMulhup2ynorm2WFFup1,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 4096], [4096, 7168], [1, 4096, 
7168]]",1,104857600,7168,1,4096,7168,4096,0,114720,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,30825,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.567020362622,1225.810669456067,0.9919712843134074,0.16565009046703608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +14,Fwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x7168,b=1x4096x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,14781,1792,14781,0,0,0,0,0,0,0,0,0,1792,0,0,117440512,"DT_BFLOAT16:[1,4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",29360128,FwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,1792,117440512,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2116,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9863424666801974,7399.702320546648,0.12123672281983626,0.9999597730468442,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +15,Fwd-FFN-encoder-FFoutput,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,MXU,32,Compute,114720,114720,19004,0,0,0,0,0,0,0,0,114720,14336,0,0,150994944,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,FwdFFNencoderFFoutputMatMulffdown2hgateup2WFFdown1,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 7168], [7168, 4096], [1, 4096, 4096]]",1,67108864,7168,1,4096,4096,7168,0,114720,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,30825,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.567020362622,1225.810669456067,0.9919712843134074,0.16565009046703608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +16,Reduce and scatter results of MLP before residual,"ReduceScatter(['1', '4096', '4096']->['1', '4096', 
'2048'])",ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP16,VPU,32,ICI/NVLink,55925,1024,4223,55925,16777216,16777216,0,0,0,0,0,0,1024,0,0,33554432,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,2048)]",8388608,ReduceandscatterresultsofMLPbeforeresidualReduceScatterMLP16,ReduceScatter,0,[],ReduceScatter,,,,,0,,,,,0,1024,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,732,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.14999746088511398,558.7840858292356,0.009155118462226195,0.0755113629498967,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +17,Fwd-FFN-encoder-AttnPlusFFn,"Add(a=1x4096x2048,b=1x4096x2048,memory_placements=0_0_0,type=DT_BFLOAT16)",FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,VPU,32,Memory,4223,512,4223,0,0,0,0,0,0,0,0,0,512,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",8388608,FwdFFNencoderAttnPlusFFnattnPlusFFnAddBdLlMmhBdLlMmh,Add,0,[],Add,,,,,0,,,,,0,512,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,604,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9864096613781672,7399.952640303102,0.12124082405872602,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Bwd-FFN-encoder-FFdown,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFdowneinsumXGrad,MXU,32,Compute,114720,114720,19004,0,0,0,0,0,0,0,0,114720,14336,0,0,150994944,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,BwdFFNencoderFFdowneinsumXGrad,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 7168], [7168, 4096], [1, 4096, 
4096]]",1,67108864,7168,1,4096,4096,7168,0,114720,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,30825,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.567020362622,1225.810669456067,0.9919712843134074,0.16565009046703608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-FFN-encoder-FFdown,"XlaEinsum(a=1x4096x7168,b=7168x4096,eq=BLH;HM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFdowneinsumYGrad,MXU,32,Compute,114720,114720,19004,0,0,0,0,0,0,0,0,114720,14336,0,0,150994944,"DT_BFLOAT16:[1,4096,7168],DT_BFLOAT16:[7168,4096]","[DT_BFLOAT16:(1,4096,4096)]",240518168576,BwdFFNencoderFFdowneinsumYGrad,Einsum,58720256,[],Einsum,"BLH,HM->BLM","[[1, 4096, 7168], [7168, 4096], [1, 4096, 4096]]",1,67108864,7168,1,4096,4096,7168,0,114720,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,30825,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.567020362622,1225.810669456067,0.9919712843134074,0.16565009046703608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,Bwd-FFN-encoder-FFgate_up,"Mul(a=1x4096x7168,b=1x4096x7168,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFgateuphgateup2hgate2hup1,VPU,32,Memory,14781,1792,14781,0,0,0,0,0,0,0,0,0,1792,0,0,117440512,"DT_BFLOAT16:[1,4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",29360128,BwdFFNencoderFFgateuphgateup2hgate2hup1,Mul,0,[],Mul,,,,,0,,,,,0,1792,117440512,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,2116,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9863424666801974,7399.702320546648,0.12123672281983626,0.9999597730468442,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+20,Bwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFgateeinsumXGrad,MXU,32,Compute,114720,114720,19004,0,0,0,0,0,0,0,0,114720,14336,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,BwdFFNencoderFFgateeinsumXGrad,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 4096], [4096, 7168], [1, 4096, 7168]]",1,104857600,7168,1,4096,7168,4096,0,114720,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,30825,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.567020362622,1225.810669456067,0.9919712843134074,0.16565009046703608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Bwd-FFN-encoder-FFgate,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFgateeinsumYGrad,MXU,32,Compute,114720,114720,19004,0,0,0,0,0,0,0,0,114720,14336,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,BwdFFNencoderFFgateeinsumYGrad,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 4096], [4096, 7168], [1, 4096, 7168]]",1,104857600,7168,1,4096,7168,4096,0,114720,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,30825,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.567020362622,1225.810669456067,0.9919712843134074,0.16565009046703608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFupeinsumXGrad,MXU,32,Compute,114720,114720,19004,0,0,0,0,0,0,0,0,114720,14336,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,BwdFFNencoderFFupeinsumXGrad,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 4096], [4096, 7168], [1, 4096, 
7168]]",1,104857600,7168,1,4096,7168,4096,0,114720,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,30825,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.567020362622,1225.810669456067,0.9919712843134074,0.16565009046703608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Bwd-FFN-encoder-FFup,"XlaEinsum(a=1x4096x4096,b=4096x7168,eq=BLM;MH->BLH,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdFFNencoderFFupeinsumYGrad,MXU,32,Compute,114720,114720,19004,0,0,0,0,0,0,0,0,114720,14336,0,0,150994944,"DT_BFLOAT16:[1,4096,4096],DT_BFLOAT16:[4096,7168]","[DT_BFLOAT16:(1,4096,7168)]",240518168576,BwdFFNencoderFFupeinsumYGrad,Einsum,58720256,[],Einsum,"BLM,MH->BLH","[[1, 4096, 4096], [4096, 7168], [1, 4096, 7168]]",1,104857600,7168,1,4096,7168,4096,0,114720,150994944,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,30825,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2096.567020362622,1225.810669456067,0.9919712843134074,0.16565009046703608,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Bwd-FFN-encoder-,AllReduce(1x4096x4096->1x4096x4096),BwdFFNencoderAllReduce,VPU,1,ICI/NVLink,223697,0,16892,223697,67108864,67108864,0,0,0,0,0,0,0,0,0,134217728,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",0,BwdFFNencoderAllReduce,AllReduce,0,[],AllReduce,,,,,0,,,,,0,0,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1907,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,558.7915796814441,0.0,0.07551237563262758,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +19,All reduce results of MLP gradient 
calculation,AllReduce(1x4096x4096->1x4096x4096),AllreduceresultsofMLPgradientcalculationAllReduceMLP19,VPU,32,ICI/NVLink,223697,0,16892,223697,67108864,67108864,0,0,0,0,0,0,0,0,0,134217728,"DT_BFLOAT16:[1,4096,4096]","[DT_BFLOAT16:(1,4096,4096)]",0,AllreduceresultsofMLPgradientcalculationAllReduceMLP19,AllReduce,0,[],AllReduce,,,,,0,,,,,0,0,134217728,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1907,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,558.7915796814441,0.0,0.07551237563262758,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,Softmax,"Softmax(x=1x4096x2048,memory_placements=1_1,type=DT_BFLOAT16)",SoftmaxSoftmaxBackprop,VPU,1,Memory,4223,2048,4223,0,0,0,0,0,0,0,0,0,2048,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",33554432,SoftmaxSoftmaxBackprop,Softmax,0,[],Softmax,,,,,0,,,,,0,2048,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,988,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.945638645512669,7399.952640303102,0.4849632962349041,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,softmax-bwd-dl/dy-20,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLV;BLV->BV,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwddldy20MatMuldldy,MXU,1,Compute,8192,8192,4224,0,0,0,0,0,0,0,0,0,8192,0,0,33558528,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,2048)]",16777216,softmaxbwddldy20MatMuldldy,Einsum,16777216,[],Einsum,"BLV,BLV->BV","[[1, 4096, 2048], [1, 4096, 2048], [1, 2048]]",1,8194,32768,2048,1,1,4096,0,8192,33558528,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,476,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.048,3815.1629269123077,0.0009689922480620155,0.5155625576908524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+20,softmax-bwd-e1-20,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde120MatMule1,MXU,1,Compute,8192,8192,4225,0,0,0,0,0,0,0,0,0,8192,0,0,33562624,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096)]",16777216,softmaxbwde120MatMule1,Einsum,16777216,[],Einsum,"BLN,BLN->BL","[[1, 4096, 2048], [1, 4096, 2048], [1, 4096]]",1,8194,32768,4096,1,1,2048,0,8192,33562624,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.048,3815.6285881996155,0.0009689922480620155,0.5156254848918399,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +20,softmax-bwd-e2-20,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde220MatMule2,MXU,1,Compute,8192,8192,4225,0,0,0,0,0,0,0,0,0,8192,0,0,33562624,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096)]",16777216,softmaxbwde220MatMule2,Einsum,16777216,[],Einsum,"BLN,BLN->BL","[[1, 4096, 2048], [1, 4096, 2048], [1, 4096]]",1,8194,32768,4096,1,1,2048,0,8192,33562624,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.048,3815.6285881996155,0.0009689922480620155,0.5156254848918399,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +21,Bwd-Attention_encoder-Attention_output,"XlaEinsum(a=1x4096x16x128,b=16x128x2048,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1XGrad,MXU,32,Compute,16416,16416,5279,0,0,0,0,0,0,0,0,16416,2048,0,0,41943040,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,2048]","[DT_BFLOAT16:(1,4096,2048)]",34359738368,BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1XGrad,Einsum,8388608,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 128], [16, 128, 2048], [1, 4096, 
2048]]",1,41943040,1024,1,4096,2048,2048,0,16416,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2093.063984405458,2379.5382553606237,0.9903138552669356,0.3215592236973816,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +22,Bwd-Attention_encoder-Attention_output,"XlaEinsum(a=1x4096x16x128,b=16x128x2048,eq=BLND;NDM->BLM,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1YGrad,MXU,32,Compute,16416,16416,5279,0,0,0,0,0,0,0,0,16416,2048,0,0,41943040,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[16,128,2048]","[DT_BFLOAT16:(1,4096,2048)]",34359738368,BwdAttentionencoderAttentionoutputMatMulattnOutput2attnAvg2Wo1YGrad,Einsum,8388608,[],Einsum,"BLND,NDM->BLM","[[1, 4096, 16, 128], [16, 128, 2048], [1, 4096, 2048]]",1,41943040,1024,1,4096,2048,2048,0,16416,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2093.063984405458,2379.5382553606237,0.9903138552669356,0.3215592236973816,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=1_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2XGrad,MXU,32,Memory,71791,65568,71791,0,0,0,0,0,0,0,0,65568,8192,0,0,570425344,"DT_BFLOAT16:[1,4096,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2XGrad,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 4096, 4096, 1], [1, 4096, 1, 128], [1, 4096, 1, 
128]]",16,18350080,4096,16,4096,128,4096,0,65568,570425344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,24498,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,957.2157615299967,7399.952640303102,0.4528977796119852,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x4096x16,b=1x4096x16x128,eq=BLSN;BSND->BLND,memory_placements=1_0_0,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2YGrad,MXU,32,Memory,71791,65568,71791,0,0,0,0,0,0,0,0,65568,8192,0,0,570425344,"DT_BFLOAT16:[1,4096,4096,16],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",68719476736,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnAvg2attnWeights2Vallgathered2YGrad,Einsum,0,[],Einsum,"BLSN,BSND->BLND","[[1, 4096, 4096, 1], [1, 4096, 1, 128], [1, 4096, 1, 128]]",16,18350080,4096,16,4096,128,4096,0,65568,570425344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,24498,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,957.2157615299967,7399.952640303102,0.4528977796119852,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +23,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"Pointwise Mul.(x=1x4096x4096x16,memory_placements=1_1,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVSoftmaxBackprop,VPU,32,Memory,135136,16384,135136,0,0,0,0,0,0,0,0,0,16384,0,0,1073741824,"DT_BFLOAT16:[1,4096,4096,16]","[DT_BFLOAT16:(1,4096,4096,16)]",268435456,BwdAttentionencoderAttentionSoftmaxQKVSoftmaxBackprop,Pointwise Mul.,0,[],Pointwise Mul.,,,,,0,,,,,0,16384,1073741824,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,19354,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,1.9864096613781672,7399.952640303102,0.12124082405872602,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+23,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_1,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2XGrad,MXU,32,Memory,71791,65568,71791,0,0,0,0,0,0,0,0,65568,8192,0,0,570425344,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,4096,16)]",68719476736,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2XGrad,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 4096, 1, 1], [1, 4096, 1, 1], [1, 4096, 4096, 1]]",16,35651584,4096,16,4096,4096,128,0,65568,570425344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,24498,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,957.2157615299967,7399.952640303102,0.4528977796119852,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,Bwd-Attention_encoder-Attention_Softmax(Q*K)*V,"XlaEinsum(a=1x4096x16x128,b=1x4096x16x128,eq=BLND;BSND->BLSN,memory_placements=0_0_1,type=DT_BFLOAT16)",BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2YGrad,MXU,32,Memory,71791,65568,71791,0,0,0,0,0,0,0,0,65568,8192,0,0,570425344,"DT_BFLOAT16:[1,4096,16,128],DT_BFLOAT16:[1,4096,16,128]","[DT_BFLOAT16:(1,4096,4096,16)]",68719476736,BwdAttentionencoderAttentionSoftmaxQKVMatMulattnWeights2Q2Kallgathered2YGrad,Einsum,0,[],Einsum,"BLND,BSND->BLSN","[[1, 4096, 1, 1], [1, 4096, 1, 1], [1, 4096, 4096, 1]]",16,35651584,4096,16,4096,4096,128,0,65568,570425344,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,24498,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,957.2157615299967,7399.952640303102,0.4528977796119852,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+23,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1XGrad,MXU,32,Compute,16416,16416,5279,0,0,0,0,0,0,0,0,16416,2048,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1XGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 2048], [2048, 16, 128], [1, 4096, 16, 128]]",1,41943040,1024,1,4096,2048,2048,0,16416,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2093.063984405458,2379.5382553606237,0.9903138552669356,0.3215592236973816,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1YGrad,MXU,32,Compute,16416,16416,5279,0,0,0,0,0,0,0,0,16416,2048,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulQ2xnormallgathered2Wq1YGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 2048], [2048, 16, 128], [1, 4096, 16, 128]]",1,41943040,1024,1,4096,2048,2048,0,16416,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2093.063984405458,2379.5382553606237,0.9903138552669356,0.3215592236973816,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+23,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1XGrad,MXU,32,Compute,16416,16416,5279,0,0,0,0,0,0,0,0,16416,2048,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1XGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 2048], [2048, 16, 128], [1, 4096, 16, 128]]",1,41943040,1024,1,4096,2048,2048,0,16416,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2093.063984405458,2379.5382553606237,0.9903138552669356,0.3215592236973816,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1YGrad,MXU,32,Compute,16416,16416,5279,0,0,0,0,0,0,0,0,16416,2048,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulK2xnormallgathered2Wk1YGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 2048], [2048, 16, 128], [1, 4096, 16, 128]]",1,41943040,1024,1,4096,2048,2048,0,16416,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2093.063984405458,2379.5382553606237,0.9903138552669356,0.3215592236973816,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+23,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1XGrad,MXU,32,Compute,16416,16416,5279,0,0,0,0,0,0,0,0,16416,2048,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1XGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 2048], [2048, 16, 128], [1, 4096, 16, 128]]",1,41943040,1024,1,4096,2048,2048,0,16416,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2093.063984405458,2379.5382553606237,0.9903138552669356,0.3215592236973816,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +24,Bwd-Attention_encoder-Q/K/V,"XlaEinsum(a=1x4096x2048,b=2048x16x128,eq=BLM;MND->BLND,memory_placements=0_0_0,type=DT_BFLOAT16)",BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1YGrad,MXU,32,Compute,16416,16416,5279,0,0,0,0,0,0,0,0,16416,2048,0,0,41943040,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[2048,16,128]","[DT_BFLOAT16:(1,4096,16,128)]",34359738368,BwdAttentionencoderQKVMatMulV2xnormallgathered2Wv1YGrad,Einsum,8388608,[],Einsum,"BLM,MND->BLND","[[1, 4096, 2048], [2048, 16, 128], [1, 4096, 16, 128]]",1,41943040,1024,1,4096,2048,2048,0,16416,41943040,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,4700,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2093.063984405458,2379.5382553606237,0.9903138552669356,0.3215592236973816,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+25,Bwd-Attention_encoder-,AllReduce(1x4096x2048->1x4096x2048),BwdAttentionencoderAllReduce,VPU,1,ICI/NVLink,111849,0,8446,111849,33554432,33554432,0,0,0,0,0,0,0,0,0,67108864,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",0,BwdAttentionencoderAllReduce,AllReduce,0,[],AllReduce,,,,,0,,,,,0,0,67108864,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,953,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,558.7890817083747,0.0,0.07551203806869929,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,Softmax,"Softmax(x=1x4096x2048,memory_placements=1_1,type=DT_BFLOAT16)",SoftmaxSoftmaxBackprop,VPU,1,Memory,4223,2048,4223,0,0,0,0,0,0,0,0,0,2048,0,0,33554432,"DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096,2048)]",33554432,SoftmaxSoftmaxBackprop,Softmax,0,[],Softmax,,,,,0,,,,,0,2048,33554432,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,988,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,7.945638645512669,7399.952640303102,0.4849632962349041,0.9999936000409597,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,softmax-bwd-dl/dy-26,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLV;BLV->BV,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwddldy26MatMuldldy,MXU,1,Compute,8192,8192,4224,0,0,0,0,0,0,0,0,0,8192,0,0,33558528,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,2048)]",16777216,softmaxbwddldy26MatMuldldy,Einsum,16777216,[],Einsum,"BLV,BLV->BV","[[1, 4096, 2048], [1, 4096, 2048], [1, 2048]]",1,8194,32768,2048,1,1,4096,0,8192,33558528,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,476,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.048,3815.1629269123077,0.0009689922480620155,0.5155625576908524,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 
+26,softmax-bwd-e1-26,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde126MatMule1,MXU,1,Compute,8192,8192,4225,0,0,0,0,0,0,0,0,0,8192,0,0,33562624,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096)]",16777216,softmaxbwde126MatMule1,Einsum,16777216,[],Einsum,"BLN,BLN->BL","[[1, 4096, 2048], [1, 4096, 2048], [1, 4096]]",1,8194,32768,4096,1,1,2048,0,8192,33562624,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.048,3815.6285881996155,0.0009689922480620155,0.5156254848918399,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +26,softmax-bwd-e2-26,"XlaEinsum(a=1x4096x2048,b=1x4096x2048,eq=BLN;BLN->BL,memory_placements=0_0_1,type=DT_BFLOAT16)",softmaxbwde226MatMule2,MXU,1,Compute,8192,8192,4225,0,0,0,0,0,0,0,0,0,8192,0,0,33562624,"DT_BFLOAT16:[1,4096,2048],DT_BFLOAT16:[1,4096,2048]","[DT_BFLOAT16:(1,4096)]",16777216,softmaxbwde226MatMule2,Einsum,16777216,[],Einsum,"BLN,BLN->BL","[[1, 4096, 2048], [1, 4096, 2048], [1, 4096]]",1,8194,32768,4096,1,1,2048,0,8192,33562624,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,477,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,2.048,3815.6285881996155,0.0009689922480620155,0.5156254848918399,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +27,WeightUpdateOptimizerStates: Load optimizer states from 
HBM,"Abs(x=27044872192,type=DT_BFLOAT16)",WeightUpdateOptimizerStatesLoadoptimizerstatesfromHBMinput,VPU,1,Memory,13614865,1650688,13614865,0,0,0,0,0,0,0,0,0,1650688,0,0,108179488768,DT_BFLOAT16:[27044872192],[DT_BFLOAT16:(27044872192)],0,WeightUpdateOptimizerStatesLoadoptimizerstatesfromHBMinput,Input,0,[],Abs,,,,,0,,,,,0,1650688,108179488768,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,1949995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,7399.9999265508695,0.0,0.9999999900744418,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 +28,WeightUpdateWriteHBM,"Abs(x=6761218048,type=DT_BFLOAT16)",WeightUpdateWriteHBMoutput,VPU,1,Memory,3403717,412672,3403717,0,0,0,0,0,0,0,0,0,412672,0,0,27044872192,DT_BFLOAT16:[6761218048],[DT_BFLOAT16:(6761218048)],0,WeightUpdateWriteHBMoutput,Output,0,[],Abs,,,,,0,,,,,0,412672,27044872192,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,None,0.0,0.0,487498,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,7399.998295980541,0.0,0.9999997697271001,20,100.0,20,100.0,20,100.0,20,100.0,20,100.0,1.0,1.0,1.0,1.0,1.0 diff --git a/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v6p.json b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v6p.json new file mode 100644 index 0000000..7dfc2e6 --- /dev/null +++ b/neusim/npusim/frontend/tests/assets/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v6p.json @@ -0,0 +1,101 @@ +{ + "total_pp_time_ns": 0, + "total_pp_ici_time_ns": 0, + "total_pp_dcn_time_ns": 0, + "total_execution_time_non_pp_ns": 100908622, + "overlapped_compute_time_non_pp_ns": 25815172, + "compute_only_time_non_pp_ns": 33558396, + "memory_only_time_non_pp_ns": 23262644, + "ici_bound_time_non_pp_ns": 18272410, + "total_execution_time_chip_ns": 100908622, + "overlapped_compute_time_chip_ns": 25815172, + "compute_only_time_chip_ns": 33558396, + "memory_only_time_chip_ns": 
23262644, + "ici_bound_time_chip_ns": 18272410, + "bounded_by_pp_chip": false, + "total_execution_time_pod_ns": 100908622, + "compute_only_time_pod_ns": 33558396, + "memory_only_time_pod_ns": 23262644, + "ici_bound_time_pod_ns": 18272410, + "bounded_by_pp_dcn": false, + "total_execution_time_ns": 100908622, + "mem_footprint_GB": 46.296875, + "out_of_memory": false, + "sim_config": { + "PUE": 1.1, + "carbon_intensity_kgCO2_per_kWh": 0.5, + "name": "6p", + "num_sa": 8, + "num_vu": 8, + "num_vu_ports": 8, + "hbm_bw_GBps": 7400.0, + "hbm_latency_ns": 500, + "vmem_size_MB": 256, + "freq_GHz": 2.0, + "sa_dim": 256, + "hbm_size_GB": 192, + "ici_bw_GBps": 300.0, + "dcn_bw_GBps": 12.5, + "pcie_bw_GBps": 32.0, + "ici_latency_ns": 3330, + "dcn_latency_ns": 3700, + "pcie_latency_ns": 400, + "TDP_W": 350.0, + "min_power_W": 1.0, + "avg_power_W": 1.0, + "max_power_W": 331.0, + "HBM_GBps_per_W": 123.5, + "ICI_GBps_per_W": 56.583, + "ICI_topology": "TORUS_3D", + "embodied_carbon_kgCO2": 1384.0, + "use_vu_for_small_matmul": true, + "static_power_W_per_sa": 1.777942858, + "static_power_W_per_vu": 0.1554179582, + "static_power_vmem_W": 37.07309859, + "static_power_ici_W": 3.000278571, + "static_power_hbm_mc_W": 7.10422264, + "static_power_hbm_phy_W": 10.65633396, + "static_power_other_W": 41.27610279, + "dynamic_power_W_per_SA": 31.57742933, + "dynamic_power_W_per_VU": 0.7426048, + "dynamic_power_vmem_W": 28.1028608, + "dynamic_power_ici_W_per_GBps": 0.01262060716, + "dynamic_power_hbm_W_per_GBps": 0.008830769231, + "dynamic_power_other_W": 0.0, + "pg_config": "NoPG", + "enable_dvfs": false, + "model_name": "llama3-8b", + "model_type": "llm", + "global_batch_size": 1, + "num_chips": 2, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 2, + "pipeline_parallelism_degree": 1, + "num_data_parallel_axes": 0, + "num_tensor_parallel_axes": 3, + "num_pipeline_parallel_axes": 0, + "data_parallel_degree_dcn": 1, + "tensor_parallel_degree_dcn": 1, + "pipeline_parallel_degree_dcn": 
1, + "microbatch_size_dcn": 1, + "microbatch_size_ici": 1, + "output_file_path": "/mnt/nvme0n1p1/yuqixue2/neusim/NeuSim/results/raw/llama3-8b/dp1-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/training-v6p.csv", + "input_seqlen": 4096, + "output_seqlen": 512, + "d_model": 4096, + "num_heads": 32, + "num_kv_heads": 8, + "d_head": 128, + "d_ff": 14336, + "num_layers": 32, + "ffn_type": "llama", + "decode_width": 1, + "use_flash_attention": false, + "enable_swap_kv_cache": false, + "max_swap_kv_cache_times_prefill": 2, + "max_swap_kv_cache_times_decode": 16, + "num_pods": 1, + "batch_size_per_pod": 1, + "layers_per_pp_stage": 32 + } +} \ No newline at end of file diff --git a/neusim/npusim/frontend/tests/test_query_results_helper.py b/neusim/npusim/frontend/tests/test_query_results_helper.py new file mode 100644 index 0000000..c6d6861 --- /dev/null +++ b/neusim/npusim/frontend/tests/test_query_results_helper.py @@ -0,0 +1,389 @@ + +import unittest +import os +import shutil +from neusim.npusim.frontend.query_results_helper_lib import ( + set_results_path, + is_model_llm, + is_model_llm_moe, + is_model_dlrm, + is_model_sd, + get_pstr_from_pconfig, + get_pconfig_from_pstr, + get_stats_filepath, + get_op_stats_filepath, + get_stats, + get_all_stats, + get_all_op_stats, + get_optimal_stats_for_max_num_chips, + get_latency_metric_name_and_min_max, + get_throughput_metric_name_and_min_max, + get_energy_eff_metric_name_and_min_max, + get_carbon_eff_metric_name_and_min_max, + get_component_data_from_file, + get_total_execution_time_from_file, + get_num_chips, + get_min_num_chips, + get_min_num_chips, + get_slo_stat, + get_pareto_frontier, +) + +class TestQueryResultsHelper(unittest.TestCase): + @classmethod + def setUpClass(cls): + # Set the results path to the local assets directory for testing + cls.test_assets_path = os.path.join(os.path.dirname(__file__), "assets/raw") + set_results_path(cls.test_assets_path) + + def test_is_model_helpers(self): + 
self.assertTrue(is_model_llm("llama3-8b")) + self.assertTrue(is_model_llm("deepseekv3-671b")) + self.assertFalse(is_model_llm("dlrm-s")) + + self.assertTrue(is_model_llm_moe("deepseekv2-236b")) + self.assertFalse(is_model_llm_moe("llama3-8b")) + + self.assertTrue(is_model_dlrm("dlrm-s")) + self.assertFalse(is_model_dlrm("llama3-8b")) + + self.assertTrue(is_model_sd("gligen")) + self.assertTrue(is_model_sd("dit-xl")) + self.assertFalse(is_model_sd("llama3-8b")) + + def test_get_pstr_from_pconfig(self): + pconfig = { + "dp": 1, "tp": 1, "pp": 1, + "dp_dcn": 1, "tp_dcn": 1, "pp_dcn": 1, + "bs": 1 + } + pstr = get_pstr_from_pconfig(model="llama3-8b", **pconfig) + self.assertEqual(pstr, "dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1") + + # Test MoE config validation (missing ep/ep_dcn) + with self.assertRaises(AssertionError): + get_pstr_from_pconfig(model="deepseekv3-671b", **pconfig) + + pconfig_moe = pconfig.copy() + pconfig_moe.update({"ep": 1, "ep_dcn": 1}) + pstr_moe = get_pstr_from_pconfig(model="deepseekv3-671b", **pconfig_moe) + self.assertEqual(pstr_moe, "dp1-tp1-pp1-ep1-dpdcn1-tpdcn1-ppdcn1-epdcn1-bs1") + + def test_get_pconfig_from_pstr(self): + pstr = "dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1" + config = get_pconfig_from_pstr(pstr) + # Expected: (dp, tp, pp, ep, dp_dcn, tp_dcn, pp_dcn, ep_dcn, bs) + # For dense, ep=1, ep_dcn=1 + self.assertEqual(config, (1, 1, 1, 1, 1, 1, 1, 1, 1)) + + pstr_moe = "dp1-tp1-pp1-ep2-dpdcn1-tpdcn1-ppdcn1-epdcn1-bs1" + config_moe = get_pconfig_from_pstr(pstr_moe) + self.assertEqual(config_moe, (1, 1, 1, 2, 1, 1, 1, 1, 1)) + + def test_get_stats_filepath(self): + path = get_stats_filepath( + model="llama3-8b", version="5p", workload="inference", + dp=1, tp=1, pp=1, dp_dcn=1, tp_dcn=1, pp_dcn=1, batch_size=1, + prefill_or_decode="decode" + ) + expected_suffix = "llama3-8b/dp1-tp1-pp1-dpdcn1-tpdcn1-ppdcn1-bs1/inference-v5p_decode.json" + self.assertTrue(path.endswith(expected_suffix)) + + def test_get_all_stats(self): + # Test LLM + stats = 
get_all_stats( + model="llama3-8b", version="5p", workload="inference", + prefill_or_decode="decode", batch_size=1 + ) + # The key is (dp, tp, pp, ep, dp_dcn, pp_dcn, bs) + # Note: ep defaults to 1 if not MoE + key = (1, 1, 1, 1, 1, 1, 1) + self.assertIn(key, stats) + # Check a value from the loaded JSON (assuming sample content) + self.assertIsInstance(stats[key], dict) + + # Test DLRM + stats_dlrm = get_all_stats( + model="dlrm-s", version="5p", workload="inference", + batch_size=1 + ) + key_dlrm = (1, 1, 1, 1, 1, 1, 1) + self.assertIn(key_dlrm, stats_dlrm) + + def test_get_all_op_stats(self): + # Test getting CSV stats + # read_json_with_csv=False returns list[dict] (rows of csv) + op_stats = get_all_op_stats( + model="llama3-8b", version="5p", workload="inference", + prefill_or_decode="decode", batch_size=1 + ) + key = (1, 1, 1, 1, 1, 1, 1) + self.assertIn(key, op_stats) + self.assertIsInstance(op_stats[key], list) + if len(op_stats[key]) > 0: + self.assertIsInstance(op_stats[key][0], dict) + + def test_get_optimal_stats_for_max_num_chips(self): + # Since we only have one data point for llama3-8b in assets, it should validly return that one. 
+ opt_stats = get_optimal_stats_for_max_num_chips( + model="llama3-8b", version="5p", max_num_chips=1024, + workload="inference", prefill_or_decode="decode", + batch_size=1, perf_metric="TPOT_ms_request" + ) + self.assertIsInstance(opt_stats, dict) + # Verify it didn't return empty or fail + + def test_metric_name_helpers(self): + # Latency + name, min_max = get_latency_metric_name_and_min_max("llama3-8b", "inference", "prefill") + self.assertEqual(name, "TTFT_sec") + self.assertEqual(min_max, "min") + + name, min_max = get_latency_metric_name_and_min_max("dlrm-s", "inference") + self.assertEqual(name, "latency_ns") + + # Throughput + name, min_max = get_throughput_metric_name_and_min_max("llama3-8b", "inference") + self.assertEqual(name, "throughput_tokens_per_sec") + self.assertEqual(min_max, "max") + + # Energy/Carbon + name, min_max = get_energy_eff_metric_name_and_min_max("llama3-8b", "inference") + self.assertEqual(name, "avg_power_efficiency_tkn_per_joule") + + name, min_max = get_carbon_eff_metric_name_and_min_max("llama3-8b", "inference") + self.assertEqual(name, "avg_total_carbon_efficiency_tkn_per_kgCO2e") + self.assertEqual(min_max, "max") + + def test_training_workload(self): + # Test metric names for training + name, min_max = get_latency_metric_name_and_min_max("llama3-8b", "training") + self.assertEqual(name, "total_execution_time_ns") + + name, min_max = get_energy_eff_metric_name_and_min_max("llama3-8b", "training") + self.assertEqual(name, "avg_power_efficiency_iteration_per_joule") + + name, min_max = get_throughput_metric_name_and_min_max("llama3-8b", "training") + self.assertEqual(name, "total_execution_time_ns") + self.assertEqual(min_max, "min") + + # Test retrieving training stats + stats = get_all_stats( + model="llama3-8b", version="5p", workload="training", + batch_size=1 + ) + self.assertIn((1, 1, 1, 1, 1, 1, 1), stats) + # Real value is 663101427 ns from training-v5p.json + self.assertEqual(stats[(1, 1, 1, 1, 1, 1, 
1)]["total_execution_time_ns"], 663101427) + + def test_sd_model(self): + # Test metric names for SD model (gligen) + name, min_max = get_latency_metric_name_and_min_max("gligen", "inference") + self.assertEqual(name, "latency_step_sec") + + # Test retrieving SD stats + stats = get_all_stats( + model="gligen", version="5p", workload="inference", + batch_size=1 + ) + self.assertIn((1, 1, 1, 1, 1, 1, 1), stats) + self.assertAlmostEqual(stats[(1, 1, 1, 1, 1, 1, 1)]["latency_step_sec"], 0.00010261685) + + def test_get_min_num_chips(self): + # Minimum chips for standard inference asset (1 chip) + min_chips = get_min_num_chips( + model="llama3-8b", version="5p", workload="inference", + prefill_or_decode="decode", batch_size=1 + ) + self.assertEqual(min_chips, 1) + + def test_get_component_data(self): + # Use an existing csv asset + csv_path = get_op_stats_filepath( + model="llama3-8b", version="5p", workload="inference", prefill_or_decode="decode", + dp=1, tp=1, pp=1, dp_dcn=1, tp_dcn=1, pp_dcn=1, batch_size=1, + results_path=self.test_assets_path + ) + + # From inference-v5p_decode.csv (calculated sum) + exec_time = get_component_data_from_file(csv_path, "Execution time") + self.assertEqual(exec_time, 2649178112) + + # From inference-v5p_decode.csv (calculated sum) + compute_time = get_component_data_from_file(csv_path, "Compute time") + self.assertEqual(compute_time, 0) + + def test_get_total_execution_time(self): + csv_path = get_op_stats_filepath( + model="llama3-8b", version="5p", workload="inference", prefill_or_decode="decode", + dp=1, tp=1, pp=1, dp_dcn=1, tp_dcn=1, pp_dcn=1, batch_size=1, + results_path=self.test_assets_path + ) + total_time = get_total_execution_time_from_file(csv_path) + self.assertIsInstance(total_time, int) + self.assertGreater(total_time, 0) + + def test_get_pareto_frontier(self): + # Create dummy stats + s1 = {"cost": 10, "lat": 100} + s2 = {"cost": 20, "lat": 50} # faster, more expensive + s3 = {"cost": 30, "lat": 150} # dominated by s1 
and s2 (worse cost than s2/s1, worse lat than s2/s1) + + cmp_cost = lambda a, b: a["cost"] < b["cost"] + cmp_lat = lambda a, b: a["lat"] < b["lat"] + + frontier = get_pareto_frontier([s1, s2, s3], [cmp_cost, cmp_lat]) + self.assertIn(s1, frontier) + self.assertIn(s2, frontier) + self.assertNotIn(s3, frontier) + + def test_get_num_chips_with_args(self): + # Standard calculation + n = get_num_chips( + "llama3-8b", dp=2, tp=2, pp=1, dp_dcn=1, tp_dcn=1, pp_dcn=1, ep=1, ep_dcn=1 + ) + self.assertEqual(n, 4) + + # DLRM case (dp must eq tp, pp=1) + n_dlrm = get_num_chips( + "dlrm-s", dp=4, tp=4, pp=1, dp_dcn=2, tp_dcn=1, pp_dcn=1 + ) + # For DLRM: num_chips = dp * dp_dcn = 4 * 2 = 8 + self.assertEqual(n_dlrm, 8) + + def test_get_num_chips_with_pstr(self): + # Test with pstr only + n = get_num_chips( + "llama3-8b", pstr="dp2-tp2-pp1-dpdcn1-tpdcn1-ppdcn1-bs1" + ) + self.assertEqual(n, 4) + + def test_get_slo_stat(self): + # Should return stats for the config with min chips (1) + slo_stat = get_slo_stat( + model="llama3-8b", workload="inference", prefill_or_decode="decode", version="5p" + ) + self.assertEqual(slo_stat["sim_config"]["num_chips"], 1) + + def test_optimal_stats_max_metric(self): + opt_stats = get_optimal_stats_for_max_num_chips( + model="gligen", version="5p", max_num_chips=1024, + workload="inference", prefill_or_decode="", + batch_size=1, perf_metric="throughput_step_per_sec_per_request", + min_or_max_metric="max" + ) + self.assertIsNotNone(opt_stats) + # Check against real expected value from gligen/dp1-tp1.../inference-v5p.json + self.assertAlmostEqual(opt_stats["throughput_step_per_sec_per_request"], 243.62470685857147) + + def test_optimal_stats_callable_and_error(self): + # Test custom metric function + # Mocking all_stats + all_stats = { + "conf1": {"custom_score": 10}, + "conf2": {"custom_score": 20} + } + res = get_optimal_stats_for_max_num_chips( + "llama3-8b", "5p", perf_metric=lambda x: x["custom_score"], + min_or_max_metric="max", 
all_stats=all_stats + ) + self.assertEqual(res["custom_score"], 20) + + # Test invalid min_or_max + with self.assertRaises(ValueError): + get_optimal_stats_for_max_num_chips( + "llama3-8b", "5p", perf_metric="custom_score", + min_or_max_metric="average", all_stats=all_stats + ) + + def test_get_component_data_bounded_by(self): + # Test with a csv with specific bounded-by values to hit branches + temp_csv = os.path.join(self.test_assets_path, "temp_components.csv") + with open(temp_csv, "w") as f: + f.write("Execution time,Compute time,Memory time,ICI/NVLink time,Bounded-by,Count\n") + f.write("100,50,60,20,Memory,1\n") # Bounded by Memory + f.write("100,60,50,20,Compute,1\n") # Bounded by Compute + f.write("100,20,20,80,ICI/NVLink,1\n") # Bounded by ICI + try: + # Test Memory bounded logic + # val = (Memory - max(Compute, ICI)) * Count = (60 - 50) * 1 = 10 + val_mem = get_component_data_from_file(temp_csv, "Memory time") + self.assertEqual(val_mem, 10) + + # Test ICI bounded logic + # val = abs(ICI - Compute) * Count = abs(80 - 20) * 1 = 60 + val_ici = get_component_data_from_file(temp_csv, "ICI/NVLink time") + self.assertEqual(val_ici, 60) + + # Test Invalid Key + with self.assertRaises(ValueError): + get_component_data_from_file(temp_csv, "Invalid time") + finally: + if os.path.exists(temp_csv): + os.remove(temp_csv) + + def test_errors(self): + # Invalid pstr (enough parts to avoid index error, but wrong count) + with self.assertRaises(ValueError): + get_pconfig_from_pstr("dp1-tp1-pp1-bs1") + + with self.assertRaises(ValueError): + get_stats_filepath("llama3-8b", "5p", "invalid_workload", 1, 1, 1, 1, 1, 1, 1) + + def test_get_all_stats_filtering(self): + # Using version '6p' which we populated with: + # 1. dp1-tp1...bs1 (1 chip) + # 2. dp1-tp1...bs32 (1 chip) + # 3. 
dp1-tp2...bs1 (2 chips) + + # Keys: (dp, tp, pp, dp_dcn, tp_dcn, pp_dcn, batch_size) + key_bs1_1chip = (1, 1, 1, 1, 1, 1, 1) + key_bs32_1chip = (1, 1, 1, 1, 1, 1, 32) + key_bs1_2chip = (1, 2, 1, 1, 1, 1, 1) + + # 1. No filter (just model/ver/workload/prefill_or_decode) + stats_all = get_all_stats("llama3-8b", "6p", "inference", prefill_or_decode="decode") + self.assertIn(key_bs1_1chip, stats_all) + self.assertIn(key_bs32_1chip, stats_all) + self.assertIn(key_bs1_2chip, stats_all) + + # 2. Filter by batch_size=1 + stats_bs1 = get_all_stats("llama3-8b", "6p", "inference", batch_size=1, prefill_or_decode="decode") + self.assertIn(key_bs1_1chip, stats_bs1) + self.assertIn(key_bs1_2chip, stats_bs1) + self.assertNotIn(key_bs32_1chip, stats_bs1) + + # 3. Filter by batch_size=32 + stats_bs32 = get_all_stats("llama3-8b", "6p", "inference", batch_size=32, prefill_or_decode="decode") + self.assertIn(key_bs32_1chip, stats_bs32) + self.assertNotIn(key_bs1_1chip, stats_bs32) + + # 4. Filter by max_num_chips=1 + stats_max1 = get_all_stats("llama3-8b", "6p", "inference", max_num_chips=1, prefill_or_decode="decode") + self.assertIn(key_bs1_1chip, stats_max1) + self.assertIn(key_bs32_1chip, stats_max1) + self.assertNotIn(key_bs1_2chip, stats_max1) # 2 chips > 1 + + # 5. 
Filter by max_num_chips=1 AND batch_size=1 + stats_combo = get_all_stats("llama3-8b", "6p", "inference", batch_size=1, max_num_chips=1, prefill_or_decode="decode") + self.assertIn(key_bs1_1chip, stats_combo) + self.assertNotIn(key_bs1_2chip, stats_combo) + self.assertNotIn(key_bs32_1chip, stats_combo) + + def test_read_json_with_csv(self): + # Test get_all_op_stats with read_json_with_csv=True + # Should return tuple match + from neusim.npusim.frontend.query_results_helper_lib import get_all_op_stats + + stats = get_all_op_stats( + "llama3-8b", "5p", "inference", "decode", batch_size=1, + read_json_with_csv=True + ) + key = (1, 1, 1, 1, 1, 1, 1) + self.assertIn(key, stats) + val = stats[key] + self.assertIsInstance(val, tuple) + self.assertEqual(len(val), 2) + # (json_dict, csv_list) + self.assertIsInstance(val[0], dict) + self.assertIsInstance(val[1], list) From 9d3cdf0c9da4adb21996c60756a70a3f6e3a8bec Mon Sep 17 00:00:00 2001 From: Yuqi Xue Date: Thu, 29 Jan 2026 20:31:38 -0600 Subject: [PATCH 12/12] change CI badge links to point to main branch in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 83fdd97..7496f40 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # NeuSim: An Open-source Simulator Framework for NPUs -[![CI Lint Status](https://github.com/platformxlab/NeuSim/actions/workflows/lint.yml/badge.svg?branch=add_unit_tests)](https://github.com/platformxlab/NeuSim/actions/workflows/lint.yml) [![CI Test Status](https://github.com/platformxlab/NeuSim/actions/workflows/test.yml/badge.svg?branch=add_unit_tests)](https://github.com/platformxlab/NeuSim/actions/workflows/test.yml) +[![CI Lint Status](https://github.com/platformxlab/NeuSim/actions/workflows/lint.yml/badge.svg?branch=main)](https://github.com/platformxlab/NeuSim/actions/workflows/lint.yml) [![CI Test 
Status](https://github.com/platformxlab/NeuSim/actions/workflows/test.yml/badge.svg?branch=main)](https://github.com/platformxlab/NeuSim/actions/workflows/test.yml) NeuSim is a simulator framework for modeling the performance and power behaviors of neural processing units (NPUs) when running machine learning workloads.