Created
November 21, 2024 18:48
-
-
Save tenpercent/22bccb03230eeb2b8d0767fed785c82a to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| [CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=256, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=128, n_per_block=128, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=256, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=256, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=224, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=7, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=224, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=7, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=224, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=7, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=224, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=7, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=224, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=7, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=224, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=7, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=224, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=7, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=224, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=7, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=256, n_per_block=224, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=7, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=2, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 64, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=256, n_per_block=224, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=7, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=2, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 64, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=256, n_per_block=224, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=7, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=2, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 64, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=256, n_per_block=224, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=7, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=2, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 64, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=256, n_per_block=224, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=7, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=2, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 64, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=256, n_per_block=224, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=7, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=2, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 64, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=256, n_per_block=224, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=7, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=2, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 64, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=256, n_per_block=224, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=7, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=2, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 64, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v5', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v5', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v5', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v5', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v5', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v5', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v5', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v5', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=128, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=128, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=128, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=128, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=128, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=128, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=128, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=128, n_per_block=256, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=256, n_per_block=128, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=256, n_per_block=128, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=256, n_per_block=128, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=256, n_per_block=128, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=256, n_per_block=128, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=256, n_per_block=128, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=256, n_per_block=128, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=256, n_per_block=128, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=128, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=128, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=128, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=128, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=128, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=128, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=128, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=128, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=128, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=64, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=64, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=64, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=64, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=64, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=64, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=64, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=64, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v1', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=256, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=256, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=256, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=256, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=256, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=256, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=256, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=256, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=256, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=256, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=256, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=256, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=256, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=256, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=256, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=256, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=256, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=256, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=256, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=256, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=256, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=256, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=256, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=256, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=256, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=256, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=256, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=256, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=256, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=256, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=256, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=256, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=128, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=128, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=128, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=128, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=128, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=128, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=128, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=128, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=128, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=128, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=128, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=128, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=128, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=128, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=128, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=128, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=128, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=128, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=128, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=128, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=128, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=128, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=128, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=128, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=128, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=128, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=128, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=128, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=128, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=128, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=128, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=128, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=4, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=64, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=64, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=64, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=64, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=64, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=64, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=64, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=64, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=64, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=64, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=64, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=64, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=64, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=64, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=64, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=64, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=64, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=64, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=64, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=64, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=64, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=64, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=64, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=64, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=64, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=64, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=64, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=64, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=64, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=64, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=64, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=64, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=2, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=32, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(2,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=64, m_per_block=16, n_per_block=16, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=64, m_per_block=16, n_per_block=16, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=64, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=64, m_per_block=16, n_per_block=16, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 8, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 8, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 4), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=256, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=16, n_per_block=32, k_per_block=512, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=16, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=16, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=16, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=16, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=16, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=16, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=16, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=16, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=16, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=16, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=16, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=16, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=16, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=16, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=16, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=16, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=32, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=32, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=32, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=32, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=32, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=32, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=32, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=32, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=32, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=32, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=32, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=32, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=32, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=32, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=32, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=32, n_per_block=64, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=1, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=16, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=16, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=16, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=16, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=16, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=16, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=16, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=16, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=16, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=16, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=16, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=16, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=16, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=16, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=16, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=16, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=32, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=32, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=32, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=32, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=32, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=32, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=32, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=32, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=128, m_per_block=32, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=128, m_per_block=32, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=128, m_per_block=32, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=128, m_per_block=32, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=128, m_per_block=32, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=128, m_per_block=32, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=128, m_per_block=32, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=128, m_per_block=32, n_per_block=128, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=16, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=16, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=16, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=16, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=16, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=16, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=16, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=16, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=16, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=16, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=16, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=16, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=16, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=16, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=16, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=16, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=1, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=32, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=32, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=32, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=32, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=32, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=32, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=32, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=32, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=32, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=32, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=32, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=32, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=32, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=32, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=32, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='F8', b_element_dtype='F8', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=32, n_per_block=256, k_per_block=128, a_k1=16, b_k1=16, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=1, n_xdl_per_wave=2, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=16, a_block_transfer_dst_scalar_per_vector_ak1=16, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=16, b_block_transfer_dst_scalar_per_vector_bk1=16, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(8,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Interwave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v2', a_compute_dtype='F8', b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=4, b_k1=4, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=4, a_block_transfer_dst_scalar_per_vector_ak1=4, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=4, b_block_transfer_dst_scalar_per_vector_bk1=4, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=4, b_k1=4, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=4, a_block_transfer_dst_scalar_per_vector_ak1=4, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=4, b_block_transfer_dst_scalar_per_vector_bk1=4, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=4, b_k1=4, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=4, a_block_transfer_dst_scalar_per_vector_ak1=4, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=4, b_block_transfer_dst_scalar_per_vector_bk1=4, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=4, b_k1=4, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=4, a_block_transfer_dst_scalar_per_vector_ak1=4, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=4, b_block_transfer_dst_scalar_per_vector_bk1=4, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=4, b_k1=4, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=4, a_block_transfer_dst_scalar_per_vector_ak1=4, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=4, b_block_transfer_dst_scalar_per_vector_bk1=4, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=4, b_k1=4, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=4, a_block_transfer_dst_scalar_per_vector_ak1=4, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=4, b_block_transfer_dst_scalar_per_vector_bk1=4, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=4, b_k1=4, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=4, a_block_transfer_dst_scalar_per_vector_ak1=4, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=4, b_block_transfer_dst_scalar_per_vector_bk1=4, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=4, b_k1=4, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=4, a_block_transfer_dst_scalar_per_vector_ak1=4, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=4, b_block_transfer_dst_scalar_per_vector_bk1=4, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=2, b_k1=2, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(16, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=2, a_block_transfer_dst_scalar_per_vector_ak1=2, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(16, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=2, b_block_transfer_dst_scalar_per_vector_bk1=2, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=2, b_k1=2, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(16, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=2, a_block_transfer_dst_scalar_per_vector_ak1=2, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(16, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=2, b_block_transfer_dst_scalar_per_vector_bk1=2, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=2, b_k1=2, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(16, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=2, a_block_transfer_dst_scalar_per_vector_ak1=2, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(16, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=2, b_block_transfer_dst_scalar_per_vector_bk1=2, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=2, b_k1=2, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(16, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=2, a_block_transfer_dst_scalar_per_vector_ak1=2, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(16, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=2, b_block_transfer_dst_scalar_per_vector_bk1=2, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=2, b_k1=2, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(16, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=2, a_block_transfer_dst_scalar_per_vector_ak1=2, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(16, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=2, b_block_transfer_dst_scalar_per_vector_bk1=2, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=2, b_k1=2, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(16, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=2, a_block_transfer_dst_scalar_per_vector_ak1=2, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(16, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=2, b_block_transfer_dst_scalar_per_vector_bk1=2, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=2, b_k1=2, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(16, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=2, a_block_transfer_dst_scalar_per_vector_ak1=2, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(16, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=2, b_block_transfer_dst_scalar_per_vector_bk1=2, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=2, b_k1=2, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(16, 16, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=2, a_block_transfer_dst_scalar_per_vector_ak1=2, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(16, 16, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=2, b_block_transfer_dst_scalar_per_vector_bk1=2, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v4', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v5', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v5', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v5', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v5', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v5', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v5', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v5', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=256, n_per_block=256, k_per_block=32, a_k1=8, b_k1=8, m_per_xdl=32, n_per_xdl=32, m_xdl_per_wave=4, n_xdl_per_wave=4, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(4, 64, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(4, 64, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v5', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=224, n_per_block=256, k_per_block=64, a_k1=8, b_k1=8, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=7, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=224, n_per_block=256, k_per_block=64, a_k1=8, b_k1=8, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=7, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=224, n_per_block=256, k_per_block=64, a_k1=8, b_k1=8, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=7, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::KPadding', block_size=256, m_per_block=224, n_per_block=256, k_per_block=64, a_k1=8, b_k1=8, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=7, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNPadding', block_size=256, m_per_block=224, n_per_block=256, k_per_block=64, a_k1=8, b_k1=8, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=7, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MKPadding', block_size=256, m_per_block=224, n_per_block=256, k_per_block=64, a_k1=8, b_k1=8, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=7, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NKPadding', block_size=256, m_per_block=224, n_per_block=256, k_per_block=64, a_k1=8, b_k1=8, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=7, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MNKPadding', block_size=256, m_per_block=224, n_per_block=256, k_per_block=64, a_k1=8, b_k1=8, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=7, n_xdl_per_wave=8, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=1, c_shuffle_n_xdl_per_wave_per_shuffle=2, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 16, 1, 16), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::Default', block_size=256, m_per_block=256, n_per_block=224, k_per_block=64, a_k1=8, b_k1=8, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=7, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=2, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::MPadding', block_size=256, m_per_block=256, n_per_block=224, k_per_block=64, a_k1=8, b_k1=8, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=7, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=2, c_shuffle_n_xdl_per_wave_per_shuffle=1, c_shuffle_block_transfer_cluster_lengths_m_block_m_per_block_n_block_n_per_block=(1, 32, 1, 8), c_shuffle_block_transfer_scalar_per_vector_n_per_block=(4,), block_gemm_pipeline_scheduler='BlockGemmPipelineScheduler::Intrawave', block_gemm_pipeline_version='BlockGemmPipelineVersion::v3', a_compute_dtype=None, b_compute_dtype=None), CKBatchedGemmOperation(a_layout='Row', b_layout='Col', ds_layouts=(), c_layout='Row', a_element_dtype='BF16', b_element_dtype='BF16', ds_element_dtypes=(), c_element_dtype='BF16', acc_dtype='F32', c_shuffle_dtype='BF16', a_elementwise_op='PassThrough', b_elementwise_op='PassThrough', c_elementwise_op='PassThrough', gemm_specialization='GemmSpecialization::NPadding', block_size=256, m_per_block=256, n_per_block=224, k_per_block=64, a_k1=8, b_k1=8, m_per_xdl=16, n_per_xdl=16, m_xdl_per_wave=8, n_xdl_per_wave=7, a_block_transfer_thread_cluster_lengths_ak0_m_ak1=(8, 32, 1), a_block_transfer_thread_cluster_arrange_order=(1, 0, 2), a_block_transfer_src_access_order=(1, 0, 2), a_block_transfer_src_vector_dim=2, a_block_transfer_src_scalar_per_vector=8, a_block_transfer_dst_scalar_per_vector_ak1=8, a_block_lds_extra_m=0, b_block_transfer_thread_cluster_lengths_bk0_n_bk1=(8, 32, 1), b_block_transfer_thread_cluster_arrange_order=(1, 0, 2), b_block_transfer_src_access_order=(1, 0, 2), b_block_transfer_src_vector_dim=2, b_block_transfer_src_scalar_per_vector=8, b_block_transfer_dst_scalar_per_vector_bk1=8, b_block_lds_extra_n=0, c_shuffle_m_xdl_per_wave_per_shuffle=2, c_shuffle |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment