Skip to content

Instantly share code, notes, and snippets.

@banach-space
Last active July 22, 2025 13:27
Show Gist options
  • Select an option

  • Save banach-space/c96ff54fe2d08b1597602ca24dc6ba66 to your computer and use it in GitHub Desktop.

Select an option

Save banach-space/c96ff54fe2d08b1597602ca24dc6ba66 to your computer and use it in GitHub Desktop.
Tile + Fuse + SVE
// Attribute aliases for the fixed-width (non-scalable) input.
// Referenced by the linalg.pack op below: [1, 1] at the vector_common_parallel level.
#config = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [1, 1], vector_reduction = [0, 0]>
// Referenced by the linalg.generic op below: [8, 1] at the vector_common_parallel level.
#config1 = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [8, 1], vector_reduction = [0, 0]>
// 2-D identity map; used for both the input and the output operand of the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Pipeline layout declaring two storage-buffer bindings (binding 0 is loaded from, binding 1 is stored to).
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>
// Fixed-width variant: doubles every element of a 64x32 f32 tensor with a
// linalg.generic and packs the result into 8x1 inner tiles, producing
// tensor<8x32x8x1xf32>. All shapes are static, so no vscale-derived values
// are needed here.
func.func @pack_sve_prod() {
  %c0 = arith.constant 0 : index

  // Binding 0 is the read-only 64x32 source; binding 1 is the write-only
  // packed destination.
  %first = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>>
  %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<8x32x8x1xf32>>
  %2 = iree_tensor_ext.dispatch.tensor.load %first, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>> -> tensor<64x32xf32>

  // Destination tensors for the pack (%3) and for the element-wise producer (%4).
  %3 = tensor.empty() : tensor<8x32x8x1xf32>
  %4 = tensor.empty() : tensor<64x32xf32>

  // Producer: element-wise %in + %in over the whole 64x32 tensor.
  %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<64x32xf32>) outs(%4 : tensor<64x32xf32>) attrs = {lowering_config = #config1} {
  ^bb0(%in: f32, %out: f32):
    %7 = arith.addf %in, %in : f32
    linalg.yield %7 : f32
  } -> tensor<64x32xf32>

  // Consumer: 8x1 inner tiles, i.e. 64 / 8 = 8 outer tiles along dim 0 and
  // 32 / 1 = 32 along dim 1.
  %pack = linalg.pack %5 inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %3 {lowering_config = #config} : tensor<64x32xf32> -> tensor<8x32x8x1xf32>

  iree_tensor_ext.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [8, 32, 8, 1], strides = [1, 1, 1, 1] : tensor<8x32x8x1xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<8x32x8x1xf32>>
  return
}
// Output of the following command, run on the fixed-width input:
// $ tools/iree-opt fixed.mlir --pass-pipeline="builtin.module(func.func(iree-llvmcpu-tile-root-and-fuse-producer-consumer{tiling-level=vector_common_parallel}, cse))"
// [8, 1] config; in this output it is attached to the tiled linalg.generic
// (the alias names are swapped relative to the input).
#config = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [8, 1], vector_reduction = [0, 0]>
// [1, 1] config; in this output it is attached to the tiled linalg.pack.
#config1 = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [1, 1], vector_reduction = [0, 0]>
// 2-D identity map used by the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Converts a row offset into an outer tile index: d0 floordiv 8.
#map1 = affine_map<(d0) -> (d0 floordiv 8)>
// Pipeline layout declaring two storage-buffer bindings.
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>
module {
// Tiled-and-fused form of the fixed-width input: a single scf.forall walks the
// 64x32 iteration space with step (8, 1), and both the linalg.generic and the
// linalg.pack operate on one 8x1 tile per iteration.
func.func @pack_sve_prod() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<8x32x8x1xf32>>
%2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>> -> tensor<64x32xf32>
%3 = tensor.empty() : tensor<8x32x8x1xf32>
// %arg0 / %arg1 are the row / column offsets of the current tile in the 64x32
// source; %arg2 is the shared packed destination.
%4 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 32) step (8, 1) shared_outs(%arg2 = %3) -> (tensor<8x32x8x1xf32>) {
// 8x1 slice of the source starting at (%arg0, %arg1).
%extracted_slice = tensor.extract_slice %2[%arg0, %arg1] [8, 1] [1, 1] : tensor<64x32xf32> to tensor<8x1xf32>
%5 = tensor.empty() : tensor<8x1xf32>
// Fused producer: element-wise %in + %in on the 8x1 tile.
%6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice : tensor<8x1xf32>) outs(%5 : tensor<8x1xf32>) attrs = {lowering_config = #config} {
^bb0(%in: f32, %out: f32):
%8 = arith.addf %in, %in : f32
linalg.yield %8 : f32
} -> tensor<8x1xf32>
// Outer tile index along dim 0: %arg0 floordiv 8.
%7 = affine.apply #map1(%arg0)
// Destination slice holding exactly one packed tile (1x1x8x1) at tile
// coordinates (%7, %arg1).
%extracted_slice_0 = tensor.extract_slice %arg2[%7, %arg1, 0, 0] [1, 1, 8, 1] [1, 1, 1, 1] : tensor<8x32x8x1xf32> to tensor<1x1x8x1xf32>
%pack = linalg.pack %6 inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %extracted_slice_0 {lowering_config = #config1} : tensor<8x1xf32> -> tensor<1x1x8x1xf32>
scf.forall.in_parallel {
// Writes the packed tile back at the same offsets and sizes it was sliced from.
tensor.parallel_insert_slice %pack into %arg2[%7, %arg1, 0, 0] [1, 1, 8, 1] [1, 1, 1, 1] : tensor<1x1x8x1xf32> into tensor<8x32x8x1xf32>
}
}
iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [8, 32, 8, 1], strides = [1, 1, 1, 1] : tensor<8x32x8x1xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<8x32x8x1xf32>>
return
}
}
// Attribute aliases for the scalable input.
// Referenced by the linalg.pack op below: [1, 1] at the vector_common_parallel level.
#config = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [1, 1], vector_reduction = [0, 0]>
// Referenced by the linalg.generic op below: [[8], 1] at the vector_common_parallel level.
// NOTE(review): the bracketed [8] presumably marks a scalable size (8 x vscale);
// the tiled output steps by 8 * vscale along dim 0, which is consistent with that.
#config1 = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [[8], 1], vector_reduction = [0, 0]>
// 2-D identity map; used for both the input and the output operand of the linalg.generic.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Pipeline layout declaring two storage-buffer bindings (binding 0 is loaded from, binding 1 is stored to).
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>
// Scalable variant: doubles every element of a 64x32 f32 tensor and packs the
// result with an inner tile of (8 * vscale) x 1, so the packed tensor has
// dynamic extents along its outer-row and inner-row dimensions.
func.func @pack_sve_prod() {
  %zero = arith.constant 0 : index
  %eight = arith.constant 8 : index

  // Inner tile height (8 * vscale) and the number of such tiles needed to
  // cover 64 rows (64 ceildiv tile height).
  %vs = vector.vscale
  %tile_rows = arith.muli %vs, %eight : index
  %num_row_tiles = affine.apply affine_map<()[s0] -> (64 ceildiv s0)>()[%tile_rows]

  // Binding 0 is the read-only 64x32 source; binding 1 is the write-only
  // packed destination whose dynamic dims are (%num_row_tiles, %tile_rows).
  %src_binding = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%zero) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>>
  %dst_binding = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%zero) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x32x?x1xf32>>{%num_row_tiles, %tile_rows}
  %src = iree_tensor_ext.dispatch.tensor.load %src_binding, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>> -> tensor<64x32xf32>

  // Producer: element-wise x + x over the whole 64x32 tensor.
  %doubled_init = tensor.empty() : tensor<64x32xf32>
  %doubled = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%src : tensor<64x32xf32>) outs(%doubled_init : tensor<64x32xf32>) attrs = {lowering_config = #config1} {
  ^bb0(%elem: f32, %unused_out: f32):
    %sum = arith.addf %elem, %elem : f32
    linalg.yield %sum : f32
  } -> tensor<64x32xf32>

  // Consumer: pack into (8 * vscale) x 1 inner tiles.
  // NOTE(review): no padding_value is supplied, so this presumably relies on
  // 8 * vscale evenly dividing 64 -- confirm for the targeted vector lengths.
  %packed_init = tensor.empty(%num_row_tiles, %tile_rows) : tensor<?x32x?x1xf32>
  %packed = linalg.pack %doubled inner_dims_pos = [0, 1] inner_tiles = [%tile_rows, 1] into %packed_init {lowering_config = #config} : tensor<64x32xf32> -> tensor<?x32x?x1xf32>

  iree_tensor_ext.dispatch.tensor.store %packed, %dst_binding, offsets = [0, 0, 0, 0], sizes = [%num_row_tiles, 32, %tile_rows, 1], strides = [1, 1, 1, 1] : tensor<?x32x?x1xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x32x?x1xf32>>{%num_row_tiles, %tile_rows}
  return
}
// Output of the following command, run on the scalable input:
// $ tools/iree-opt scalable.mlir --pass-pipeline="builtin.module(func.func(iree-llvmcpu-tile-root-and-fuse-producer-consumer{tiling-level=vector_common_parallel}, cse))"
// [[8], 1] config; in this output it is attached to the tiled linalg.generic
// (the alias names are swapped relative to the input).
#config = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [[8], 1], vector_reduction = [0, 0]>
// [1, 1] config; in this output it is attached to the tiled linalg.pack.
#config1 = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [1, 1], vector_reduction = [0, 0]>
// Number of row tiles: 64 ceildiv s0, applied below with s0 = 8 * vscale.
#map = affine_map<()[s0] -> (64 ceildiv s0)>
// Operand list for affine.min: min(64 - d0, s0), i.e. the rows left from offset d0, capped at s0.
#map1 = affine_map<(d0)[s0] -> (-d0 + 64, s0)>
// 2-D identity map used by the linalg.generic.
#map2 = affine_map<(d0, d1) -> (d0, d1)>
// Pipeline layout declaring two storage-buffer bindings.
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>
module {
// Tiled-and-fused form of the scalable input: a single scf.forall walks the
// 64x32 iteration space with step (8 * vscale, 1); the linalg.generic and the
// linalg.pack are both placed inside the loop body.
func.func @pack_sve_prod() {
%c0 = arith.constant 0 : index
%c8 = arith.constant 8 : index
// %c8_vscale = 8 * vscale is the inner tile height; %0 = 64 ceildiv %c8_vscale
// is the outer extent of the packed tensor along dim 0.
%vscale = vector.vscale
%c8_vscale = arith.muli %vscale, %c8 : index
%0 = affine.apply #map()[%c8_vscale]
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x32x?x1xf32>>{%0, %c8_vscale}
%3 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>> -> tensor<64x32xf32>
%4 = tensor.empty(%0, %c8_vscale) : tensor<?x32x?x1xf32>
// %arg0 / %arg1 are the row / column offsets of the current tile in the 64x32
// source; %arg2 is the shared packed destination.
%5 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 32) step (%c8_vscale, 1) shared_outs(%arg2 = %4) -> (tensor<?x32x?x1xf32>) {
// Rows in this tile: min(64 - %arg0, 8 * vscale).
%6 = affine.min #map1(%arg0)[%c8_vscale]
// %6 x 1 slice of the source starting at (%arg0, %arg1).
%extracted_slice = tensor.extract_slice %3[%arg0, %arg1] [%6, 1] [1, 1] : tensor<64x32xf32> to tensor<?x1xf32>
%7 = tensor.empty(%6) : tensor<?x1xf32>
// Fused producer: element-wise %in + %in on the current tile.
%8 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice : tensor<?x1xf32>) outs(%7 : tensor<?x1xf32>) attrs = {lowering_config = #config} {
^bb0(%in: f32, %out: f32):
%9 = arith.addf %in, %in : f32
linalg.yield %9 : f32
} -> tensor<?x1xf32>
// NOTE(review): the destination slice starts at %arg0 along the outer-tile
// dimension and spans %0 outer tiles. %arg0 is a row offset into the 64-row
// source (it advances by 8 * vscale), not a tile index, and the outer extent
// of %arg2 is only %0, so for any %arg0 > 0 offset + size exceeds that extent.
// The fixed-width lowering of the same function instead uses the offset
// `%arg0 floordiv 8` with an outer size of 1. This looks like a mis-tiled
// slice for the scalable case -- confirm against the pass before relying on
// this output. The matching parallel_insert_slice below uses the same
// offsets and sizes.
%extracted_slice_0 = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [%0, 1, %c8_vscale, 1] [1, 1, 1, 1] : tensor<?x32x?x1xf32> to tensor<?x1x?x1xf32>
// Packs the %6 x 1 tile using an (8 * vscale) x 1 inner tile; no padding_value
// is present on the op.
%pack = linalg.pack %8 inner_dims_pos = [0, 1] inner_tiles = [%c8_vscale, 1] into %extracted_slice_0 {lowering_config = #config1} : tensor<?x1xf32> -> tensor<?x1x?x1xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %pack into %arg2[%arg0, %arg1, 0, 0] [%0, 1, %c8_vscale, 1] [1, 1, 1, 1] : tensor<?x1x?x1xf32> into tensor<?x32x?x1xf32>
}
}
iree_tensor_ext.dispatch.tensor.store %5, %2, offsets = [0, 0, 0, 0], sizes = [%0, 32, %c8_vscale, 1], strides = [1, 1, 1, 1] : tensor<?x32x?x1xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x32x?x1xf32>>{%0, %c8_vscale}
return
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment