banach-space/fixed.mlir

## fixed.mlir
// Fixed-size reproducer: an elementwise linalg.generic (x + x) whose result is
// packed by linalg.pack with static inner tiles [8, 1]. Every shape and tile
// size in this variant is static, so no vector.vscale-derived values are needed.

// Lowering config attached to the linalg.pack.
#config = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [1, 1], vector_reduction = [0, 0]>
// Lowering config attached to the linalg.generic producer.
#config1 = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [8, 1], vector_reduction = [0, 0]>
// Identity indexing map shared by the generic's input and output.
#map = affine_map<(d0, d1) -> (d0, d1)>
// Two storage-buffer bindings: binding(0) is loaded from, binding(1) is stored to.
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>
func.func @pack_sve_prod() {
  %c0 = arith.constant 0 : index
  // Source (64x32) and packed destination (8x32x8x1) bindings.
  %first = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>>
  %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<8x32x8x1xf32>>
  %2 = iree_tensor_ext.dispatch.tensor.load %first, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>> -> tensor<64x32xf32>
  // Init operands: %3 for the pack result, %4 for the elementwise result.
  %3 = tensor.empty() : tensor<8x32x8x1xf32>
  %4 = tensor.empty() : tensor<64x32xf32>
  // Producer: yields in + in for every element.
  %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<64x32xf32>) outs(%4 : tensor<64x32xf32>) attrs = {lowering_config = #config1} {
  ^bb0(%in: f32, %out: f32):
    %7 = arith.addf %in, %in : f32
    linalg.yield %7 : f32
  } -> tensor<64x32xf32>
  // Consumer: the 64 rows become 8 outer tiles of 8 rows; columns use an inner tile of 1.
  %pack = linalg.pack %5 inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %3 {lowering_config = #config} : tensor<64x32xf32> -> tensor<8x32x8x1xf32>
  iree_tensor_ext.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [8, 32, 8, 1], strides = [1, 1, 1, 1] : tensor<8x32x8x1xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<8x32x8x1xf32>>
  return
}

## fixed_out.mlir
// $ tools/iree-opt fixed.mlir --pass-pipeline="builtin.module(func.func(iree-llvmcpu-tile-root-and-fuse-producer-consumer{tiling-level=vector_common_parallel}, cse))"

// Tiled form of the fixed-size reproducer, as printed by the command above.
// The alias names are swapped relative to the input: here #config is the
// [8, 1] configuration carried by the linalg.generic and #config1 is the
// [1, 1] configuration carried by the linalg.pack.
#config = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [8, 1], vector_reduction = [0, 0]>
#config1 = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [1, 1], vector_reduction = [0, 0]>
#map = affine_map<(d0, d1) -> (d0, d1)>
// Converts a source row offset into an outer-tile index (8 rows per tile).
#map1 = affine_map<(d0) -> (d0 floordiv 8)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>
module {
  func.func @pack_sve_prod() {
    %c0 = arith.constant 0 : index
    %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>>
    %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<8x32x8x1xf32>>
    %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>> -> tensor<64x32xf32>
    %3 = tensor.empty() : tensor<8x32x8x1xf32>
    // One iteration per 8x1 source tile: %arg0 walks rows in steps of 8,
    // %arg1 walks columns in steps of 1; %arg2 is the shared packed result.
    %4 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 32) step (8, 1) shared_outs(%arg2 = %3) -> (tensor<8x32x8x1xf32>) {
      // Fused producer: the generic now runs on the 8x1 slice only.
      %extracted_slice = tensor.extract_slice %2[%arg0, %arg1] [8, 1] [1, 1] : tensor<64x32xf32> to tensor<8x1xf32>
      %5 = tensor.empty() : tensor<8x1xf32>
      %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice : tensor<8x1xf32>) outs(%5 : tensor<8x1xf32>) attrs =  {lowering_config = #config} {
      ^bb0(%in: f32, %out: f32):
        %8 = arith.addf %in, %in : f32
        linalg.yield %8 : f32
      } -> tensor<8x1xf32>
      // Outer-tile index of this iteration: row offset floordiv 8.
      %7 = affine.apply #map1(%arg0)
      // Destination is exactly one packed tile (1x1x8x1) at [%7, %arg1].
      %extracted_slice_0 = tensor.extract_slice %arg2[%7, %arg1, 0, 0] [1, 1, 8, 1] [1, 1, 1, 1] : tensor<8x32x8x1xf32> to tensor<1x1x8x1xf32>
      %pack = linalg.pack %6 inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %extracted_slice_0 {lowering_config = #config1} : tensor<8x1xf32> -> tensor<1x1x8x1xf32>
      scf.forall.in_parallel {
        // Write the packed tile back at the same offsets it was sliced from.
        tensor.parallel_insert_slice %pack into %arg2[%7, %arg1, 0, 0] [1, 1, 8, 1] [1, 1, 1, 1] : tensor<1x1x8x1xf32> into tensor<8x32x8x1xf32>
      }
    }
    iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [8, 32, 8, 1], strides = [1, 1, 1, 1] : tensor<8x32x8x1xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<8x32x8x1xf32>>
    return
  }
}

## scalable.mlir
// Scalable reproducer: the same elementwise generic (x + x) followed by a
// linalg.pack, but the row inner tile is the runtime value 8 * vscale, so the
// packed tensor has dynamic outer-row and inner-row dimensions.

// Lowering config attached to the linalg.pack.
#pack_config = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [1, 1], vector_reduction = [0, 0]>
// Lowering config attached to the linalg.generic; the row size is written [8].
// NOTE(review): presumably the bracket marks the size as scalable (scaled by
// vscale) - confirm against the IREE lowering-config documentation.
#generic_config = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [[8], 1], vector_reduction = [0, 0]>
// Identity indexing map shared by the generic's input and output.
#identity = affine_map<(d0, d1) -> (d0, d1)>
// Number of outer row tiles: 64 rows ceil-divided by the row tile size s0.
#outer_count = affine_map<()[s0] -> (64 ceildiv s0)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>
func.func @pack_sve_prod() {
  %c0 = arith.constant 0 : index
  %c8 = arith.constant 8 : index
  // Row tile size = 8 * vscale.
  %vscale = vector.vscale
  %c8_vscale = arith.muli %vscale, %c8 : index
  %num_outer = affine.apply #outer_count()[%c8_vscale]

  // Source (64x32) and packed destination (?x32x?x1) bindings; the two dynamic
  // destination sizes are %num_outer and %c8_vscale.
  %src = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>>
  %dst = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x32x?x1xf32>>{%num_outer, %c8_vscale}
  %input = iree_tensor_ext.dispatch.tensor.load %src, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>> -> tensor<64x32xf32>

  // Producer: yields x + x for every element.
  %doubled_init = tensor.empty() : tensor<64x32xf32>
  %doubled = linalg.generic {indexing_maps = [#identity, #identity], iterator_types = ["parallel", "parallel"]} ins(%input : tensor<64x32xf32>) outs(%doubled_init : tensor<64x32xf32>) attrs = {lowering_config = #generic_config} {
  ^bb0(%x: f32, %unused: f32):
    %sum = arith.addf %x, %x : f32
    linalg.yield %sum : f32
  } -> tensor<64x32xf32>

  // Consumer: pack rows with the dynamic inner tile %c8_vscale, columns with 1.
  %packed_init = tensor.empty(%num_outer, %c8_vscale) : tensor<?x32x?x1xf32>
  %packed = linalg.pack %doubled inner_dims_pos = [0, 1] inner_tiles = [%c8_vscale, 1] into %packed_init {lowering_config = #pack_config} : tensor<64x32xf32> -> tensor<?x32x?x1xf32>
  iree_tensor_ext.dispatch.tensor.store %packed, %dst, offsets = [0, 0, 0, 0], sizes = [%num_outer, 32, %c8_vscale, 1], strides = [1, 1, 1, 1] : tensor<?x32x?x1xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x32x?x1xf32>>{%num_outer, %c8_vscale}
  return
}

## scalable_out.mlir
// $ tools/iree-opt scalable.mlir --pass-pipeline="builtin.module(func.func(iree-llvmcpu-tile-root-and-fuse-producer-consumer{tiling-level=vector_common_parallel}, cse))"

// Tiled form of the scalable reproducer, as printed by the command above.
// The alias names are swapped relative to the input: here #config is the
// [[8], 1] configuration carried by the linalg.generic and #config1 is the
// [1, 1] configuration carried by the linalg.pack.
#config = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [[8], 1], vector_reduction = [0, 0]>
#config1 = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [1, 1], vector_reduction = [0, 0]>
// Number of outer row tiles: 64 ceil-divided by the row tile size s0.
#map = affine_map<()[s0] -> (64 ceildiv s0)>
// Operands of affine.min below: rows remaining (64 - d0) and the tile size s0.
#map1 = affine_map<(d0)[s0] -> (-d0 + 64, s0)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>
module {
  func.func @pack_sve_prod() {
    %c0 = arith.constant 0 : index
    %c8 = arith.constant 8 : index
    %vscale = vector.vscale
    // Row tile size = 8 * vscale.
    %c8_vscale = arith.muli %vscale, %c8 : index
    // %0 = number of outer row tiles in the packed result.
    %0 = affine.apply #map()[%c8_vscale]
    %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>>
    %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x32x?x1xf32>>{%0, %c8_vscale}
    %3 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>> -> tensor<64x32xf32>
    %4 = tensor.empty(%0, %c8_vscale) : tensor<?x32x?x1xf32>
    // %arg0 walks source rows in steps of %c8_vscale, %arg1 walks columns in
    // steps of 1; %arg2 is the shared packed result.
    %5 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 32) step (%c8_vscale, 1) shared_outs(%arg2 = %4) -> (tensor<?x32x?x1xf32>) {
      // Row count of this tile, clamped to the rows remaining in the source.
      %6 = affine.min #map1(%arg0)[%c8_vscale]
      %extracted_slice = tensor.extract_slice %3[%arg0, %arg1] [%6, 1] [1, 1] : tensor<64x32xf32> to tensor<?x1xf32>
      %7 = tensor.empty(%6) : tensor<?x1xf32>
      // Fused producer: the generic now runs on the %6 x 1 slice only.
      %8 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice : tensor<?x1xf32>) outs(%7 : tensor<?x1xf32>) attrs =  {lowering_config = #config} {
      ^bb0(%in: f32, %out: f32):
        %9 = arith.addf %in, %in : f32
        linalg.yield %9 : f32
      } -> tensor<?x1xf32>
      // NOTE(review): the destination slice is offset by the source row offset
      // %arg0 itself and spans %0 outer tiles. The fixed-size output instead
      // uses (%arg0 floordiv 8) as the outer offset and an outer size of 1.
      // This looks inconsistent with the fixed-size lowering - confirm whether
      // it is the behaviour this reproducer is meant to expose.
      %extracted_slice_0 = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [%0, 1, %c8_vscale, 1] [1, 1, 1, 1] : tensor<?x32x?x1xf32> to tensor<?x1x?x1xf32>
      %pack = linalg.pack %8 inner_dims_pos = [0, 1] inner_tiles = [%c8_vscale, 1] into %extracted_slice_0 {lowering_config = #config1} : tensor<?x1xf32> -> tensor<?x1x?x1xf32>
      scf.forall.in_parallel {
        // Written back with the same offsets and sizes as the slice above.
        tensor.parallel_insert_slice %pack into %arg2[%arg0, %arg1, 0, 0] [%0, 1, %c8_vscale, 1] [1, 1, 1, 1] : tensor<?x1x?x1xf32> into tensor<?x32x?x1xf32>
      }
    }
    iree_tensor_ext.dispatch.tensor.store %5, %2, offsets = [0, 0, 0, 0], sizes = [%0, 32, %c8_vscale, 1], strides = [1, 1, 1, 1] : tensor<?x32x?x1xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x32x?x1xf32>>{%0, %c8_vscale}
    return
  }
}
	// Fixed-size reproducer: an elementwise linalg.generic (x + x) whose result is
	// packed by linalg.pack with static inner tiles [8, 1]. Every shape and tile
	// size in this variant is static, so no vector.vscale-derived values are needed.

	// Lowering config attached to the linalg.pack.
	#config = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [1, 1], vector_reduction = [0, 0]>
	// Lowering config attached to the linalg.generic producer.
	#config1 = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [8, 1], vector_reduction = [0, 0]>
	// Identity indexing map shared by the generic's input and output.
	#map = affine_map<(d0, d1) -> (d0, d1)>
	// Two storage-buffer bindings: binding(0) is loaded from, binding(1) is stored to.
	#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>
	func.func @pack_sve_prod() {
	  %c0 = arith.constant 0 : index
	  // Source (64x32) and packed destination (8x32x8x1) bindings.
	  %first = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>>
	  %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<8x32x8x1xf32>>
	  %2 = iree_tensor_ext.dispatch.tensor.load %first, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>> -> tensor<64x32xf32>
	  // Init operands: %3 for the pack result, %4 for the elementwise result.
	  %3 = tensor.empty() : tensor<8x32x8x1xf32>
	  %4 = tensor.empty() : tensor<64x32xf32>
	  // Producer: yields in + in for every element.
	  %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<64x32xf32>) outs(%4 : tensor<64x32xf32>) attrs = {lowering_config = #config1} {
	  ^bb0(%in: f32, %out: f32):
	    %7 = arith.addf %in, %in : f32
	    linalg.yield %7 : f32
	  } -> tensor<64x32xf32>
	  // Consumer: the 64 rows become 8 outer tiles of 8 rows; columns use an inner tile of 1.
	  %pack = linalg.pack %5 inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %3 {lowering_config = #config} : tensor<64x32xf32> -> tensor<8x32x8x1xf32>
	  iree_tensor_ext.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [8, 32, 8, 1], strides = [1, 1, 1, 1] : tensor<8x32x8x1xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<8x32x8x1xf32>>
	  return
	}
	// $ tools/iree-opt fixed.mlir --pass-pipeline="builtin.module(func.func(iree-llvmcpu-tile-root-and-fuse-producer-consumer{tiling-level=vector_common_parallel}, cse))"

	// Tiled form of the fixed-size reproducer, as printed by the command above.
	// Here #config is the [8, 1] configuration carried by the linalg.generic and
	// #config1 is the [1, 1] configuration carried by the linalg.pack.
	#config = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [8, 1], vector_reduction = [0, 0]>
	#config1 = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [1, 1], vector_reduction = [0, 0]>
	#map = affine_map<(d0, d1) -> (d0, d1)>
	// Converts a source row offset into an outer-tile index (8 rows per tile).
	#map1 = affine_map<(d0) -> (d0 floordiv 8)>
	#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>
	module {
	func.func @pack_sve_prod() {
	%c0 = arith.constant 0 : index
	%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>>
	%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<8x32x8x1xf32>>
	%2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>> -> tensor<64x32xf32>
	%3 = tensor.empty() : tensor<8x32x8x1xf32>
	// One iteration per 8x1 source tile: %arg0 walks rows in steps of 8,
	// %arg1 walks columns in steps of 1; %arg2 is the shared packed result.
	%4 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 32) step (8, 1) shared_outs(%arg2 = %3) -> (tensor<8x32x8x1xf32>) {
	// Fused producer: the generic now runs on the 8x1 slice only.
	%extracted_slice = tensor.extract_slice %2[%arg0, %arg1] [8, 1] [1, 1] : tensor<64x32xf32> to tensor<8x1xf32>
	%5 = tensor.empty() : tensor<8x1xf32>
	%6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice : tensor<8x1xf32>) outs(%5 : tensor<8x1xf32>) attrs = {lowering_config = #config} {
	^bb0(%in: f32, %out: f32):
	%8 = arith.addf %in, %in : f32
	linalg.yield %8 : f32
	} -> tensor<8x1xf32>
	// Outer-tile index of this iteration: row offset floordiv 8.
	%7 = affine.apply #map1(%arg0)
	// Destination is exactly one packed tile (1x1x8x1) at [%7, %arg1].
	%extracted_slice_0 = tensor.extract_slice %arg2[%7, %arg1, 0, 0] [1, 1, 8, 1] [1, 1, 1, 1] : tensor<8x32x8x1xf32> to tensor<1x1x8x1xf32>
	%pack = linalg.pack %6 inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %extracted_slice_0 {lowering_config = #config1} : tensor<8x1xf32> -> tensor<1x1x8x1xf32>
	scf.forall.in_parallel {
	// Write the packed tile back at the same offsets it was sliced from.
	tensor.parallel_insert_slice %pack into %arg2[%7, %arg1, 0, 0] [1, 1, 8, 1] [1, 1, 1, 1] : tensor<1x1x8x1xf32> into tensor<8x32x8x1xf32>
	}
	}
	iree_tensor_ext.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [8, 32, 8, 1], strides = [1, 1, 1, 1] : tensor<8x32x8x1xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<8x32x8x1xf32>>
	return
	}
	}
	// $ tools/iree-opt scalable.mlir --pass-pipeline="builtin.module(func.func(iree-llvmcpu-tile-root-and-fuse-producer-consumer{tiling-level=vector_common_parallel}, cse))"

	// Tiled form of the scalable reproducer, as printed by the command above.
	// Here #config is the [[8], 1] configuration carried by the linalg.generic
	// and #config1 is the [1, 1] configuration carried by the linalg.pack.
	#config = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [[8], 1], vector_reduction = [0, 0]>
	#config1 = #iree_cpu.lowering_config<distribution = [0, 0], vector_common_parallel = [1, 1], vector_reduction = [0, 0]>
	// Number of outer row tiles: 64 ceil-divided by the row tile size s0.
	#map = affine_map<()[s0] -> (64 ceildiv s0)>
	// Operands of affine.min below: rows remaining (64 - d0) and the tile size s0.
	#map1 = affine_map<(d0)[s0] -> (-d0 + 64, s0)>
	#map2 = affine_map<(d0, d1) -> (d0, d1)>
	#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>
	module {
	func.func @pack_sve_prod() {
	%c0 = arith.constant 0 : index
	%c8 = arith.constant 8 : index
	%vscale = vector.vscale
	// Row tile size = 8 * vscale.
	%c8_vscale = arith.muli %vscale, %c8 : index
	// %0 = number of outer row tiles in the packed result.
	%0 = affine.apply #map()[%c8_vscale]
	%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>>
	%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x32x?x1xf32>>{%0, %c8_vscale}
	%3 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x32xf32>> -> tensor<64x32xf32>
	%4 = tensor.empty(%0, %c8_vscale) : tensor<?x32x?x1xf32>
	// %arg0 walks source rows in steps of %c8_vscale, %arg1 walks columns in
	// steps of 1; %arg2 is the shared packed result.
	%5 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 32) step (%c8_vscale, 1) shared_outs(%arg2 = %4) -> (tensor<?x32x?x1xf32>) {
	// Row count of this tile, clamped to the rows remaining in the source.
	%6 = affine.min #map1(%arg0)[%c8_vscale]
	%extracted_slice = tensor.extract_slice %3[%arg0, %arg1] [%6, 1] [1, 1] : tensor<64x32xf32> to tensor<?x1xf32>
	%7 = tensor.empty(%6) : tensor<?x1xf32>
	// Fused producer: the generic now runs on the %6 x 1 slice only.
	%8 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice : tensor<?x1xf32>) outs(%7 : tensor<?x1xf32>) attrs = {lowering_config = #config} {
	^bb0(%in: f32, %out: f32):
	%9 = arith.addf %in, %in : f32
	linalg.yield %9 : f32
	} -> tensor<?x1xf32>
	// NOTE(review): the destination slice is offset by the source row offset
	// %arg0 itself and spans %0 outer tiles. The fixed-size output instead
	// uses (%arg0 floordiv 8) as the outer offset and an outer size of 1.
	// This looks inconsistent with the fixed-size lowering - confirm whether
	// it is the behaviour this reproducer is meant to expose.
	%extracted_slice_0 = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [%0, 1, %c8_vscale, 1] [1, 1, 1, 1] : tensor<?x32x?x1xf32> to tensor<?x1x?x1xf32>
	%pack = linalg.pack %8 inner_dims_pos = [0, 1] inner_tiles = [%c8_vscale, 1] into %extracted_slice_0 {lowering_config = #config1} : tensor<?x1xf32> -> tensor<?x1x?x1xf32>
	scf.forall.in_parallel {
	// Written back with the same offsets and sizes as the slice above.
	tensor.parallel_insert_slice %pack into %arg2[%arg0, %arg1, 0, 0] [%0, 1, %c8_vscale, 1] [1, 1, 1, 1] : tensor<?x1x?x1xf32> into tensor<?x32x?x1xf32>
	}
	}
	iree_tensor_ext.dispatch.tensor.store %5, %2, offsets = [0, 0, 0, 0], sizes = [%0, 32, %c8_vscale, 1], strides = [1, 1, 1, 1] : tensor<?x32x?x1xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x32x?x1xf32>>{%0, %c8_vscale}
	return
	}
	}