llandsmeer/build.sh

## build.sh
set -ex

. ../env/bin/activate

python3 jax_test.py

iree-opt ./jax.stablehlo.mlir \
    --iree-stablehlo-input-transformation-pipeline  \
    --convert-scf-to-cf \
    > jax.linalg.mlir

streamhls-opt ./jax.linalg.mlir \
    -streamhls-host-pipeline \
    > host.mlir

streamhls-translate ./host.mlir \
    -emit-vivado-hls \
    -vitis-hls-weights-dir=data \
    -vitis-hls-is-host=true \
    -o host_tb.cpp

# put top-func=main
streamhls-opt jax.linalg.mlir \
    -streamhls-kernel-pipeline="top-func=main \
      graph-file=graph\
      report-file=report\
      optimize-schedule=1\
      parallelize-nodes=1\
      combined-optimization=0\
      board-dsps=1024 \
      tiling-limit=16 \
      time-limit-minutes=10 \
      bufferize-func-args=0 \
      optimize-conv-reuse=0 \
      minimize-on-chip-buffers=0 \
      debug-point=14" > kernel.mlir


streamhls-translate \
    kernel.mlir \
    -emit-vivado-hls \
    -o kernel.cpp

## jax_test.py

from jax import numpy as jnp
from jax import jit
import jax

def f(rs):
    o = jax.vmap(lambda r:
                    jax.lax.scan(lambda x, _:
                                 (jax.lax.select(
                                         x > 0.5,
                                         -x,
                                         r*x*(1-x)),
                                  x)
                                 , 0.5, length=20, unroll=True)[0]
                    )(rs)
    return o
    # return jax.lax.select(x > 3, x - 5, x + 5)
    r# eturn x + 1

x = jnp.ones((4,))

mlir = jax.jit(f).lower(x).as_text()

with open('jax.stablehlo.mlir', 'w') as f:
    print(mlir.replace('@main(', '@forward('), file=f)
print(mlir)

## z_jax.linalg.mlir
#map = affine_map<(d0) -> (d0)>
module @jit_f {
  func.func public @forward(%arg0: tensor<4xf32>) -> tensor<4xf32> {
    %cst = arith.constant -5.000000e-01 : f32
    %false = arith.constant false
    %cst_0 = arith.constant 1.000000e+00 : f32
    %cst_1 = arith.constant 5.000000e-01 : f32
    %0 = tensor.empty() : tensor<4xi1>
    %1 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel"]} outs(%0 : tensor<4xi1>) {
    ^bb0(%out: i1):
      linalg.yield %false : i1
    } -> tensor<4xi1>
    %2 = tensor.empty() : tensor<4xf32>
    %3 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel"]} outs(%2 : tensor<4xf32>) {
    ^bb0(%out: f32):
      linalg.yield %cst : f32
    } -> tensor<4xf32>
    %4 = tensor.empty() : tensor<4xf32>
    %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg0 : tensor<4xf32>) outs(%4 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.mulf %in, %cst_1 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %6 = tensor.empty() : tensor<4xf32>
    %7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%5 : tensor<4xf32>) outs(%6 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.mulf %in, %cst_1 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %8 = tensor.empty() : tensor<4xf32>
    %9 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%1, %3, %7 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%8 : tensor<4xf32>) {
    ^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32):
      %238 = arith.select %in, %in_2, %in_3 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %10 = tensor.empty() : tensor<4xi1>
    %11 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%9 : tensor<4xf32>) outs(%10 : tensor<4xi1>) {
    ^bb0(%in: f32, %out: i1):
      %238 = arith.cmpf ogt, %in, %cst_1 : f32
      linalg.yield %238 : i1
    } -> tensor<4xi1>
    %12 = tensor.empty() : tensor<4xf32>
    %13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%9 : tensor<4xf32>) outs(%12 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.negf %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %14 = tensor.empty() : tensor<4xf32>
    %15 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %9 : tensor<4xf32>, tensor<4xf32>) outs(%14 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %16 = tensor.empty() : tensor<4xf32>
    %17 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%9 : tensor<4xf32>) outs(%16 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.subf %cst_0, %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %18 = tensor.empty() : tensor<4xf32>
    %19 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%15, %17 : tensor<4xf32>, tensor<4xf32>) outs(%18 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %20 = tensor.empty() : tensor<4xf32>
    %21 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%11, %13, %19 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%20 : tensor<4xf32>) {
    ^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32):
      %238 = arith.select %in, %in_2, %in_3 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %22 = tensor.empty() : tensor<4xi1>
    %23 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%21 : tensor<4xf32>) outs(%22 : tensor<4xi1>) {
    ^bb0(%in: f32, %out: i1):
      %238 = arith.cmpf ogt, %in, %cst_1 : f32
      linalg.yield %238 : i1
    } -> tensor<4xi1>
    %24 = tensor.empty() : tensor<4xf32>
    %25 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%21 : tensor<4xf32>) outs(%24 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.negf %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %26 = tensor.empty() : tensor<4xf32>
    %27 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %21 : tensor<4xf32>, tensor<4xf32>) outs(%26 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %28 = tensor.empty() : tensor<4xf32>
    %29 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%21 : tensor<4xf32>) outs(%28 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.subf %cst_0, %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %30 = tensor.empty() : tensor<4xf32>
    %31 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%27, %29 : tensor<4xf32>, tensor<4xf32>) outs(%30 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %32 = tensor.empty() : tensor<4xf32>
    %33 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%23, %25, %31 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%32 : tensor<4xf32>) {
    ^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32):
      %238 = arith.select %in, %in_2, %in_3 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %34 = tensor.empty() : tensor<4xi1>
    %35 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%33 : tensor<4xf32>) outs(%34 : tensor<4xi1>) {
    ^bb0(%in: f32, %out: i1):
      %238 = arith.cmpf ogt, %in, %cst_1 : f32
      linalg.yield %238 : i1
    } -> tensor<4xi1>
    %36 = tensor.empty() : tensor<4xf32>
    %37 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%33 : tensor<4xf32>) outs(%36 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.negf %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %38 = tensor.empty() : tensor<4xf32>
    %39 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %33 : tensor<4xf32>, tensor<4xf32>) outs(%38 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %40 = tensor.empty() : tensor<4xf32>
    %41 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%33 : tensor<4xf32>) outs(%40 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.subf %cst_0, %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %42 = tensor.empty() : tensor<4xf32>
    %43 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%39, %41 : tensor<4xf32>, tensor<4xf32>) outs(%42 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %44 = tensor.empty() : tensor<4xf32>
    %45 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%35, %37, %43 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%44 : tensor<4xf32>) {
    ^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32):
      %238 = arith.select %in, %in_2, %in_3 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %46 = tensor.empty() : tensor<4xi1>
    %47 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%45 : tensor<4xf32>) outs(%46 : tensor<4xi1>) {
    ^bb0(%in: f32, %out: i1):
      %238 = arith.cmpf ogt, %in, %cst_1 : f32
      linalg.yield %238 : i1
    } -> tensor<4xi1>
    %48 = tensor.empty() : tensor<4xf32>
    %49 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%45 : tensor<4xf32>) outs(%48 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.negf %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %50 = tensor.empty() : tensor<4xf32>
    %51 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %45 : tensor<4xf32>, tensor<4xf32>) outs(%50 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %52 = tensor.empty() : tensor<4xf32>
    %53 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%45 : tensor<4xf32>) outs(%52 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.subf %cst_0, %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %54 = tensor.empty() : tensor<4xf32>
    %55 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%51, %53 : tensor<4xf32>, tensor<4xf32>) outs(%54 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %56 = tensor.empty() : tensor<4xf32>
    %57 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%47, %49, %55 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%56 : tensor<4xf32>) {
    ^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32):
      %238 = arith.select %in, %in_2, %in_3 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %58 = tensor.empty() : tensor<4xi1>
    %59 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%57 : tensor<4xf32>) outs(%58 : tensor<4xi1>) {
    ^bb0(%in: f32, %out: i1):
      %238 = arith.cmpf ogt, %in, %cst_1 : f32
      linalg.yield %238 : i1
    } -> tensor<4xi1>
    %60 = tensor.empty() : tensor<4xf32>
    %61 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%57 : tensor<4xf32>) outs(%60 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.negf %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %62 = tensor.empty() : tensor<4xf32>
    %63 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %57 : tensor<4xf32>, tensor<4xf32>) outs(%62 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %64 = tensor.empty() : tensor<4xf32>
    %65 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%57 : tensor<4xf32>) outs(%64 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.subf %cst_0, %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %66 = tensor.empty() : tensor<4xf32>
    %67 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%63, %65 : tensor<4xf32>, tensor<4xf32>) outs(%66 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %68 = tensor.empty() : tensor<4xf32>
    %69 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%59, %61, %67 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%68 : tensor<4xf32>) {
    ^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32):
      %238 = arith.select %in, %in_2, %in_3 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %70 = tensor.empty() : tensor<4xi1>
    %71 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%69 : tensor<4xf32>) outs(%70 : tensor<4xi1>) {
    ^bb0(%in: f32, %out: i1):
      %238 = arith.cmpf ogt, %in, %cst_1 : f32
      linalg.yield %238 : i1
    } -> tensor<4xi1>
    %72 = tensor.empty() : tensor<4xf32>
    %73 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%69 : tensor<4xf32>) outs(%72 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.negf %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %74 = tensor.empty() : tensor<4xf32>
    %75 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %69 : tensor<4xf32>, tensor<4xf32>) outs(%74 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %76 = tensor.empty() : tensor<4xf32>
    %77 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%69 : tensor<4xf32>) outs(%76 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.subf %cst_0, %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %78 = tensor.empty() : tensor<4xf32>
    %79 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%75, %77 : tensor<4xf32>, tensor<4xf32>) outs(%78 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %80 = tensor.empty() : tensor<4xf32>
    %81 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%71, %73, %79 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%80 : tensor<4xf32>) {
    ^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32):
      %238 = arith.select %in, %in_2, %in_3 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %82 = tensor.empty() : tensor<4xi1>
    %83 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%81 : tensor<4xf32>) outs(%82 : tensor<4xi1>) {
    ^bb0(%in: f32, %out: i1):
      %238 = arith.cmpf ogt, %in, %cst_1 : f32
      linalg.yield %238 : i1
    } -> tensor<4xi1>
    %84 = tensor.empty() : tensor<4xf32>
    %85 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%81 : tensor<4xf32>) outs(%84 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.negf %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %86 = tensor.empty() : tensor<4xf32>
    %87 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %81 : tensor<4xf32>, tensor<4xf32>) outs(%86 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %88 = tensor.empty() : tensor<4xf32>
    %89 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%81 : tensor<4xf32>) outs(%88 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.subf %cst_0, %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %90 = tensor.empty() : tensor<4xf32>
    %91 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%87, %89 : tensor<4xf32>, tensor<4xf32>) outs(%90 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %92 = tensor.empty() : tensor<4xf32>
    %93 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%83, %85, %91 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%92 : tensor<4xf32>) {
    ^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32):
      %238 = arith.select %in, %in_2, %in_3 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %94 = tensor.empty() : tensor<4xi1>
    %95 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%93 : tensor<4xf32>) outs(%94 : tensor<4xi1>) {
    ^bb0(%in: f32, %out: i1):
      %238 = arith.cmpf ogt, %in, %cst_1 : f32
      linalg.yield %238 : i1
    } -> tensor<4xi1>
    %96 = tensor.empty() : tensor<4xf32>
    %97 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%93 : tensor<4xf32>) outs(%96 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.negf %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %98 = tensor.empty() : tensor<4xf32>
    %99 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %93 : tensor<4xf32>, tensor<4xf32>) outs(%98 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %100 = tensor.empty() : tensor<4xf32>
    %101 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%93 : tensor<4xf32>) outs(%100 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.subf %cst_0, %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %102 = tensor.empty() : tensor<4xf32>
    %103 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%99, %101 : tensor<4xf32>, tensor<4xf32>) outs(%102 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %104 = tensor.empty() : tensor<4xf32>
    %105 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%95, %97, %103 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%104 : tensor<4xf32>) {
    ^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32):
      %238 = arith.select %in, %in_2, %in_3 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %106 = tensor.empty() : tensor<4xi1>
    %107 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%105 : tensor<4xf32>) outs(%106 : tensor<4xi1>) {
    ^bb0(%in: f32, %out: i1):
      %238 = arith.cmpf ogt, %in, %cst_1 : f32
      linalg.yield %238 : i1
    } -> tensor<4xi1>
    %108 = tensor.empty() : tensor<4xf32>
    %109 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%105 : tensor<4xf32>) outs(%108 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.negf %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %110 = tensor.empty() : tensor<4xf32>
    %111 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %105 : tensor<4xf32>, tensor<4xf32>) outs(%110 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %112 = tensor.empty() : tensor<4xf32>
    %113 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%105 : tensor<4xf32>) outs(%112 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.subf %cst_0, %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %114 = tensor.empty() : tensor<4xf32>
    %115 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%111, %113 : tensor<4xf32>, tensor<4xf32>) outs(%114 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %116 = tensor.empty() : tensor<4xf32>
    %117 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%107, %109, %115 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%116 : tensor<4xf32>) {
    ^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32):
      %238 = arith.select %in, %in_2, %in_3 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %118 = tensor.empty() : tensor<4xi1>
    %119 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%117 : tensor<4xf32>) outs(%118 : tensor<4xi1>) {
    ^bb0(%in: f32, %out: i1):
      %238 = arith.cmpf ogt, %in, %cst_1 : f32
      linalg.yield %238 : i1
    } -> tensor<4xi1>
    %120 = tensor.empty() : tensor<4xf32>
    %121 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%117 : tensor<4xf32>) outs(%120 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.negf %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %122 = tensor.empty() : tensor<4xf32>
    %123 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %117 : tensor<4xf32>, tensor<4xf32>) outs(%122 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %124 = tensor.empty() : tensor<4xf32>
    %125 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%117 : tensor<4xf32>) outs(%124 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.subf %cst_0, %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %126 = tensor.empty() : tensor<4xf32>
    %127 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%123, %125 : tensor<4xf32>, tensor<4xf32>) outs(%126 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %128 = tensor.empty() : tensor<4xf32>
    %129 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%119, %121, %127 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%128 : tensor<4xf32>) {
    ^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32):
      %238 = arith.select %in, %in_2, %in_3 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %130 = tensor.empty() : tensor<4xi1>
    %131 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%129 : tensor<4xf32>) outs(%130 : tensor<4xi1>) {
    ^bb0(%in: f32, %out: i1):
      %238 = arith.cmpf ogt, %in, %cst_1 : f32
      linalg.yield %238 : i1
    } -> tensor<4xi1>
    %132 = tensor.empty() : tensor<4xf32>
    %133 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%129 : tensor<4xf32>) outs(%132 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.negf %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %134 = tensor.empty() : tensor<4xf32>
    %135 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %129 : tensor<4xf32>, tensor<4xf32>) outs(%134 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %136 = tensor.empty() : tensor<4xf32>
    %137 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%129 : tensor<4xf32>) outs(%136 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.subf %cst_0, %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %138 = tensor.empty() : tensor<4xf32>
    %139 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%135, %137 : tensor<4xf32>, tensor<4xf32>) outs(%138 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %140 = tensor.empty() : tensor<4xf32>
    %141 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%131, %133, %139 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%140 : tensor<4xf32>) {
    ^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32):
      %238 = arith.select %in, %in_2, %in_3 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %142 = tensor.empty() : tensor<4xi1>
    %143 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%141 : tensor<4xf32>) outs(%142 : tensor<4xi1>) {
    ^bb0(%in: f32, %out: i1):
      %238 = arith.cmpf ogt, %in, %cst_1 : f32
      linalg.yield %238 : i1
    } -> tensor<4xi1>
    %144 = tensor.empty() : tensor<4xf32>
    %145 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%141 : tensor<4xf32>) outs(%144 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.negf %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %146 = tensor.empty() : tensor<4xf32>
    %147 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %141 : tensor<4xf32>, tensor<4xf32>) outs(%146 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %148 = tensor.empty() : tensor<4xf32>
    %149 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%141 : tensor<4xf32>) outs(%148 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.subf %cst_0, %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %150 = tensor.empty() : tensor<4xf32>
    %151 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%147, %149 : tensor<4xf32>, tensor<4xf32>) outs(%150 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %152 = tensor.empty() : tensor<4xf32>
    %153 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%143, %145, %151 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%152 : tensor<4xf32>) {
    ^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32):
      %238 = arith.select %in, %in_2, %in_3 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %154 = tensor.empty() : tensor<4xi1>
    %155 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%153 : tensor<4xf32>) outs(%154 : tensor<4xi1>) {
    ^bb0(%in: f32, %out: i1):
      %238 = arith.cmpf ogt, %in, %cst_1 : f32
      linalg.yield %238 : i1
    } -> tensor<4xi1>
    %156 = tensor.empty() : tensor<4xf32>
    %157 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%153 : tensor<4xf32>) outs(%156 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.negf %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %158 = tensor.empty() : tensor<4xf32>
    %159 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %153 : tensor<4xf32>, tensor<4xf32>) outs(%158 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %160 = tensor.empty() : tensor<4xf32>
    %161 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%153 : tensor<4xf32>) outs(%160 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.subf %cst_0, %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %162 = tensor.empty() : tensor<4xf32>
    %163 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%159, %161 : tensor<4xf32>, tensor<4xf32>) outs(%162 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %164 = tensor.empty() : tensor<4xf32>
    %165 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%155, %157, %163 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%164 : tensor<4xf32>) {
    ^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32):
      %238 = arith.select %in, %in_2, %in_3 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %166 = tensor.empty() : tensor<4xi1>
    %167 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%165 : tensor<4xf32>) outs(%166 : tensor<4xi1>) {
    ^bb0(%in: f32, %out: i1):
      %238 = arith.cmpf ogt, %in, %cst_1 : f32
      linalg.yield %238 : i1
    } -> tensor<4xi1>
    %168 = tensor.empty() : tensor<4xf32>
    %169 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%165 : tensor<4xf32>) outs(%168 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.negf %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %170 = tensor.empty() : tensor<4xf32>
    %171 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %165 : tensor<4xf32>, tensor<4xf32>) outs(%170 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %172 = tensor.empty() : tensor<4xf32>
    %173 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%165 : tensor<4xf32>) outs(%172 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.subf %cst_0, %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %174 = tensor.empty() : tensor<4xf32>
    %175 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%171, %173 : tensor<4xf32>, tensor<4xf32>) outs(%174 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %176 = tensor.empty() : tensor<4xf32>
    %177 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%167, %169, %175 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%176 : tensor<4xf32>) {
    ^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32):
      %238 = arith.select %in, %in_2, %in_3 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %178 = tensor.empty() : tensor<4xi1>
    %179 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%177 : tensor<4xf32>) outs(%178 : tensor<4xi1>) {
    ^bb0(%in: f32, %out: i1):
      %238 = arith.cmpf ogt, %in, %cst_1 : f32
      linalg.yield %238 : i1
    } -> tensor<4xi1>
    %180 = tensor.empty() : tensor<4xf32>
    %181 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%177 : tensor<4xf32>) outs(%180 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.negf %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %182 = tensor.empty() : tensor<4xf32>
    %183 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %177 : tensor<4xf32>, tensor<4xf32>) outs(%182 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %184 = tensor.empty() : tensor<4xf32>
    %185 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%177 : tensor<4xf32>) outs(%184 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.subf %cst_0, %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %186 = tensor.empty() : tensor<4xf32>
    %187 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%183, %185 : tensor<4xf32>, tensor<4xf32>) outs(%186 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %188 = tensor.empty() : tensor<4xf32>
    %189 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%179, %181, %187 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%188 : tensor<4xf32>) {
    ^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32):
      %238 = arith.select %in, %in_2, %in_3 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %190 = tensor.empty() : tensor<4xi1>
    %191 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%189 : tensor<4xf32>) outs(%190 : tensor<4xi1>) {
    ^bb0(%in: f32, %out: i1):
      %238 = arith.cmpf ogt, %in, %cst_1 : f32
      linalg.yield %238 : i1
    } -> tensor<4xi1>
    %192 = tensor.empty() : tensor<4xf32>
    %193 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%189 : tensor<4xf32>) outs(%192 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.negf %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %194 = tensor.empty() : tensor<4xf32>
    %195 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %189 : tensor<4xf32>, tensor<4xf32>) outs(%194 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %196 = tensor.empty() : tensor<4xf32>
    %197 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%189 : tensor<4xf32>) outs(%196 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.subf %cst_0, %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %198 = tensor.empty() : tensor<4xf32>
    %199 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%195, %197 : tensor<4xf32>, tensor<4xf32>) outs(%198 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %200 = tensor.empty() : tensor<4xf32>
    %201 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%191, %193, %199 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%200 : tensor<4xf32>) {
    ^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32):
      %238 = arith.select %in, %in_2, %in_3 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %202 = tensor.empty() : tensor<4xi1>
    %203 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%201 : tensor<4xf32>) outs(%202 : tensor<4xi1>) {
    ^bb0(%in: f32, %out: i1):
      %238 = arith.cmpf ogt, %in, %cst_1 : f32
      linalg.yield %238 : i1
    } -> tensor<4xi1>
    %204 = tensor.empty() : tensor<4xf32>
    %205 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%201 : tensor<4xf32>) outs(%204 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.negf %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %206 = tensor.empty() : tensor<4xf32>
    %207 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %201 : tensor<4xf32>, tensor<4xf32>) outs(%206 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %208 = tensor.empty() : tensor<4xf32>
    %209 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%201 : tensor<4xf32>) outs(%208 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.subf %cst_0, %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %210 = tensor.empty() : tensor<4xf32>
    %211 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%207, %209 : tensor<4xf32>, tensor<4xf32>) outs(%210 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %212 = tensor.empty() : tensor<4xf32>
    %213 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%203, %205, %211 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%212 : tensor<4xf32>) {
    ^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32):
      %238 = arith.select %in, %in_2, %in_3 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %214 = tensor.empty() : tensor<4xi1>
    %215 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%213 : tensor<4xf32>) outs(%214 : tensor<4xi1>) {
    ^bb0(%in: f32, %out: i1):
      %238 = arith.cmpf ogt, %in, %cst_1 : f32
      linalg.yield %238 : i1
    } -> tensor<4xi1>
    %216 = tensor.empty() : tensor<4xf32>
    %217 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%213 : tensor<4xf32>) outs(%216 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.negf %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %218 = tensor.empty() : tensor<4xf32>
    %219 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %213 : tensor<4xf32>, tensor<4xf32>) outs(%218 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %220 = tensor.empty() : tensor<4xf32>
    %221 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%213 : tensor<4xf32>) outs(%220 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.subf %cst_0, %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %222 = tensor.empty() : tensor<4xf32>
    %223 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%219, %221 : tensor<4xf32>, tensor<4xf32>) outs(%222 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %224 = tensor.empty() : tensor<4xf32>
    %225 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%215, %217, %223 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%224 : tensor<4xf32>) {
    ^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32):
      %238 = arith.select %in, %in_2, %in_3 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %226 = tensor.empty() : tensor<4xi1>
    %227 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%225 : tensor<4xf32>) outs(%226 : tensor<4xi1>) {
    ^bb0(%in: f32, %out: i1):
      %238 = arith.cmpf ogt, %in, %cst_1 : f32
      linalg.yield %238 : i1
    } -> tensor<4xi1>
    %228 = tensor.empty() : tensor<4xf32>
    %229 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%225 : tensor<4xf32>) outs(%228 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.negf %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %230 = tensor.empty() : tensor<4xf32>
    %231 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %225 : tensor<4xf32>, tensor<4xf32>) outs(%230 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %232 = tensor.empty() : tensor<4xf32>
    %233 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%225 : tensor<4xf32>) outs(%232 : tensor<4xf32>) {
    ^bb0(%in: f32, %out: f32):
      %238 = arith.subf %cst_0, %in : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %234 = tensor.empty() : tensor<4xf32>
    %235 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%231, %233 : tensor<4xf32>, tensor<4xf32>) outs(%234 : tensor<4xf32>) {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %238 = arith.mulf %in, %in_2 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    %236 = tensor.empty() : tensor<4xf32>
    %237 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel"]} ins(%227, %229, %235 : tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) outs(%236 : tensor<4xf32>) {
    ^bb0(%in: i1, %in_2: f32, %in_3: f32, %out: f32):
      %238 = arith.select %in, %in_2, %in_3 : f32
      linalg.yield %238 : f32
    } -> tensor<4xf32>
    return %237 : tensor<4xf32>
  }
}
	set -ex

	. ../env/bin/activate

	python3 jax_test.py

	iree-opt ./jax.stablehlo.mlir \
	--iree-stablehlo-input-transformation-pipeline \
	--convert-scf-to-cf \
	> jax.linalg.mlir

	streamhls-opt ./jax.linalg.mlir \
	-streamhls-host-pipeline \
	> host.mlir

	streamhls-translate ./host.mlir \
	-emit-vivado-hls \
	-vitis-hls-weights-dir=data \
	-vitis-hls-is-host=true \
	-o host_tb.cpp

	# put top-func=main
	streamhls-opt jax.linalg.mlir \
	-streamhls-kernel-pipeline="top-func=main \
	graph-file=graph\
	report-file=report\
	optimize-schedule=1\
	parallelize-nodes=1\
	combined-optimization=0\
	board-dsps=1024 \
	tiling-limit=16 \
	time-limit-minutes=10 \
	bufferize-func-args=0 \
	optimize-conv-reuse=0 \
	minimize-on-chip-buffers=0 \
	debug-point=14" > kernel.mlir


	streamhls-translate \
	kernel.mlir \
	-emit-vivado-hls \
	-o kernel.cpp

	from jax import numpy as jnp
	from jax import jit
	import jax

	def f(rs):
	o = jax.vmap(lambda r:
	jax.lax.scan(lambda x, _:
	(jax.lax.select(
	x > 0.5,
	-x,
	rx(1-x)),
	x)
	, 0.5, length=20, unroll=True)[0]
	)(rs)
	return o
	# return jax.lax.select(x > 3, x - 5, x + 5)
	r# eturn x + 1

	x = jnp.ones((4,))

	mlir = jax.jit(f).lower(x).as_text()

	with open('jax.stablehlo.mlir', 'w') as f:
	print(mlir.replace('@main(', '@forward('), file=f)
	print(mlir)