Benchmark comparing IREE's C tokenizer against the best available Rust and Python tokenizer implementations.
Last updated: 2026-02-21. Re-run benchmarks before citing these numbers.
- CPU: AMD EPYC (192 cores, 5391 MHz), 32 KB L1d, 1 MB L2, 32 MB L3
Enable users to provide buffers for transient memory allocation in their functions, with generated query functions to calculate required sizes. This supports the kernel JIT use case where applications need control over transient allocations.
Motivation: we are building a kernel JIT on top of IREE in which users provide IR for their linalg ops; we compile it into dispatches, and our host code schedules it with transient allocations. Users need to control transient memory ahead of time, so we provide size query functions and let them pass storage buffers to functions (making zero allocations in steady state).
| // tools/test/iree-run-module-multi.mlir | |
| func.func public @multi_device_mul( | |
| // Input argument is resident on device_a (tooling defaults to the first device). | |
| %input_a: tensor<4xf32> {iree.abi.affinity = #hal.device.promise<@device_a>} | |
| ) -> ( | |
| // Output result is expected to be on device_a (though not required). | |
| tensor<4xf32> {iree.abi.affinity = #hal.device.promise<@device_a>} | |
| ) { | |
| // Compute on device_a (input is there). |
| set -x | |
| ~/src/iree-build/llvm-project/bin/clang \ | |
| -x c -std=c23 \ | |
| -target amdgcn-amd-amdhsa -march=gfx1100 \ | |
| -nogpulib \ | |
| -fgpu-rdc \ | |
| -fno-short-wchar \ | |
| -fno-ident \ | |
| -Xclang -finclude-default-header \ |
| vm.import @vmvx.add.2d.f32(%lhs_buffer : !vm.buffer, %lhs_offset : i64, %lhs_strides : tuple<i64, i64>, %rhs_buffer : !vm.buffer, %rhs_offset : i64, %rhs_strides : tuple<i64, i64>, %out_buffer : !vm.buffer, %out_offset : i64, %out_strides : tuple<i64, i64>, %sizes : tuple<i64, i64>) | |
| vm.import @vmvx.add.2d.i32(%lhs_buffer : !vm.buffer, %lhs_offset : i64, %lhs_strides : tuple<i64, i64>, %rhs_buffer : !vm.buffer, %rhs_offset : i64, %rhs_strides : tuple<i64, i64>, %out_buffer : !vm.buffer, %out_offset : i64, %out_strides : tuple<i64, i64>, %sizes : tuple<i64, i64>) | |
| vm.import @vmvx.and.2d.i32(%lhs_buffer : !vm.buffer, %lhs_offset : i64, %lhs_strides : tuple<i64, i64>, %rhs_buffer : !vm.buffer, %rhs_offset : i64, %rhs_strides : tuple<i64, i64>, %out_buffer : !vm.buffer, %out_offset : i64, %out_strides : tuple<i64, i64>, %sizes : tuple<i64, i64>) | |
| vm.import @vmvx.div.2d.f32(%lhs_buffer : !vm.buffer, %lhs_offset : i64, %lhs_strides : tuple<i64, i64>, %rhs_buffer : !vm.buffer, %rhs_offset : i64, %rhs_st |
| // -----// IR Dump After TopLevelSCFToCFG //----- // | |
| // Loop-condition helper: yields a rank-0 i1 tensor that is true while | |
| // %arg0 is less than %arg1. The remaining arguments (%arg2-%arg13) are | |
| // loop-carried state threaded through the while op and are unused here. | |
| func.func private @ForwardLoopCond_gFAnjWGSoLs__.167(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<40xf32>, %arg3: tensor<i64>, %arg4: tensor<74x40xf32>, %arg5: tensor<i64>, %arg6: tensor<1x10xf32>, %arg7: tensor<1x10xf32>, %arg8: tensor<5x1x64xf32>, %arg9: tensor<5x1x1xf32>, %arg10: tensor<5x1x1xf32>, %arg11: tensor<5xi64>, %arg12: tensor<5x1x10xf32>, %arg13: tensor<5x1x10xf32>) -> tensor<i1> { | |
| // Scalar less-than comparison (comparison_direction = LT) producing the | |
| // continue flag for the loop. | |
| %0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1> | |
| return %0 : tensor<i1> | |
| } | |
| // -----// IR Dump After MHLOToMHLOPreprocessing //----- // | |
| func.func private @ForwardLoopCond_gFAnjWGSoLs__.167(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<40xf32>, %arg3: tensor<i64>, %arg4: tensor<74x40xf32>, %arg5: tensor<i64>, %arg6: tensor<1x10xf32>, %arg7: tensor<1x10xf32>, %arg8: tensor<5x1x64xf32>, %arg9: tensor<5x1x1xf32>, %arg10: tensor<5x1x1xf32>, %arg11: tensor<5xi64>, %arg |
| { | |
| "name": "(gdb) iree-compile", | |
| "type": "cppdbg", | |
| "request": "launch", | |
| "preLaunchTask": "build-iree-compile", | |
| "program": "${command:cmake.buildDirectory}/tools/iree-compile", | |
| "args": [ | |
| // "-iree-vm-bytecode-module-output-format=annotated-mlir-text", | |
| "-iree-vm-bytecode-source-listing=${workspaceFolder}/../iree-tmp/vm.mlir", | |
| "-iree-vm-emit-polyglot-zip=true", |
| // Copyright 2020 Google LLC | |
| // | |
| // Licensed under the Apache License, Version 2.0 (the "License"); | |
| // you may not use this file except in compliance with the License. | |
| // You may obtain a copy of the License at | |
| // | |
| // https://www.apache.org/licenses/LICENSE-2.0 | |
| // | |
| // Unless required by applicable law or agreed to in writing, software | |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // Simple dispatch of static shapes. | |
| func @staticShapeDispatch(%arg0 : tensor<8x4xf32>) -> tensor<4x8xf32> { | |
| %x = constant 100 : index | |
| %y = constant 50 : index | |
| // %x, %y here are the workgroup counts along a 2D grid to dispatch; backends turn them into 3D XYZ. | |
| %0 = flow.dispatch.workgroups[%x, %y](%arg0) : (tensor<8x4xf32>) -> (tensor<4x8xf32>) = ( | |
| // I/O is modeled in the region as ref arguments that have some special ops available. | |
| %arg : !flow.dispatch.input<8x4xf32>, %ret : !flow.dispatch.output<4x8xf32> | |
| ) { | |
| // Loads a tensor from an input; can be tiled with offsets/sizes/strides. |
| // RUN: iree-opt -allow-unregistered-dialect -split-input-file %s | iree-opt -allow-unregistered-dialect -split-input-file | IreeFileCheck %s | |
| func @workgroups(%arg0 : tensor<?x4xf32>, %arg1 : index) -> tensor<4x?xf32> { | |
| %x = constant 100 : index | |
| %y = constant 50 : index | |
| %0 = flow.dispatch.workgroups[%x, %y](%arg0, %arg1) : (tensor<?x4xf32>, index) -> (tensor<4x?xf32>) = | |
| (%arg0_capture : !flow.dispatch.input<?x4xf32>, %arg1_capture : index, %ret0 : !flow.dispatch.output<4x?xf32>) { | |
| // Query symbolic workgroup info: |