Benchmark comparing IREE's C tokenizer against the best available Rust and Python tokenizer implementations.
Last updated: 2026-02-21. Re-run benchmarks before citing these numbers.
- CPU: AMD EPYC (192 cores, 5391 MHz), 32 KB L1d, 1 MB L2, 32 MB L3
Enable users to provide buffers for transient memory allocation in their functions, with generated query functions to calculate required sizes. This supports the kernel JIT use case where applications need control over transient allocations.
Motivation: we are building a kernel JIT on top of IREE in which users provide IR for their linalg ops; we compile it into dispatches, and our host code schedules it with transient allocations. Users need to control transient memory ahead of time, so we provide size query functions and let them pass storage buffers to functions (making zero allocations in steady state).
| // tools/test/iree-run-module-multi.mlir | |
| func.func public @multi_device_mul( | |
| // Input argument is resident on device_a (tooling defaults to the first device). | |
| %input_a: tensor<4xf32> {iree.abi.affinity = #hal.device.promise<@device_a>} | |
| ) -> ( | |
| // Output result is expected to be on device_a (though not required). | |
| tensor<4xf32> {iree.abi.affinity = #hal.device.promise<@device_a>} | |
| ) { | |
| // Compute on device_a (input is there). |
| set -x | |
| ~/src/iree-build/llvm-project/bin/clang \ | |
| -x c -std=c23 \ | |
| -target amdgcn-amd-amdhsa -march=gfx1100 \ | |
| -nogpulib \ | |
| -fgpu-rdc \ | |
| -fno-short-wchar \ | |
| -fno-ident \ | |
| -Xclang -finclude-default-header \ |
| vm.import @vmvx.add.2d.f32(%lhs_buffer : !vm.buffer, %lhs_offset : i64, %lhs_strides : tuple<i64, i64>, %rhs_buffer : !vm.buffer, %rhs_offset : i64, %rhs_strides : tuple<i64, i64>, %out_buffer : !vm.buffer, %out_offset : i64, %out_strides : tuple<i64, i64>, %sizes : tuple<i64, i64>) | |
| vm.import @vmvx.add.2d.i32(%lhs_buffer : !vm.buffer, %lhs_offset : i64, %lhs_strides : tuple<i64, i64>, %rhs_buffer : !vm.buffer, %rhs_offset : i64, %rhs_strides : tuple<i64, i64>, %out_buffer : !vm.buffer, %out_offset : i64, %out_strides : tuple<i64, i64>, %sizes : tuple<i64, i64>) | |
| vm.import @vmvx.and.2d.i32(%lhs_buffer : !vm.buffer, %lhs_offset : i64, %lhs_strides : tuple<i64, i64>, %rhs_buffer : !vm.buffer, %rhs_offset : i64, %rhs_strides : tuple<i64, i64>, %out_buffer : !vm.buffer, %out_offset : i64, %out_strides : tuple<i64, i64>, %sizes : tuple<i64, i64>) | |
| vm.import @vmvx.div.2d.f32(%lhs_buffer : !vm.buffer, %lhs_offset : i64, %lhs_strides : tuple<i64, i64>, %rhs_buffer : !vm.buffer, %rhs_offset : i64, %rhs_st |
| // -----// IR Dump After TopLevelSCFToCFG //----- // | |
| // Loop-condition helper: yields a rank-0 i1 tensor that is true while | |
| // %arg0 is less than %arg1. The remaining arguments (%arg2-%arg13) are | |
| // loop-carried state threaded through the while op and are unused here. | |
| func.func private @ForwardLoopCond_gFAnjWGSoLs__.167(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<40xf32>, %arg3: tensor<i64>, %arg4: tensor<74x40xf32>, %arg5: tensor<i64>, %arg6: tensor<1x10xf32>, %arg7: tensor<1x10xf32>, %arg8: tensor<5x1x64xf32>, %arg9: tensor<5x1x1xf32>, %arg10: tensor<5x1x1xf32>, %arg11: tensor<5xi64>, %arg12: tensor<5x1x10xf32>, %arg13: tensor<5x1x10xf32>) -> tensor<i1> { | |
| // Scalar less-than comparison (comparison_direction = LT) producing the | |
| // continue flag for the loop. | |
| %0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1> | |
| return %0 : tensor<i1> | |
| } | |
| // -----// IR Dump After MHLOToMHLOPreprocessing //----- // | |
| func.func private @ForwardLoopCond_gFAnjWGSoLs__.167(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<40xf32>, %arg3: tensor<i64>, %arg4: tensor<74x40xf32>, %arg5: tensor<i64>, %arg6: tensor<1x10xf32>, %arg7: tensor<1x10xf32>, %arg8: tensor<5x1x64xf32>, %arg9: tensor<5x1x1xf32>, %arg10: tensor<5x1x1xf32>, %arg11: tensor<5xi64>, %arg |
| { | |
| "name": "(gdb) iree-compile", | |
| "type": "cppdbg", | |
| "request": "launch", | |
| "preLaunchTask": "build-iree-compile", | |
| "program": "${command:cmake.buildDirectory}/tools/iree-compile", | |
| "args": [ | |
| // "-iree-vm-bytecode-module-output-format=annotated-mlir-text", | |
| "-iree-vm-bytecode-source-listing=${workspaceFolder}/../iree-tmp/vm.mlir", | |
| "-iree-vm-emit-polyglot-zip=true", |
| // Copyright 2020 Google LLC | |
| // | |
| // Licensed under the Apache License, Version 2.0 (the "License"); | |
| // you may not use this file except in compliance with the License. | |
| // You may obtain a copy of the License at | |
| // | |
| // https://www.apache.org/licenses/LICENSE-2.0 | |
| // | |
| // Unless required by applicable law or agreed to in writing, software | |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // Simple dispatch of static shapes. | |
| func @staticShapeDispatch(%arg0 : tensor<8x4xf32>) -> tensor<4x8xf32> { | |
| %x = constant 100 : index | |
| %y = constant 50 : index | |
| // %x, %y here are the workgroup counts along a 2D grid to dispatch; backends turn them into 3D XYZ. | |
| %0 = flow.dispatch.workgroups[%x, %y](%arg0) : (tensor<8x4xf32>) -> (tensor<4x8xf32>) = ( | |
| // I/O is modeled in the region as ref arguments that have some special ops available. | |
| %arg : !flow.dispatch.input<8x4xf32>, %ret : !flow.dispatch.output<4x8xf32> | |
| ) { | |
| // Loads a tensor from an input; can be tiled with offsets/sizes/strides. |
| // RUN: iree-opt -allow-unregistered-dialect -split-input-file %s | iree-opt -allow-unregistered-dialect -split-input-file | IreeFileCheck %s | |
| func @workgroups(%arg0 : tensor<?x4xf32>, %arg1 : index) -> tensor<4x?xf32> { | |
| %x = constant 100 : index | |
| %y = constant 50 : index | |
| %0 = flow.dispatch.workgroups[%x, %y](%arg0, %arg1) : (tensor<?x4xf32>, index) -> (tensor<4x?xf32>) = | |
| (%arg0_capture : !flow.dispatch.input<?x4xf32>, %arg1_capture : index, %ret0 : !flow.dispatch.output<4x?xf32>) { | |
| // Query symbolic workgroup info: |