Skip to content

Instantly share code, notes, and snippets.

@leonardoalt
Created March 3, 2026 08:17
Show Gist options
  • Select an option

  • Save leonardoalt/09fd3d60bd571851bb656dc53cec0a4b to your computer and use it in GitHub Desktop.

Select an option

Save leonardoalt/09fd3d60bd571851bb656dc53cec0a4b to your computer and use it in GitHub Desktop.
Diffs of womir CUDA files vs OpenVM rv32im originals
--- /home/leo/devel/openvm/extensions/rv32im/circuit/cuda/include/rv32im/adapters/alu.cuh 2026-03-02 14:57:12.776007921 +0100
+++ /home/leo/devel/womir-openvm/extensions/womir_circuit/cuda/include/womir/adapters/alu.cuh 2026-03-02 15:50:33.500343846 +0100
@@ -7,63 +7,81 @@
using namespace riscv;
-template <typename T> struct Rv32BaseAluAdapterCols {
- ExecutionState<T> from_state; // { pub pc: T, pub timestamp: T}
+// WOMIR ExecutionState includes frame pointer (fp) between pc and timestamp.
+template <typename T> struct WomirExecutionState {
+ T pc;
+ T fp;
+ T timestamp;
+};
+
+template <typename T> struct WomirBaseAluAdapterCols {
+ WomirExecutionState<T> from_state;
T rd_ptr;
T rs1_ptr;
T rs2; // Pointer if rs2 was a read, immediate value otherwise
T rs2_as; // 1 if rs2 was a read, 0 if an immediate
+ MemoryReadAuxCols<T> fp_read_aux;
MemoryReadAuxCols<T> reads_aux[2];
MemoryWriteAuxCols<T, RV32_REGISTER_NUM_LIMBS> writes_aux;
};
-struct Rv32BaseAluAdapterRecord {
+struct WomirBaseAluAdapterRecord {
uint32_t from_pc;
+ uint32_t fp;
uint32_t from_timestamp;
uint32_t rd_ptr;
uint32_t rs1_ptr;
uint32_t rs2; // Pointer if rs2 was a read, immediate value otherwise
uint8_t rs2_as; // 1 if rs2 was a read, 0 if an immediate
+ MemoryReadAuxRecord fp_read_aux;
MemoryReadAuxRecord reads_aux[2];
MemoryWriteBytesAuxRecord<RV32_REGISTER_NUM_LIMBS> writes_aux;
};
-struct Rv32BaseAluAdapter {
+struct WomirBaseAluAdapter {
MemoryAuxColsFactory mem_helper;
BitwiseOperationLookup bitwise_lookup;
- __device__ Rv32BaseAluAdapter(
+ __device__ WomirBaseAluAdapter(
VariableRangeChecker range_checker,
BitwiseOperationLookup lookup,
uint32_t timestamp_max_bits
)
: mem_helper(range_checker, timestamp_max_bits), bitwise_lookup(lookup) {}
- __device__ void fill_trace_row(RowSlice row, Rv32BaseAluAdapterRecord record) {
- COL_WRITE_VALUE(row, Rv32BaseAluAdapterCols, from_state.pc, record.from_pc);
- COL_WRITE_VALUE(row, Rv32BaseAluAdapterCols, from_state.timestamp, record.from_timestamp);
-
- COL_WRITE_VALUE(row, Rv32BaseAluAdapterCols, rd_ptr, record.rd_ptr);
- COL_WRITE_VALUE(row, Rv32BaseAluAdapterCols, rs1_ptr, record.rs1_ptr);
- COL_WRITE_VALUE(row, Rv32BaseAluAdapterCols, rs2, record.rs2);
- COL_WRITE_VALUE(row, Rv32BaseAluAdapterCols, rs2_as, record.rs2_as);
+ __device__ void fill_trace_row(RowSlice row, WomirBaseAluAdapterRecord record) {
+ COL_WRITE_VALUE(row, WomirBaseAluAdapterCols, from_state.pc, record.from_pc);
+ COL_WRITE_VALUE(row, WomirBaseAluAdapterCols, from_state.fp, record.fp);
+ COL_WRITE_VALUE(row, WomirBaseAluAdapterCols, from_state.timestamp, record.from_timestamp);
+
+ COL_WRITE_VALUE(row, WomirBaseAluAdapterCols, rd_ptr, record.rd_ptr);
+ COL_WRITE_VALUE(row, WomirBaseAluAdapterCols, rs1_ptr, record.rs1_ptr);
+ COL_WRITE_VALUE(row, WomirBaseAluAdapterCols, rs2, record.rs2);
+ COL_WRITE_VALUE(row, WomirBaseAluAdapterCols, rs2_as, record.rs2_as);
- // Read auxiliary for rs1
+ // Read auxiliary for fp (at from_timestamp + 0)
mem_helper.fill(
- row.slice_from(COL_INDEX(Rv32BaseAluAdapterCols, reads_aux[0])),
- record.reads_aux[0].prev_timestamp,
+ row.slice_from(COL_INDEX(WomirBaseAluAdapterCols, fp_read_aux)),
+ record.fp_read_aux.prev_timestamp,
record.from_timestamp
);
+ // Read auxiliary for rs1 (at from_timestamp + 1)
+ mem_helper.fill(
+ row.slice_from(COL_INDEX(WomirBaseAluAdapterCols, reads_aux[0])),
+ record.reads_aux[0].prev_timestamp,
+ record.from_timestamp + 1
+ );
+
// rs2: register read when rs2_as == RV32_REGISTER_AS (== 1), otherwise immediate.
if (record.rs2_as != 0) {
mem_helper.fill(
- row.slice_from(COL_INDEX(Rv32BaseAluAdapterCols, reads_aux[1])),
+ row.slice_from(COL_INDEX(WomirBaseAluAdapterCols, reads_aux[1])),
record.reads_aux[1].prev_timestamp,
- record.from_timestamp + 1
+ record.from_timestamp + 2
);
} else {
- RowSlice rs2_aux = row.slice_from(COL_INDEX(Rv32BaseAluAdapterCols, reads_aux[1]));
+ RowSlice rs2_aux = row.slice_from(COL_INDEX(WomirBaseAluAdapterCols, reads_aux[1]));
#pragma unroll
for (size_t i = 0; i < sizeof(MemoryReadAuxCols<uint8_t>); i++) {
rs2_aux.write(i, 0);
@@ -73,12 +91,12 @@
}
COL_WRITE_ARRAY(
- row, Rv32BaseAluAdapterCols, writes_aux.prev_data, record.writes_aux.prev_data
+ row, WomirBaseAluAdapterCols, writes_aux.prev_data, record.writes_aux.prev_data
);
mem_helper.fill(
- row.slice_from(COL_INDEX(Rv32BaseAluAdapterCols, writes_aux)),
+ row.slice_from(COL_INDEX(WomirBaseAluAdapterCols, writes_aux)),
record.writes_aux.prev_timestamp,
- record.from_timestamp + 2
+ record.from_timestamp + 3
);
}
-};
\ No newline at end of file
+};
--- /home/leo/devel/openvm/extensions/rv32im/circuit/cuda/src/alu.cu 2026-03-02 14:57:12.776007921 +0100
+++ /home/leo/devel/womir-openvm/extensions/womir_circuit/cuda/src/alu.cu 2026-03-02 15:50:58.651475084 +0100
@@ -2,30 +2,30 @@
#include "primitives/buffer_view.cuh"
#include "primitives/constants.h"
#include "primitives/trace_access.h"
-#include "rv32im/adapters/alu.cuh"
-#include "rv32im/cores/alu.cuh"
+#include "womir/adapters/alu.cuh"
+#include "womir/cores/alu.cuh"
using namespace riscv;
// Concrete type aliases for 32-bit
-using Rv32BaseAluCoreRecord = BaseAluCoreRecord<RV32_REGISTER_NUM_LIMBS>;
-using Rv32BaseAluCore = BaseAluCore<RV32_REGISTER_NUM_LIMBS>;
-template <typename T> using Rv32BaseAluCoreCols = BaseAluCoreCols<T, RV32_REGISTER_NUM_LIMBS>;
-
-template <typename T> struct Rv32BaseAluCols {
- Rv32BaseAluAdapterCols<T> adapter;
- Rv32BaseAluCoreCols<T> core;
+using WomirBaseAluCoreRecord = BaseAluCoreRecord<RV32_REGISTER_NUM_LIMBS>;
+using WomirBaseAluCore = BaseAluCore<RV32_REGISTER_NUM_LIMBS>;
+template <typename T> using WomirBaseAluCoreCols = BaseAluCoreCols<T, RV32_REGISTER_NUM_LIMBS>;
+
+template <typename T> struct WomirBaseAluCols {
+ WomirBaseAluAdapterCols<T> adapter;
+ WomirBaseAluCoreCols<T> core;
};
-struct Rv32BaseAluRecord {
- Rv32BaseAluAdapterRecord adapter;
- Rv32BaseAluCoreRecord core;
+struct WomirBaseAluRecord {
+ WomirBaseAluAdapterRecord adapter;
+ WomirBaseAluCoreRecord core;
};
-__global__ void alu_tracegen(
+__global__ void womir_alu_tracegen(
Fp *d_trace,
size_t height,
- DeviceBufferConstView<Rv32BaseAluRecord> d_records,
+ DeviceBufferConstView<WomirBaseAluRecord> d_records,
uint32_t *d_range_checker_ptr,
size_t range_checker_bins,
uint32_t *d_bitwise_lookup_ptr,
@@ -37,25 +37,25 @@
if (idx < d_records.len()) {
auto const &rec = d_records[idx];
- Rv32BaseAluAdapter adapter(
+ WomirBaseAluAdapter adapter(
VariableRangeChecker(d_range_checker_ptr, range_checker_bins),
BitwiseOperationLookup(d_bitwise_lookup_ptr, bitwise_num_bits),
timestamp_max_bits
);
adapter.fill_trace_row(row, rec.adapter);
- Rv32BaseAluCore core(BitwiseOperationLookup(d_bitwise_lookup_ptr, bitwise_num_bits));
- core.fill_trace_row(row.slice_from(COL_INDEX(Rv32BaseAluCols, core)), rec.core);
+ WomirBaseAluCore core(BitwiseOperationLookup(d_bitwise_lookup_ptr, bitwise_num_bits));
+ core.fill_trace_row(row.slice_from(COL_INDEX(WomirBaseAluCols, core)), rec.core);
} else {
- row.fill_zero(0, sizeof(Rv32BaseAluCols<uint8_t>));
+ row.fill_zero(0, sizeof(WomirBaseAluCols<uint8_t>));
}
}
-extern "C" int _alu_tracegen(
+extern "C" int _womir_alu_tracegen(
Fp *d_trace,
size_t height,
size_t width,
- DeviceBufferConstView<Rv32BaseAluRecord> d_records,
+ DeviceBufferConstView<WomirBaseAluRecord> d_records,
uint32_t *d_range_checker_ptr,
size_t range_checker_bins,
uint32_t *d_bitwise_lookup_ptr,
@@ -64,9 +64,9 @@
) {
assert((height & (height - 1)) == 0);
assert(height >= d_records.len());
- assert(width == sizeof(Rv32BaseAluCols<uint8_t>));
+ assert(width == sizeof(WomirBaseAluCols<uint8_t>));
auto [grid, block] = kernel_launch_params(height);
- alu_tracegen<<<grid, block>>>(
+ womir_alu_tracegen<<<grid, block>>>(
d_trace,
height,
d_records,
@@ -77,4 +77,4 @@
timestamp_max_bits
);
return CHECK_KERNEL();
-}
\ No newline at end of file
+}
--- /home/leo/devel/openvm/extensions/rv32im/circuit/src/base_alu/cuda.rs 2026-03-02 14:57:12.778007934 +0100
+++ /home/leo/devel/womir-openvm/extensions/womir_circuit/src/base_alu/cuda.rs 2026-03-02 16:00:25.733430088 +0100
@@ -9,16 +9,18 @@
base::DeviceMatrix, chip::get_empty_air_proving_ctx, prover_backend::GpuBackend, types::F,
};
use openvm_cuda_common::copy::MemCopyH2D;
-use openvm_stark_backend::{prover::types::AirProvingContext, Chip};
+use openvm_rv32im_circuit::BaseAluCoreCols;
+use openvm_stark_backend::{Chip, prover::types::AirProvingContext};
use crate::{
adapters::{
- Rv32BaseAluAdapterCols, Rv32BaseAluAdapterRecord, RV32_CELL_BITS, RV32_REGISTER_NUM_LIMBS,
+ RV32_CELL_BITS, RV32_REGISTER_NUM_LIMBS, Rv32BaseAluAdapterCols, Rv32BaseAluAdapterRecord,
},
cuda_abi::alu_cuda::tracegen,
- BaseAluCoreCols, BaseAluCoreRecord,
};
+use openvm_rv32im_circuit::BaseAluCoreRecord;
+
#[derive(new)]
pub struct Rv32BaseAluChipGpu {
pub range_checker: Arc<VariableRangeCheckerChipGPU>,
--- /home/leo/devel/openvm/extensions/rv32im/circuit/src/extension/cuda.rs 2026-03-02 14:57:12.779007940 +0100
+++ /home/leo/devel/womir-openvm/extensions/womir_circuit/src/extension/cuda.rs 2026-03-02 15:55:49.012986452 +0100
@@ -1,41 +1,27 @@
-use std::sync::Arc;
-
use openvm_circuit::{
arch::{ChipInventory, ChipInventoryError, DenseRecordArena, VmProverExtension},
system::cuda::extensions::{get_inventory_range_checker, get_or_create_bitwise_op_lookup},
};
-use openvm_circuit_primitives::range_tuple::{RangeTupleCheckerAir, RangeTupleCheckerChipGPU};
use openvm_cuda_backend::{engine::GpuBabyBearPoseidon2Engine, prover_backend::GpuBackend};
use openvm_stark_sdk::config::baby_bear_poseidon2::BabyBearPoseidon2Config;
-use crate::{
- Rv32AuipcAir, Rv32AuipcChipGpu, Rv32BaseAluAir, Rv32BaseAluChipGpu, Rv32BranchEqualAir,
- Rv32BranchEqualChipGpu, Rv32BranchLessThanAir, Rv32BranchLessThanChipGpu, Rv32DivRemAir,
- Rv32DivRemChipGpu, Rv32HintStoreAir, Rv32HintStoreChipGpu, Rv32I, Rv32Io, Rv32JalLuiAir,
- Rv32JalLuiChipGpu, Rv32JalrAir, Rv32JalrChipGpu, Rv32LessThanAir, Rv32LessThanChipGpu,
- Rv32LoadSignExtendAir, Rv32LoadSignExtendChipGpu, Rv32LoadStoreAir, Rv32LoadStoreChipGpu,
- Rv32M, Rv32MulHAir, Rv32MulHChipGpu, Rv32MultiplicationAir, Rv32MultiplicationChipGpu,
- Rv32ShiftAir, Rv32ShiftChipGpu,
-};
+use crate::{Rv32BaseAluAir, Rv32BaseAluChipGpu};
+
+use super::Womir;
-pub struct Rv32ImGpuProverExt;
+pub struct WomirGpuProverExt;
-// This implementation is specific to GpuBackend because the lookup chips
-// (VariableRangeCheckerChipGPU, BitwiseOperationLookupChipGPU) are specific to GpuBackend.
-impl VmProverExtension<GpuBabyBearPoseidon2Engine, DenseRecordArena, Rv32I> for Rv32ImGpuProverExt {
+impl VmProverExtension<GpuBabyBearPoseidon2Engine, DenseRecordArena, Womir> for WomirGpuProverExt {
fn extend_prover(
&self,
- _: &Rv32I,
+ _: &Womir,
inventory: &mut ChipInventory<BabyBearPoseidon2Config, DenseRecordArena, GpuBackend>,
) -> Result<(), ChipInventoryError> {
- let pointer_max_bits = inventory.airs().pointer_max_bits();
let timestamp_max_bits = inventory.timestamp_max_bits();
let range_checker = get_inventory_range_checker(inventory);
let bitwise_lu = get_or_create_bitwise_op_lookup(inventory)?;
- // These calls to next_air are not strictly necessary to construct the chips, but provide a
- // safeguard to ensure that chip construction matches the circuit definition
inventory.next_air::<Rv32BaseAluAir>()?;
let base_alu = Rv32BaseAluChipGpu::new(
range_checker.clone(),
@@ -44,165 +30,7 @@
);
inventory.add_executor_chip(base_alu);
- inventory.next_air::<Rv32LessThanAir>()?;
- let lt = Rv32LessThanChipGpu::new(
- range_checker.clone(),
- bitwise_lu.clone(),
- timestamp_max_bits,
- );
- inventory.add_executor_chip(lt);
-
- inventory.next_air::<Rv32ShiftAir>()?;
- let shift = Rv32ShiftChipGpu::new(
- range_checker.clone(),
- bitwise_lu.clone(),
- timestamp_max_bits,
- );
- inventory.add_executor_chip(shift);
-
- inventory.next_air::<Rv32LoadStoreAir>()?;
- let load_store_chip =
- Rv32LoadStoreChipGpu::new(range_checker.clone(), pointer_max_bits, timestamp_max_bits);
- inventory.add_executor_chip(load_store_chip);
-
- inventory.next_air::<Rv32LoadSignExtendAir>()?;
- let load_sign_extend = Rv32LoadSignExtendChipGpu::new(
- range_checker.clone(),
- pointer_max_bits,
- timestamp_max_bits,
- );
- inventory.add_executor_chip(load_sign_extend);
-
- inventory.next_air::<Rv32BranchEqualAir>()?;
- let beq = Rv32BranchEqualChipGpu::new(range_checker.clone(), timestamp_max_bits);
- inventory.add_executor_chip(beq);
-
- inventory.next_air::<Rv32BranchLessThanAir>()?;
- let blt = Rv32BranchLessThanChipGpu::new(
- range_checker.clone(),
- bitwise_lu.clone(),
- timestamp_max_bits,
- );
- inventory.add_executor_chip(blt);
-
- inventory.next_air::<Rv32JalLuiAir>()?;
- let jal_lui = Rv32JalLuiChipGpu::new(
- range_checker.clone(),
- bitwise_lu.clone(),
- timestamp_max_bits,
- );
- inventory.add_executor_chip(jal_lui);
-
- inventory.next_air::<Rv32JalrAir>()?;
- let jalr = Rv32JalrChipGpu::new(
- range_checker.clone(),
- bitwise_lu.clone(),
- timestamp_max_bits,
- );
- inventory.add_executor_chip(jalr);
-
- inventory.next_air::<Rv32AuipcAir>()?;
- let auipc = Rv32AuipcChipGpu::new(
- range_checker.clone(),
- bitwise_lu.clone(),
- timestamp_max_bits,
- );
- inventory.add_executor_chip(auipc);
-
- Ok(())
- }
-}
-
-// This implementation is specific to GpuBackend because the lookup chips
-// (VariableRangeCheckerChipGPU, BitwiseOperationLookupChipGPU) are specific to GpuBackend.
-impl VmProverExtension<GpuBabyBearPoseidon2Engine, DenseRecordArena, Rv32M> for Rv32ImGpuProverExt {
- fn extend_prover(
- &self,
- extension: &Rv32M,
- inventory: &mut ChipInventory<BabyBearPoseidon2Config, DenseRecordArena, GpuBackend>,
- ) -> Result<(), ChipInventoryError> {
- let pointer_max_bits = inventory.airs().pointer_max_bits();
- let timestamp_max_bits = inventory.timestamp_max_bits();
-
- let range_checker = get_inventory_range_checker(inventory);
- let bitwise_lu = get_or_create_bitwise_op_lookup(inventory)?;
-
- let range_tuple_checker = {
- let existing_chip = inventory
- .find_chip::<Arc<RangeTupleCheckerChipGPU<2>>>()
- .find(|c| {
- c.sizes[0] >= extension.range_tuple_checker_sizes[0]
- && c.sizes[1] >= extension.range_tuple_checker_sizes[1]
- });
- if let Some(chip) = existing_chip {
- chip.clone()
- } else {
- inventory.next_air::<RangeTupleCheckerAir<2>>()?;
- let chip = Arc::new(RangeTupleCheckerChipGPU::new(
- extension.range_tuple_checker_sizes,
- ));
- inventory.add_periphery_chip(chip.clone());
- chip
- }
- };
-
- // These calls to next_air are not strictly necessary to construct the chips, but provide a
- // safeguard to ensure that chip construction matches the circuit definition
- inventory.next_air::<Rv32MultiplicationAir>()?;
- let mult = Rv32MultiplicationChipGpu::new(
- range_checker.clone(),
- range_tuple_checker.clone(),
- timestamp_max_bits,
- );
- inventory.add_executor_chip(mult);
-
- inventory.next_air::<Rv32MulHAir>()?;
- let mul_h = Rv32MulHChipGpu::new(
- range_checker.clone(),
- bitwise_lu.clone(),
- range_tuple_checker.clone(),
- timestamp_max_bits,
- );
- inventory.add_executor_chip(mul_h);
-
- inventory.next_air::<Rv32DivRemAir>()?;
- let div_rem = Rv32DivRemChipGpu::new(
- range_checker.clone(),
- bitwise_lu.clone(),
- range_tuple_checker.clone(),
- pointer_max_bits,
- timestamp_max_bits,
- );
- inventory.add_executor_chip(div_rem);
-
- Ok(())
- }
-}
-
-// This implementation is specific to GpuBackend because the lookup chips
-// (VariableRangeCheckerChipGPU, BitwiseOperationLookupChipGPU) are specific to GpuBackend.
-impl VmProverExtension<GpuBabyBearPoseidon2Engine, DenseRecordArena, Rv32Io>
- for Rv32ImGpuProverExt
-{
- fn extend_prover(
- &self,
- _: &Rv32Io,
- inventory: &mut ChipInventory<BabyBearPoseidon2Config, DenseRecordArena, GpuBackend>,
- ) -> Result<(), ChipInventoryError> {
- let pointer_max_bits = inventory.airs().pointer_max_bits();
- let timestamp_max_bits = inventory.timestamp_max_bits();
-
- let range_checker = get_inventory_range_checker(inventory);
- let bitwise_lu = get_or_create_bitwise_op_lookup(inventory)?;
-
- inventory.next_air::<Rv32HintStoreAir>()?;
- let hint_store = Rv32HintStoreChipGpu::new(
- range_checker.clone(),
- bitwise_lu.clone(),
- pointer_max_bits,
- timestamp_max_bits,
- );
- inventory.add_executor_chip(hint_store);
+ // TODO: Add more WOMIR GPU chips here (64-bit ALU, mul, div, etc.)
Ok(())
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment