Raph Levien raphlinus

## half_experiment.patch
commit 49309841ba5cb7db0989a87c0a1d4b2947f02314
Merge: 3abe852 de4ec3f
Author: Raph Levien <raph@google.com>
Date:   Wed Jun 11 08:12:42 2025 -0700

    On gen2: half experiment

diff --cc fearless_simd/src/generated/simd_trait.rs
index 875ccad,875ccad..544068b
--- a/fearless_simd/src/generated/simd_trait.rs

## kbound.rs
use kurbo::{CubicBez, ParamCurveDeriv};

/// Compute bounds on curvature
pub fn kbound(c: CubicBez) -> (f64, f64) {
    let q = c.deriv();
    let p1xp0 = q.p1.to_vec2().cross(q.p0.to_vec2());
    let p2xp0 = q.p2.to_vec2().cross(q.p0.to_vec2());
    let p2xp1 = q.p2.to_vec2().cross(q.p1.to_vec2());
    let c0 = 2. * p1xp0;
    let c1 = 2. * (p2xp0 - 2.0 * p1xp0);

## neo_flatten.rs
use fearless_simd::{Level, Select, Simd, SimdInto, dispatch, f32x4};

/// A point described by two single-precision coordinates.
///
/// `#[repr(C)]` pins the layout to the C rules, so `x` is stored first and
/// `y` immediately after it. The type is `Copy`, so it is passed by value.
#[repr(C)]
#[derive(Clone, Copy, Debug)]
struct Point {
    /// The x coordinate.
    x: f32,
    /// The y coordinate.
    y: f32,
}

impl Point {

## neon_flatten.rs
/// Evaluates the parabola-integral approximation on four `f32` lanes at once.
///
/// Every lane of the result is
/// `x / ((1 - D) + (D^4 + 0.25 * x^2)^(1/4))` with `D = 0.67`.
/// The fourth root is obtained by applying the vector square root twice.
///
/// # Safety
///
/// The body consists of NEON intrinsics and the function carries no
/// `#[target_feature]` attribute of its own.
/// NOTE(review): presumably the caller must guarantee NEON is available —
/// confirm against the call sites.
unsafe fn approx_parabola_integral(x: float32x4_t) -> float32x4_t {
    const D: f32 = 0.67;

    // Scalar constants broadcast to all four lanes.
    let offset = vdupq_n_f32(1.0 - D);
    let bias = vdupq_n_f32(D.powi(4));
    let quarter = vdupq_n_f32(0.25);

    // Fused multiply-add: bias + quarter * (x * x).
    let radicand = vfmaq_f32(bias, quarter, vmulq_f32(x, x));

    // sqrt(sqrt(r)) == r^(1/4).
    let fourth_root = vsqrtq_f32(vsqrtq_f32(radicand));

    vdivq_f32(x, vaddq_f32(offset, fourth_root))
}

## hello_stickshift.rs
// Input:
// fn foo(x: [f32; 4], y: f32) -> [f32; 4] {
//     x; x * (x - 2.0) * y
// }
fn foo(x: [f32; 4], y: f32) -> [f32; 4] {
    unsafe {
        let v__0 = ::core::mem::transmute::<[f32; 4usize], ::core::arch::aarch64::float32x4_t>(x);
        let v__1 = y;
        let v__2 = v__0;
        let v__3 = v__0;

## flatten.rs
// Copyright 2025 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

//! Example of fast flattening cubic Beziers.

// Arguably we should just take a kurbo dep (or do development
// in another crate), but here we can

use core::f32;

## ordered_queue_test.rs
use std::time::Duration;
use rand::Rng;

fn main() {
    let (s, mut r) = ordered_channel::bounded(10);
    for i in 0..100 {
        let s_clone = s.clone();
        rayon::spawn_fifo(move || {
            let mut rng = rand::thread_rng();
            let sleep_time = rng.gen_range(0..100);

## neon_to_srgb.rs
// Copyright 2024 the Color Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[inline(never)]
pub unsafe fn to_srgb(rgba: [f32; 4]) -> [f32; 4] {
    let v = aarch64::vld1q_f32(rgba.as_ptr());
    let vabs = aarch64::vabsq_f32(v);
    let bias = aarch64::vdupq_n_f32(-5.35862651e-04);

## simd_reduce_test.rs
// run with `RUSTFLAGS='-C target-cpu=native' cargo +nightly bench`

#![feature(test)]

/// Runs `scalar_max` and then `avx2_max` on the same input and prints each
/// result on its own line, so the two implementations can be compared.
///
/// The input is an array of 65536 `u32` values that are all zero except for
/// index 1, which is set to 42.
fn main() {
    let mut data = [0u32; 65536];
    data[1] = 42;

    // Scalar implementation first; its result is printed before the
    // second implementation is invoked.
    let scalar_result = scalar_max(&data);
    println!("{}", scalar_result);

    let avx2_result = avx2_max(&data);
    println!("{}", avx2_result);
}

## gist:5aca9de53f9d6b24933cb24d8a60df63
1	s_version 0x4004                                                                               	4	0.01	2
2	s_inst_prefetch 0x3                                                                            	4	0.01	1
3	s_getpc_b64 s[0:1]                                                                             	4	0.03	5
4	s_mov_b32 s0, s2                                                                               	4	0.05	9
5	s_load_dwordx4 s[4:7], s[0:1], null                                                            	4	0.01	1
6	s_load_dwordx4 s[12:15], s[0:1], 0x20                                                          	4	0.01	1
7	s_load_dwordx4 s[16:19], s[0:1], 0x40                                                          	4	0.01	1
8	v_lshl_add_u32 v3, s8, 8, v0                                                                   	4	0.03	5
9	v_lshrrev_b32_e32 v0, 2, v3                                                                    	4	0.01	1
10	s_waitcnt lgkmcnt(0)
	commit 49309841ba5cb7db0989a87c0a1d4b2947f02314
	Merge: 3abe852 de4ec3f
	Author: Raph Levien <raph@google.com>
	Date: Wed Jun 11 08:12:42 2025 -0700

	On gen2: half experiment

	diff --cc fearless_simd/src/generated/simd_trait.rs
	index 875ccad,875ccad..544068b
	--- a/fearless_simd/src/generated/simd_trait.rs
	use kurbo::{CubicBez, ParamCurveDeriv};

	/// Compute bounds on curvature
	pub fn kbound(c: CubicBez) -> (f64, f64) {
	let q = c.deriv();
	let p1xp0 = q.p1.to_vec2().cross(q.p0.to_vec2());
	let p2xp0 = q.p2.to_vec2().cross(q.p0.to_vec2());
	let p2xp1 = q.p2.to_vec2().cross(q.p1.to_vec2());
	let c0 = 2. * p1xp0;
	let c1 = 2. * (p2xp0 - 2.0 * p1xp0);
	use fearless_simd::{Level, Select, Simd, SimdInto, dispatch, f32x4};

	#[repr(C)]
	#[derive(Clone, Copy, Debug)]
	struct Point {
	x: f32,
	y: f32,
	}

	impl Point {
	unsafe fn approx_parabola_integral(x: float32x4_t) -> float32x4_t {
	const D: f32 = 0.67;
	let x2 = vmulq_f32(x, x);
	let t1 = vfmaq_f32(vdupq_n_f32(D.powi(4)), vdupq_n_f32(0.25), x2);
	let t1_sqrt = vsqrtq_f32(t1);
	let t1_fourthroot = vsqrtq_f32(t1_sqrt);
	let denom = vaddq_f32(vdupq_n_f32(1.0 - D), t1_fourthroot);
	vdivq_f32(x, denom)
	}
	// Input:
	// fn foo(x: [f32; 4], y: f32) -> [f32; 4] {
	// x; x * (x - 2.0) * y
	// }
	fn foo(x: [f32; 4], y: f32) -> [f32; 4] {
	unsafe {
	let v__0 = ::core::mem::transmute::<[f32; 4usize], ::core::arch::aarch64::float32x4_t>(x);
	let v__1 = y;
	let v__2 = v__0;
	let v__3 = v__0;
	// Copyright 2025 the Fearless_SIMD Authors
	// SPDX-License-Identifier: Apache-2.0 OR MIT

	//! Example of fast flattening cubic Beziers.

	// Arguably we should just take a kurbo dep (or do development
	// in another crate), but here we can

	use core::f32;
	use std::time::Duration;
	use rand::Rng;

	fn main() {
	let (s, mut r) = ordered_channel::bounded(10);
	for i in 0..100 {
	let s_clone = s.clone();
	rayon::spawn_fifo(move || {
	let mut rng = rand::thread_rng();
	let sleep_time = rng.gen_range(0..100);
	// Copyright 2024 the Color Authors
	// SPDX-License-Identifier: Apache-2.0 OR MIT

	#[cfg(target_arch = "aarch64")]
	#[target_feature(enable = "neon")]
	#[inline(never)]
	pub unsafe fn to_srgb(rgba: [f32; 4]) -> [f32; 4] {
	let v = aarch64::vld1q_f32(rgba.as_ptr());
	let vabs = aarch64::vabsq_f32(v);
	let bias = aarch64::vdupq_n_f32(-5.35862651e-04);
	// run with `RUSTFLAGS='-C target-cpu=native' cargo +nightly bench`

	#![feature(test)]

	fn main() {
	let mut a = [0u32; 65536];
	a[1] = 42;
	println!("{}", scalar_max(&a));
	println!("{}", avx2_max(&a));
	}
	1 s_version 0x4004 4 0.01 2
	2 s_inst_prefetch 0x3 4 0.01 1
	3 s_getpc_b64 s[0:1] 4 0.03 5
	4 s_mov_b32 s0, s2 4 0.05 9
	5 s_load_dwordx4 s[4:7], s[0:1], null 4 0.01 1
	6 s_load_dwordx4 s[12:15], s[0:1], 0x20 4 0.01 1
	7 s_load_dwordx4 s[16:19], s[0:1], 0x40 4 0.01 1
	8 v_lshl_add_u32 v3, s8, 8, v0 4 0.03 5
	9 v_lshrrev_b32_e32 v0, 2, v3 4 0.01 1
	10 s_waitcnt lgkmcnt(0)