Skip to content

Instantly share code, notes, and snippets.

@fp64
Created February 15, 2026 19:01
Show Gist options
  • Select an option

  • Save fp64/5ab5822a9d35d1e9b8d9fb24b82a8487 to your computer and use it in GitHub Desktop.

Select an option

Save fp64/5ab5822a9d35d1e9b8d9fb24b82a8487 to your computer and use it in GitHub Desktop.
// Copyright (c) 2012- PPSSPP Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
//==============================================================================
// NOTE: adapted from Core/MIPS/MIPSVFPUUtils.cpp
// in PPSSPP codebase (https://github.com/hrydgard/ppsspp),
// used with permission (https://github.com/hrydgard/ppsspp/issues/21070#issuecomment-3899222722).
// psp_vfpu.h - software implementations of several
// PSP vfpu functions, assumed to be bitwise-exact
// to PSP hardware.
// NOTE: beside this file you also need data tables from
// https://github.com/hrydgard/ppsspp/tree/master/assets/vfpu
// Currently the code only compiles as C++, though
// converting to C should be doable.
/*
// Usage example.
// Instantiate an implementation (in exactly ONE translation unit):
#define PSP_VFPU_IMPLEMENTATION
#include "psp_vfpu.h"
#include <stdio.h>
int main()
{
printf("%.9g\n",vfpu_sin(0.5f)); // prints 0.70710659.
return 0;
}
*/
//==============================================================================
// Header section.
#ifndef PSP_VFPU_H
#define PSP_VFPU_H
#ifdef PSP_VFPU_STATIC
#define PSP_VFPU_DEF static
#else
#define PSP_VFPU_DEF extern
#endif
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
PSP_VFPU_DEF void psp_vfpu_init(void);
PSP_VFPU_DEF void vrnd_init_default(uint32_t *rcx);
PSP_VFPU_DEF void vrnd_init(uint32_t seed, uint32_t *rcx);
PSP_VFPU_DEF uint32_t vrnd_generate(uint32_t *rcx);
PSP_VFPU_DEF float vfpu_sin(float x);
PSP_VFPU_DEF float vfpu_cos(float x);
PSP_VFPU_DEF void vfpu_sincos(float a, float &s, float &c);
PSP_VFPU_DEF float vfpu_sqrt(float x);
PSP_VFPU_DEF float vfpu_rsqrt(float x);
PSP_VFPU_DEF float vfpu_asin(float x);
PSP_VFPU_DEF float vfpu_exp2(float x);
PSP_VFPU_DEF float vfpu_rexp2(float x);
PSP_VFPU_DEF float vfpu_log2(float x);
PSP_VFPU_DEF float vfpu_rcp(float x);
#ifdef __cplusplus
}
#endif
#endif // PSP_VFPU_H
//==============================================================================
// Implementation section.
#ifdef PSP_VFPU_IMPLEMENTATION
#include <string.h> // for memcpy()
#include <math.h> // for sqrt(double)
//==============================================================================
// The code below attempts to exactly match behaviour of
// PSP's vrnd instructions. See investigation starting around
// https://github.com/hrydgard/ppsspp/issues/16946#issuecomment-1467261209
// for details.
// Redundant currently, since MIPSState::Init() already
// does this on its own, but left as-is to be self-contained.
// Reset the vrnd register state to the PSP power-on default:
// single set bits walking through the low halves of the first
// four registers, zeroes everywhere else.
void vrnd_init_default(uint32_t *rcx) {
    static const uint32_t kPowerOnState[8] = {
        0x00000001u, 0x00000002u, 0x00000004u, 0x00000008u,
        0x00000000u, 0x00000000u, 0x00000000u, 0x00000000u,
    };
    for(int i = 0; i < 8; ++i)
        rcx[i] = kPowerOnState[i];
}
// Seed the vrnd register state. Each register receives:
// the 1.0f float mask on top, one 16-bit half of the seed in the
// low bits (lower half for registers 0-3, upper half for 4-7),
// and one seed nibble in bits 16..19.
void vrnd_init(uint32_t seed, uint32_t *rcx) {
    for(int reg = 0; reg < 8; ++reg) {
        // Lower or upper half of the seed, depending on register bank.
        uint32_t half = (seed >> ((reg / 4) * 16)) & 0xFFFFu;
        // The reg-th nibble of the seed.
        uint32_t nibble = (seed >> (4 * reg)) & 0xFu;
        rcx[reg] = 0x3F800000u | (nibble << 16) | half;
    }
}
// Advance the vrnd RNG and return its next 32-bit output.
// The 8 VFPU registers hold a 5-word internal state (A..E):
// 16-bit halves of A..D in the low bits of the registers, plus
// one nibble of E per register in bits 16..19; everything is
// re-masked with 0x3F800000 so each register reads as a float
// in [1;2). Combines an LCG (A), an xorshift (B), and a
// Pell-like lag-2 recurrence (C, D) with carry word E.
uint32_t vrnd_generate(uint32_t *rcx) {
// The actual RNG state appears to be 5 parts
// (32-bit each) stored into the registers as follows:
uint32_t A = (rcx[0] & 0xFFFFu) | (rcx[4] << 16);
uint32_t B = (rcx[1] & 0xFFFFu) | (rcx[5] << 16);
uint32_t C = (rcx[2] & 0xFFFFu) | (rcx[6] << 16);
uint32_t D = (rcx[3] & 0xFFFFu) | (rcx[7] << 16);
uint32_t E = (((rcx[0] >> 16) & 0xF) << 0) |
(((rcx[1] >> 16) & 0xF) << 4) |
(((rcx[2] >> 16) & 0xF) << 8) |
(((rcx[3] >> 16) & 0xF) << 12) |
(((rcx[4] >> 16) & 0xF) << 16) |
(((rcx[5] >> 16) & 0xF) << 20) |
(((rcx[6] >> 16) & 0xF) << 24) |
(((rcx[7] >> 16) & 0xF) << 28);
// Update.
// LCG with classic parameters.
A = 69069u * A + 1u; // NOTE: decimal constants.
// Xorshift, with classic parameters. Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs".
B ^= B << 13;
B ^= B >> 17;
B ^= B << 5;
// Sequence similar to Pell numbers ( https://en.wikipedia.org/wiki/Pell_number ),
// except with different starting values, and an occasional increment (E).
uint32_t t= 2u * D + C + E;
// NOTE: the details of how E-part is set are somewhat of a guess
// at the moment. The expression below looks weird, but does match
// the available test data.
E = uint32_t((uint64_t(C) + uint64_t(D >> 1) + uint64_t(E)) >> 32);
C = D;
D = t;
// Store.
// Repack A..E into the 8 registers, restoring the 1.0f mask.
rcx[0] = 0x3F800000u | (((E >> 0) & 0xF) << 16) | (A & 0xFFFFu);
rcx[1] = 0x3F800000u | (((E >> 4) & 0xF) << 16) | (B & 0xFFFFu);
rcx[2] = 0x3F800000u | (((E >> 8) & 0xF) << 16) | (C & 0xFFFFu);
rcx[3] = 0x3F800000u | (((E >> 12) & 0xF) << 16) | (D & 0xFFFFu);
rcx[4] = 0x3F800000u | (((E >> 16) & 0xF) << 16) | (A >> 16);
rcx[5] = 0x3F800000u | (((E >> 20) & 0xF) << 16) | (B >> 16);
rcx[6] = 0x3F800000u | (((E >> 24) & 0xF) << 16) | (C >> 16);
rcx[7] = 0x3F800000u | (((E >> 28) & 0xF) << 16) | (D >> 16);
// Return value.
return A + B + D;
}
//==============================================================================
// The code below attempts to exactly match the output of
// several PSP's VFPU functions. For the sake of
// making lookup tables smaller the code is
// somewhat gnarly.
// Lookup tables sometimes store deltas from (explicitly computable)
// estimations, to allow to store them in smaller types.
// See https://github.com/hrydgard/ppsspp/issues/16946 for details.
// Lookup tables.
// Note: these are never unloaded, and stay till program termination.
static uint32_t *vfpu_sin_lut8192=nullptr;
static int8_t (*vfpu_sin_lut_delta)[2]=nullptr;
static int16_t *vfpu_sin_lut_interval_delta=nullptr;
static uint8_t *vfpu_sin_lut_exceptions=nullptr;
static int8_t (*vfpu_sqrt_lut)[2]=nullptr;
static int8_t (*vfpu_rsqrt_lut)[2]=nullptr;
static uint32_t *vfpu_exp2_lut65536=nullptr;
static uint8_t (*vfpu_exp2_lut)[2]=nullptr;
static uint32_t *vfpu_log2_lut65536=nullptr;
static uint32_t *vfpu_log2_lut65536_quadratic=nullptr;
static uint8_t (*vfpu_log2_lut)[131072][2]=nullptr;
static int32_t (*vfpu_asin_lut65536)[3]=nullptr;
static uint64_t *vfpu_asin_lut_deltas=nullptr;
static uint16_t *vfpu_asin_lut_indices=nullptr;
static int8_t (*vfpu_rcp_lut)[2]=nullptr;
#ifndef PSP_VFPU_LOAD_TABLE
#include <stdio.h>
// Load a binary lookup table from 'filename' into 'ptr'.
// 'expected_size' is in bytes and must be a multiple of sizeof(T).
// The buffer is allocated here and intentionally never freed
// (tables stay loaded until program termination).
// Returns true on success or if the table is already loaded;
// on any failure prints a status to stderr, leaves 'ptr' null,
// and returns false.
template<typename T>
static inline bool psp_vfpu_load_table(T *&ptr, const char *filename, size_t expected_size) {
    if (ptr) return true; // Already loaded.
    fprintf(stderr, "Loading '%s'...",filename);
    if(expected_size % sizeof(T)) {
        fprintf(stderr, " FAIL (invalid expected size: %u)!\n", (unsigned)expected_size);
        return false;
    }
    FILE *f = fopen(filename, "rb");
    if(!f) {
        // Fix: previously this path returned silently, leaving the
        // "Loading..." line on stderr without a status or newline.
        fprintf(stderr, " FAIL (cannot open)!\n");
        return false;
    }
    ptr = new T[expected_size / sizeof(T)];
    size_t size = fread(ptr, 1, expected_size, f);
    fclose(f);
    if (size != expected_size) {
        fprintf(stderr, " FAIL (size=%u, expected: %u)!\n", (unsigned)size, (unsigned)expected_size);
        delete[] ptr;
        ptr = nullptr;
        return false;
    }
    fprintf(stderr, " ok.\n");
    return true;
}
#define PSP_VFPU_LOAD_TABLE(name, expected_size)\
psp_vfpu_load_table(name,#name ".dat",expected_size)
#endif // PSP_VFPU_LOAD_TABLE
#ifndef psp_vfpu_clz32
// Count leading zero bits of a 32-bit value; returns 32 for zero.
static int psp_vfpu_clz32(uint32_t x)
{
#ifdef __GNUC__
    // __builtin_clz(0) is undefined, so zero is handled explicitly.
    return x == 0 ? 32 : __builtin_clz(x);
#else
    // Portable binary-search fallback (after Hacker's Delight).
    if(x == 0) return 32;
    int count = 0;
    for(int step = 16; step > 0; step >>= 1) {
        if(x <= (0xFFFFFFFFu >> step)) {
            count += step;
            x <<= step;
        }
    }
    return count;
#endif
}
#endif // psp_vfpu_clz32
// Fallback functions.
#ifndef PSP_VFPU_OWN_FALLBACKS
static inline float vfpu_sin_fallback(float x) {return sinf(1.57079633f*x);}
static inline float vfpu_cos_fallback(float x) {return cosf(1.57079633f*x);}
static inline float vfpu_sqrt_fallback(float x) {return sqrtf(x);}
static inline float vfpu_rsqrt_fallback(float x) {return 1.0f/sqrtf(x);}
static inline float vfpu_asin_fallback(float x) {return 0.636619772f*asinf(x);}
static inline float vfpu_exp2_fallback(float x) {return exp2f(x);}
static inline float vfpu_log2_fallback(float x) {return log2f(x);}
static inline float vfpu_rcp_fallback(float x) {return 1.0f/x;}
#endif
// Note: PSP sin/cos output only has 22 significant
// binary digits.
// Size of one output ulp ("quantum") at magnitude x: sin/cos
// results keep only 22 significant bits, so the quantum grows
// once x reaches 2^22.
static inline uint32_t vfpu_sin_quantum(uint32_t x) {
    if(x < (1u << 22))
        return 1u;
    return 1u << (32 - 22 - psp_vfpu_clz32(x));
}
// Truncate x to its 22 significant bits (round toward zero).
static inline uint32_t vfpu_sin_truncate_bits(uint32_t x) {
    uint32_t q = vfpu_sin_quantum(x);
    return x & ~(q - 1u); // ~(q-1) == -q for the power-of-two q.
}
// Core fixed-point sin evaluation.
// Input 'arg' is fixed 9.23 in [0;1] (units of quarter turns,
// i.e. 0x00800000 corresponds to pi/2); output is fixed 4.28.
// Strategy: coarse table every 8192 inputs, refined by per-64
// delta corrections, then individual known-mismatching inputs
// ("exceptions") are patched by one quantum.
static inline uint32_t vfpu_sin_fixed(uint32_t arg) {
// Handle endpoints.
if(arg == 0u) return 0u;
if(arg == 0x00800000) return 0x10000000;
// Get endpoints for 8192-wide interval.
uint32_t L = vfpu_sin_lut8192[(arg >> 13) + 0];
uint32_t H = vfpu_sin_lut8192[(arg >> 13) + 1];
// Approximate endpoints for 64-wide interval via lerp.
uint32_t A = L+(((H - L)*(((arg >> 6) & 127) + 0)) >> 7);
uint32_t B = L+(((H - L)*(((arg >> 6) & 127) + 1)) >> 7);
// Adjust endpoints from deltas, and increase working precision.
// Deltas are stored in units of the output quantum (extra 5 bits
// of working precision are added here).
uint64_t a = (uint64_t(A) << 5) + uint64_t(vfpu_sin_lut_delta[arg >> 6][0]) * vfpu_sin_quantum(A);
uint64_t b = (uint64_t(B) << 5) + uint64_t(vfpu_sin_lut_delta[arg >> 6][1]) * vfpu_sin_quantum(B);
// Compute approximation via lerp. Is off by at most 1 quantum.
uint32_t v = uint32_t(((a * (64 - (arg & 63)) + b * (arg & 63)) >> 6) >> 5);
v=vfpu_sin_truncate_bits(v);
// Look up exceptions via binary search.
// Note: vfpu_sin_lut_interval_delta stores
// deltas from interval estimation.
// 169/128 is the average number of exceptions per 128-wide
// interval; lo/hi bracket the exception list slice for this interval.
uint32_t lo = ((169u * ((arg >> 7) + 0)) >> 7)+uint32_t(vfpu_sin_lut_interval_delta[(arg >> 7) + 0]) + 16384u;
uint32_t hi = ((169u * ((arg >> 7) + 1)) >> 7)+uint32_t(vfpu_sin_lut_interval_delta[(arg >> 7) + 1]) + 16384u;
while(lo < hi) {
uint32_t m = (lo + hi) / 2;
// Note: vfpu_sin_lut_exceptions stores
// index&127 (for each initial interval the
// upper bits of index are the same, namely
// arg&-128), plus direction (0 for +1, and
// 128 for -1).
uint32_t b = vfpu_sin_lut_exceptions[m];
uint32_t e = (arg & -128u)+(b & 127u);
if(e == arg) {
// Exception hit: nudge the result by one quantum in the
// direction encoded in the top bit.
v += vfpu_sin_quantum(v) * (b >> 7 ? -1u : +1u);
break;
}
else if(e < arg) lo = m + 1;
else hi = m;
}
return v;
}
// PSP VFPU vsin: returns sin((pi/2)*x), bit-exact to hardware
// when the data tables are available; otherwise falls back to
// the (inexact) libm-based approximation.
float vfpu_sin(float x) {
static bool loaded =
PSP_VFPU_LOAD_TABLE(vfpu_sin_lut8192, 4100)&&
PSP_VFPU_LOAD_TABLE(vfpu_sin_lut_delta, 262144)&&
PSP_VFPU_LOAD_TABLE(vfpu_sin_lut_interval_delta, 131074)&&
PSP_VFPU_LOAD_TABLE(vfpu_sin_lut_exceptions, 86938);
if (!loaded)
return vfpu_sin_fallback(x);
uint32_t bits;
memcpy(&bits, &x, sizeof(x));
uint32_t sign = bits & 0x80000000u;
uint32_t exponent = (bits >> 23) & 0xFFu;
uint32_t significand = (bits & 0x007FFFFFu) | 0x00800000u;
if(exponent == 0xFFu) {
// Inf/NaN input: return a NaN of the same sign.
// NOTE: this bitpattern is a signaling
// NaN on x86, so maybe just return
// a normal qNaN?
float y;
bits=sign ^ 0x7F800001u;
memcpy(&y, &bits, sizeof(y));
return y;
}
// Convert |x| to fixed 9.23 (units of quarter turns), truncating.
if(exponent < 0x7Fu) {
if(exponent < 0x7Fu-23u) significand = 0u; // Too small: rounds to 0.
else significand >>= (0x7F - exponent);
}
else if(exponent > 0x7Fu) {
// There is weirdness for large exponents.
if(exponent - 0x7Fu >= 25u && exponent - 0x7Fu < 32u) significand = 0u;
else if((exponent & 0x9Fu) == 0x9Fu) significand = 0u;
else significand <<= ((exponent - 0x7Fu) & 31); // Shift count wraps mod 32.
}
// Quadrant reduction: bit 24 (the "2" digit in quarter-turn
// units) flips the sign, since sin(u+2) = -sin(u)...
sign ^= ((significand << 7) & 0x80000000u);
significand &= 0x00FFFFFFu; // ...and the argument is kept mod 2.
// Reflect the upper half-interval: sin(2-u) = sin(u).
if(significand > 0x00800000u) significand = 0x01000000u - significand;
uint32_t ret = vfpu_sin_fixed(significand);
// Result is fixed 4.28; scale back to float and apply the sign.
return (sign ? -1.0f : +1.0f) * float(int32_t(ret)) * 3.7252903e-09f; // 0x1p-28f
}
// PSP VFPU vcos: returns cos((pi/2)*x), bit-exact to hardware
// when the data tables are available. Shares the sin tables and
// evaluates via the complement, cos(u) = sin(1-u).
float vfpu_cos(float x) {
static bool loaded =
PSP_VFPU_LOAD_TABLE(vfpu_sin_lut8192, 4100)&&
PSP_VFPU_LOAD_TABLE(vfpu_sin_lut_delta, 262144)&&
PSP_VFPU_LOAD_TABLE(vfpu_sin_lut_interval_delta, 131074)&&
PSP_VFPU_LOAD_TABLE(vfpu_sin_lut_exceptions, 86938);
if (!loaded)
return vfpu_cos_fallback(x);
uint32_t bits;
memcpy(&bits, &x, sizeof(x));
bits &= 0x7FFFFFFFu; // cos is even: drop the input sign.
uint32_t sign = 0u;
uint32_t exponent = (bits >> 23) & 0xFFu;
uint32_t significand = (bits & 0x007FFFFFu) | 0x00800000u;
if(exponent == 0xFFu) {
// Inf/NaN input: return a (positive) NaN.
// NOTE: this bitpattern is a signaling
// NaN on x86, so maybe just return
// a normal qNaN?
float y;
bits = sign ^ 0x7F800001u;
memcpy(&y, &bits, sizeof(y));
return y;
}
// Convert |x| to fixed 9.23 (units of quarter turns), truncating.
if(exponent < 0x7Fu) {
if(exponent < 0x7Fu - 23u) significand = 0u; // Too small: rounds to 0.
else significand >>= (0x7F - exponent);
}
else if(exponent > 0x7Fu) {
// There is weirdness for large exponents.
if(exponent - 0x7Fu >= 25u && exponent - 0x7Fu < 32u) significand = 0u;
else if((exponent & 0x9Fu) == 0x9Fu) significand = 0u;
else significand <<= ((exponent - 0x7Fu) & 31); // Shift count wraps mod 32.
}
// Quadrant reduction: cos(u+2) = -cos(u) in quarter-turn units.
sign ^= ((significand << 7) & 0x80000000u);
significand &= 0x00FFFFFFu;
// Reflect the upper half-interval: cos(u) = -cos(2-u).
if(significand >= 0x00800000u) {
significand = 0x01000000u - significand;
sign ^= 0x80000000u;
}
// cos(u) = sin(1-u); result is fixed 4.28.
uint32_t ret = vfpu_sin_fixed(0x00800000u - significand);
return (sign ? -1.0f : +1.0f) * float(int32_t(ret)) * 3.7252903e-09f; // 0x1p-28f
}
// PSP VFPU vsincos: writes sin((pi/2)*a) to s and cos((pi/2)*a) to c.
void vfpu_sincos(float a, float &s, float &c) {
// Just invoke both sin and cos.
// Suboptimal but whatever.
s = vfpu_sin(a);
c = vfpu_cos(a);
}
// Integer square root of 2^23*x (rounded to zero).
// Input is in 2^23 <= x < 2^25, and representable in float.
// Integer square root of 2^23*x, truncated toward zero.
// Three interchangeable implementations are provided; the
// double-precision one is active (verified exhaustively).
static inline uint32_t isqrt23(uint32_t x) {
#if 0
// Reference version.
// NOTE: needs <fenv.h>; kept for documentation only.
int dir=fesetround(FE_TOWARDZERO);
uint32_t ret=uint32_t(int32_t(sqrtf(float(int32_t(x)) * 8388608.0f)));
fesetround(dir);
return ret;
#elif 1
// Double version.
// Verified to produce correct result for all valid inputs,
// in all rounding modes, both in double and double-extended (x87)
// precision.
// Requires correctly-rounded sqrt (which on any IEEE-754 system
// it should be).
return uint32_t(int32_t(sqrt(double(x) * 8388608.0)));
#else
// Pure integer version, if you don't like floating point.
// Based on code from Hacker's Delight. See isqrt4 in
// https://github.com/hcs0/Hackers-Delight/blob/master/isqrt.c.txt
// Relatively slow.
uint64_t t=uint64_t(x) << 23, m, y, b;
m=0x4000000000000000ull;
y=0;
while(m != 0) // Do 32 times.
{
b=y | m;
y=y >> 1;
if(t >= b)
{
t = t - b;
y = y | m;
}
m = m >> 2;
}
return y;
#endif
}
// Returns floating-point bitpattern.
// Core sqrt evaluation over one binade pair.
// Input x is the 23-bit index built by vfpu_sqrt() (exponent
// parity folded into the top); returns a floating-point
// bitpattern (exponent adjusted by the caller).
static inline uint32_t vfpu_sqrt_fixed(uint32_t x) {
// Endpoints of input.
uint32_t lo =(x + 0u) & -64u;
uint32_t hi = (x + 64u) & -64u;
// Convert input to 9.23 fixed-point.
// The two halves of the index cover [1;2) and [2;4) respectively.
lo = (lo >= 0x00400000u ? 4u * lo : 0x00800000u + 2u * lo);
hi = (hi >= 0x00400000u ? 4u * hi : 0x00800000u + 2u * hi);
// Estimate endpoints of output.
uint32_t A = 0x3F000000u + isqrt23(lo);
uint32_t B = 0x3F000000u + isqrt23(hi);
// Apply deltas, and increase the working precision.
uint64_t a = (uint64_t(A) << 4) + uint64_t(vfpu_sqrt_lut[x >> 6][0]);
uint64_t b = (uint64_t(B) << 4) + uint64_t(vfpu_sqrt_lut[x >> 6][1]);
// Lerp between the corrected endpoints.
uint32_t ret = uint32_t((a + (((b - a) * (x & 63)) >> 6)) >> 4);
// Truncate lower 2 bits.
ret &= -4u;
return ret;
}
// PSP VFPU vsqrt: bit-exact square root when the table is
// available, libm fallback otherwise. Note the hardware quirks:
// denormals and zeroes (of either sign) give +0, negatives give NaN.
float vfpu_sqrt(float x) {
static bool loaded =
PSP_VFPU_LOAD_TABLE(vfpu_sqrt_lut, 262144);
if (!loaded)
return vfpu_sqrt_fallback(x);
uint32_t bits;
memcpy(&bits, &x, sizeof(bits));
if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
// Denormals (and zeroes) get +0, regardless
// of sign.
return +0.0f;
}
if(bits >> 31) {
// Other negatives get NaN.
bits = 0x7F800001u;
memcpy(&x, &bits, sizeof(x));
return x;
}
if((bits >> 23) == 255u) {
// Inf/NaN gets Inf/NaN.
bits = 0x7F800000u + ((bits & 0x007FFFFFu) != 0u);
memcpy(&x, &bits, sizeof(bits));
return x;
}
int32_t exponent = int32_t(bits >> 23) - 127;
// Bottom bit of exponent (inverted) + significand (except bottom bit).
uint32_t index = ((bits + 0x00800000u) >> 1) & 0x007FFFFFu;
bits = vfpu_sqrt_fixed(index);
// Add back half the (arithmetically shifted) unbiased exponent.
bits += uint32_t(exponent >> 1) << 23;
memcpy(&x, &bits, sizeof(bits));
return x;
}
// Returns floor(2^33/sqrt(x)), for 2^22 <= x < 2^24.
// Returns floor(2^33/sqrt(x)), for 2^22 <= x < 2^24.
// The double-precision path is active; the integer path is an
// alternative for floating-point-free builds.
static inline uint32_t rsqrt_floor22(uint32_t x) {
#if 1
// Verified correct in all rounding directions,
// by exhaustive search.
return uint32_t(8589934592.0 / sqrt(double(x))); // 0x1p33
#else
// Pure integer version, if you don't like floating point.
// Based on code from Hacker's Delight. See isqrt4 in
// https://github.com/hcs0/Hackers-Delight/blob/master/isqrt.c.txt
// Relatively slow.
uint64_t t=uint64_t(x) << 22, m, y, b;
m = 0x4000000000000000ull;
y = 0;
while(m != 0) // Do 32 times.
{
b = y | m;
y = y >> 1;
if(t >= b)
{
t = t - b;
y = y | m;
}
m = m >> 2;
}
y = (1ull << 44) / y;
// Decrement if y > floor(2^33 / sqrt(x)).
// This hack works because exhaustive
// search (on [2^22;2^24]) says it does.
if((y * y >> 3) * x > (1ull << 63) - 3ull * (((y & 7) == 6) << 21)) --y;
return uint32_t(y);
#endif
}
// Returns floating-point bitpattern.
// Core reciprocal-sqrt evaluation over one binade pair.
// Input x is the 23-bit index built by vfpu_rsqrt() (exponent
// parity folded into the top); returns a floating-point
// bitpattern (exponent adjusted by the caller).
static inline uint32_t vfpu_rsqrt_fixed(uint32_t x) {
// Endpoints of input.
uint32_t lo = (x + 0u) & -64u;
uint32_t hi = (x + 64u) & -64u;
// Convert input to 10.22 fixed-point.
// The two halves of the index cover [1;2) and [2;4) respectively.
lo = (lo >= 0x00400000u ? 2u * lo : 0x00400000u + lo);
hi = (hi >= 0x00400000u ? 2u * hi : 0x00400000u + hi);
// Estimate endpoints of output.
uint32_t A = 0x3E800000u + 4u * rsqrt_floor22(lo);
uint32_t B = 0x3E800000u + 4u * rsqrt_floor22(hi);
// Apply deltas, and increase the working precision.
uint64_t a = (uint64_t(A) << 4) + uint64_t(vfpu_rsqrt_lut[x >> 6][0]);
uint64_t b = (uint64_t(B) << 4) + uint64_t(vfpu_rsqrt_lut[x >> 6][1]);
// Evaluate via lerp.
uint32_t ret = uint32_t((a + (((b - a) * (x & 63)) >> 6)) >> 4);
// Truncate lower 2 bits.
ret &= -4u;
return ret;
}
// PSP VFPU vrsqrt: bit-exact 1/sqrt(x) when the table is
// available, libm fallback otherwise. Hardware quirks:
// denormals/zeroes give same-signed inf, negatives give -NaN.
float vfpu_rsqrt(float x) {
static bool loaded =
PSP_VFPU_LOAD_TABLE(vfpu_rsqrt_lut, 262144);
if (!loaded)
return vfpu_rsqrt_fallback(x);
uint32_t bits;
memcpy(&bits, &x, sizeof(bits));
if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
// Denormals (and zeroes) get inf of the same sign.
bits = 0x7F800000u | (bits & 0x80000000u);
memcpy(&x, &bits, sizeof(x));
return x;
}
if(bits >> 31) {
// Other negatives get negative NaN.
bits = 0xFF800001u;
memcpy(&x, &bits, sizeof(x));
return x;
}
if((bits >> 23) == 255u) {
// inf gets 0, NaN gets NaN.
bits = ((bits & 0x007FFFFFu) ? 0x7F800001u : 0u);
memcpy(&x, &bits, sizeof(bits));
return x;
}
int32_t exponent = int32_t(bits >> 23) - 127;
// Bottom bit of exponent (inverted) + significand (except bottom bit).
uint32_t index = ((bits + 0x00800000u) >> 1) & 0x007FFFFFu;
bits = vfpu_rsqrt_fixed(index);
// Subtract half the (arithmetically shifted) unbiased exponent.
bits -= uint32_t(exponent >> 1) << 23;
memcpy(&x, &bits, sizeof(bits));
return x;
}
// Size of one output ulp ("quantum") at magnitude x: asin output
// keeps 23 significant bits (one more than sin/cos), so the
// quantum grows once x reaches 2^23.
static inline uint32_t vfpu_asin_quantum(uint32_t x) {
    if(x < (1u << 23))
        return 1u;
    return 1u << (32 - 23 - psp_vfpu_clz32(x));
}
// Truncate x to its 23 significant bits (round toward zero).
static inline uint32_t vfpu_asin_truncate_bits(uint32_t x) {
    uint32_t q = vfpu_asin_quantum(x);
    return x & ~(q - 1u); // ~(q-1) == -q for the power-of-two q.
}
// Input is fixed 9.23, output is fixed 2.30.
// Initial asin estimate. Input is fixed 9.23, output is fixed 2.30.
// Evaluates a per-interval quadratic C[0] + C[1]*t + C[2]*t^2
// (Horner form, 128 intervals of 65536 inputs each), then
// truncates to the output quantum.
static inline uint32_t vfpu_asin_approx(uint32_t x) {
const int32_t *C = vfpu_asin_lut65536[x >> 16];
x &= 0xFFFFu;
return vfpu_asin_truncate_bits(uint32_t((((((int64_t(C[2]) * x) >> 16) + int64_t(C[1])) * x) >> 16) + C[0]));
}
// Input is fixed 9.23, output is fixed 2.30.
// Core asin evaluation. Input is fixed 9.23 in [0;1], output is
// fixed 2.30 in [0;1] (units of quarter turns).
// Corrections to the quadratic estimate are packed 21 per 64-bit
// word, 3 bits each, biased by 3 (so stored 0..7 means -4..+3
// quanta, applied negated below).
static uint32_t vfpu_asin_fixed(uint32_t x) {
if(x == 0u) return 0u;
if(x == 1u << 23) return 1u << 30;
uint32_t ret = vfpu_asin_approx(x);
// Indirection: many 21-input groups share the same delta word.
uint32_t index = vfpu_asin_lut_indices[x / 21u];
uint64_t deltas = vfpu_asin_lut_deltas[index];
return ret + (3u - uint32_t((deltas >> (3u * (x % 21u))) & 7u)) * vfpu_asin_quantum(ret);
}
// PSP VFPU vasin: returns asin(x) scaled by 2/pi (so the output
// range is [-1;1]), bit-exact to hardware when the tables are
// available. Inputs with |x| > 1 produce a same-signed NaN.
float vfpu_asin(float x) {
static bool loaded =
PSP_VFPU_LOAD_TABLE(vfpu_asin_lut65536, 1536)&&
PSP_VFPU_LOAD_TABLE(vfpu_asin_lut_indices, 798916)&&
PSP_VFPU_LOAD_TABLE(vfpu_asin_lut_deltas, 517448);
if (!loaded)
return vfpu_asin_fallback(x);
uint32_t bits;
memcpy(&bits, &x, sizeof(x));
uint32_t sign = bits & 0x80000000u;
bits = bits & 0x7FFFFFFFu;
if(bits > 0x3F800000u) {
// Out of domain (|x| > 1, including NaN): NaN with input's sign.
bits = 0x7F800001u ^ sign;
memcpy(&x, &bits, sizeof(x));
return x;
}
// Convert |x| to fixed 9.23, evaluate, convert fixed 2.30 back.
bits = vfpu_asin_fixed(uint32_t(int32_t(fabsf(x) * 8388608.0f))); // 0x1p23
x=float(int32_t(bits)) * 9.31322574615478515625e-10f; // 0x1p-30
if(sign) x = -x;
return x;
}
// Initial exp2 estimate of 2^(x/2^23)-1, fixed 9.23 in and out.
// Uses a coarse table for the top 7 bits and a quadratic
// correction for the low 16 bits (magic constants empirically
// matched to hardware), combined multiplicatively.
static inline uint32_t vfpu_exp2_approx(uint32_t x) {
if(x == 0x00800000u) return 0x00800000u;
uint32_t a=vfpu_exp2_lut65536[x >> 16];
x &= 0x0000FFFFu;
// b approximates 2^(x/2^23)-1 for the 16-bit remainder.
uint32_t b = uint32_t(((2977151143ull * x) >> 23) + ((1032119999ull * (x * x)) >> 46));
// Combine: (1+a)*(1+b)-1 = a + (1+a)*b, truncated to 2-bit quanta.
return (a + uint32_t((uint64_t(a + (1u << 23)) * uint64_t(b)) >> 32)) & -4u;
}
// Core exp2 evaluation: 2^(x/2^23)-1 for the fractional part,
// fixed 9.23 in and out. Refines vfpu_exp2_approx endpoints with
// per-64 deltas (biased by 64) and lerps between them.
static inline uint32_t vfpu_exp2_fixed(uint32_t x) {
if(x == 0u) return 0u;
if(x == 0x00800000u) return 0x00800000u;
uint32_t A = vfpu_exp2_approx((x ) & -64u);
uint32_t B = vfpu_exp2_approx((x + 64) & -64u);
uint64_t a = (A<<4)+vfpu_exp2_lut[x >> 6][0]-64u;
uint64_t b = (B<<4)+vfpu_exp2_lut[x >> 6][1]-64u;
uint32_t y = uint32_t((a + (((b - a) * (x & 63)) >> 6)) >> 4);
// Truncate lower 2 bits.
y &= -4u;
return y;
}
// PSP VFPU vexp2: returns 2^x, bit-exact to hardware when the
// tables are available. Hardware quirks: denormal inputs act as
// 0 (result 1.0), x <= -126 gives +0, x >= 128 gives +inf.
float vfpu_exp2(float x) {
static bool loaded =
PSP_VFPU_LOAD_TABLE(vfpu_exp2_lut65536, 512)&&
PSP_VFPU_LOAD_TABLE(vfpu_exp2_lut, 262144);
if (!loaded)
return vfpu_exp2_fallback(x);
int32_t bits;
memcpy(&bits, &x, sizeof(bits));
if((bits & 0x7FFFFFFF) <= 0x007FFFFF) {
// Denormals are treated as 0.
return 1.0f;
}
if(x != x) {
// NaN gets NaN.
bits = 0x7F800001u;
memcpy(&x, &bits, sizeof(x));
return x;
}
if(x <= -126.0f) {
// Small numbers get 0 (exp2(-126) is smallest positive non-denormal).
// But yes, -126.0f produces +0.0f.
return 0.0f;
}
if(x >= +128.0f) {
// Large numbers get infinity.
bits = 0x7F800000u;
memcpy(&x, &bits, sizeof(x));
return x;
}
// Convert x to fixed 9.23 (signed).
bits = int32_t(x * 0x1p23f);
if(x < 0.0f) --bits; // Yes, really.
// Assemble: 1.0 bias + integer part into the exponent field +
// fractional 2^frac-1 into the significand.
bits = int32_t(0x3F800000) + (bits & int32_t(0xFF800000)) + int32_t(vfpu_exp2_fixed(bits & int32_t(0x007FFFFF)));
memcpy(&x, &bits, sizeof(bits));
return x;
}
// PSP VFPU vrexp2: returns 2^(-x), i.e. the reciprocal of vexp2.
float vfpu_rexp2(float x) {
return vfpu_exp2(-x);
}
// Input fixed 9.23, output fixed 10.22.
// Returns log2(1+x).
// Initial log2 estimate: log2(1+x/2^23).
// Input fixed 9.23, output fixed 10.22.
// Linear interpolation between 65536-spaced table endpoints,
// plus a per-interval quadratic correction term.
static inline uint32_t vfpu_log2_approx(uint32_t x) {
uint32_t a = vfpu_log2_lut65536[(x >> 16) + 0];
uint32_t b = vfpu_log2_lut65536[(x >> 16) + 1];
uint32_t c = vfpu_log2_lut65536_quadratic[x >> 16];
x &= 0xFFFFu;
// Lerp in 48-bit precision...
uint64_t ret = uint64_t(a) * (0x10000u - x) + uint64_t(b) * x;
// ...plus the quadratic bump c*t*(1-t).
uint64_t d = (uint64_t(c) * x * (0x10000u-x)) >> 40;
ret += d;
return uint32_t(ret >> 16);
}
// Matches PSP output on all known values.
// PSP VFPU vlog2: returns log2(x), matching PSP output on all
// known values. Hardware quirks: denormals/zeroes give -inf,
// negatives give NaN; a narrow band of inputs just below powers
// of two (for negative exponents) needs special-casing.
float vfpu_log2(float x) {
static bool loaded =
PSP_VFPU_LOAD_TABLE(vfpu_log2_lut65536, 516)&&
PSP_VFPU_LOAD_TABLE(vfpu_log2_lut65536_quadratic, 512)&&
PSP_VFPU_LOAD_TABLE(vfpu_log2_lut, 2097152);
if (!loaded)
return vfpu_log2_fallback(x);
uint32_t bits;
memcpy(&bits, &x, sizeof(bits));
if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
// Denormals (and zeroes) get -inf.
bits = 0xFF800000u;
memcpy(&x, &bits, sizeof(x));
return x;
}
if(bits & 0x80000000u) {
// Other negatives get NaN.
bits = 0x7F800001u;
memcpy(&x, &bits, sizeof(x));
return x;
}
if((bits >> 23) == 255u) {
// NaN gets NaN, +inf gets +inf.
bits = 0x7F800000u + ((bits & 0x007FFFFFu) != 0);
memcpy(&x, &bits, sizeof(x));
return x;
}
// e: unbiased exponent shifted into the exponent field (may be
// "negative" as a wrapped uint32); i: significand bits.
uint32_t e = (bits & 0x7F800000u) - 0x3F800000u;
uint32_t i = bits & 0x007FFFFFu;
if(e >> 31 && i >= 0x007FFE00u) {
// Process 1-2^{-14}<=x*2^n<1 (for n>0) separately,
// since the table doesn't give the right answer.
float c = float(int32_t(~e) >> 23);
// Note: if c is 0 the sign of -0 output is correct.
return i < 0x007FFEF7u ? // 1-265*2^{-24}
-3.05175781e-05f - c:
-0.0f - c;
}
// d: how many low result bits the hardware drops, growing with
// the exponent's magnitude (0..7).
int d = (e < 0x01000000u ? 0 : 8 - psp_vfpu_clz32(e) - int(e >> 31));
//assert(d >= 0 && d < 8);
uint32_t q = 1u << d;
// One delta table per truncation level d; deltas are biased by 80.
uint32_t A = vfpu_log2_approx((i ) & -64u) & -q;
uint32_t B = vfpu_log2_approx((i + 64) & -64u) & -q;
uint64_t a = (A << 6)+(uint64_t(vfpu_log2_lut[d][i >> 6][0]) - 80ull) * q;
uint64_t b = (B << 6)+(uint64_t(vfpu_log2_lut[d][i >> 6][1]) - 80ull) * q;
uint32_t v = uint32_t((a +(((b - a) * (i & 63)) >> 6)) >> 6);
v &= -q;
// Combine integer part (e) with the fractional part (v, fixed
// 10.22, doubled to 9.23); XOR matches hardware for both signs.
bits = e ^ (2u * v);
x = float(int32_t(bits)) * 1.1920928955e-7f; // 0x1p-23f
return x;
}
// Initial reciprocal estimate: float bitpattern of roughly
// 0.5/(1+i/2^23), computed as a truncated integer division with
// the two lowest significand bits cleared.
static inline uint32_t vfpu_rcp_approx(uint32_t i) {
    uint64_t quotient = (1ull << 47) / ((1ull << 23) + i);
    return 0x3E800000u + ((uint32_t)quotient & ~3u);
}
// PSP VFPU vrcp: returns 1/x, bit-exact to hardware when the
// table is available. Hardware quirks: |x| > 2^125 underflows to
// a signed zero (NaN stays NaN), denormals/zeroes give signed inf.
float vfpu_rcp(float x) {
static bool loaded =
PSP_VFPU_LOAD_TABLE(vfpu_rcp_lut, 262144);
if (!loaded)
return vfpu_rcp_fallback(x);
uint32_t bits;
memcpy(&bits, &x, sizeof(bits));
uint32_t s = bits & 0x80000000u;
uint32_t e = bits & 0x7F800000u;
uint32_t i = bits & 0x007FFFFFu;
if((bits & 0x7FFFFFFFu) > 0x7E800000u) {
// Huge magnitudes underflow to signed zero; NaN input gives
// a same-signed NaN.
bits = (e == 0x7F800000u && i ? s ^ 0x7F800001u : s);
memcpy(&x, &bits, sizeof(x));
return x;
}
if(e==0u) {
// Denormals and zeroes give inf of the same sign.
bits = s^0x7F800000u;
memcpy(&x, &bits, sizeof(x));
return x;
}
// Refine the division estimate at 64-aligned endpoints with
// table deltas (stored in 4-ulp units), then lerp.
uint32_t A = vfpu_rcp_approx((i ) & -64u);
uint32_t B = vfpu_rcp_approx((i + 64) & -64u);
uint64_t a = (uint64_t(A) << 6) + uint64_t(vfpu_rcp_lut[i >> 6][0]) * 4u;
uint64_t b = (uint64_t(B) << 6) + uint64_t(vfpu_rcp_lut[i >> 6][1]) * 4u;
uint32_t v = uint32_t((a+(((b-a)*(i&63))>>6))>>6);
// Truncate lower 2 bits.
v &= -4u;
// Negate the exponent and apply the sign.
bits = s + (0x3F800000u - e) + v;
memcpy(&x, &bits, sizeof(x));
return x;
}
//==============================================================================
// Eagerly load every lookup table (each function otherwise loads
// its own tables lazily on first call). Safe to call more than
// once: already-loaded tables are skipped. Load failures are
// reported on stderr; the affected functions then fall back to
// their libm approximations.
void psp_vfpu_init(void) {
// Load all in advance.
PSP_VFPU_LOAD_TABLE(vfpu_asin_lut65536 , 1536);
PSP_VFPU_LOAD_TABLE(vfpu_asin_lut_deltas , 517448);
PSP_VFPU_LOAD_TABLE(vfpu_asin_lut_indices , 798916);
PSP_VFPU_LOAD_TABLE(vfpu_exp2_lut65536 , 512);
PSP_VFPU_LOAD_TABLE(vfpu_exp2_lut , 262144);
PSP_VFPU_LOAD_TABLE(vfpu_log2_lut65536 , 516);
PSP_VFPU_LOAD_TABLE(vfpu_log2_lut65536_quadratic, 512);
PSP_VFPU_LOAD_TABLE(vfpu_log2_lut , 2097152);
PSP_VFPU_LOAD_TABLE(vfpu_rcp_lut , 262144);
PSP_VFPU_LOAD_TABLE(vfpu_rsqrt_lut , 262144);
PSP_VFPU_LOAD_TABLE(vfpu_sin_lut8192 , 4100);
PSP_VFPU_LOAD_TABLE(vfpu_sin_lut_delta , 262144);
PSP_VFPU_LOAD_TABLE(vfpu_sin_lut_exceptions , 86938);
PSP_VFPU_LOAD_TABLE(vfpu_sin_lut_interval_delta , 131074);
PSP_VFPU_LOAD_TABLE(vfpu_sqrt_lut , 262144);
}
#endif // PSP_VFPU_IMPLEMENTATION
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment