Created
February 15, 2026 19:01
-
-
Save fp64/5ab5822a9d35d1e9b8d9fb24b82a8487 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Copyright (c) 2012- PPSSPP Project. | |
| // This program is free software: you can redistribute it and/or modify | |
| // it under the terms of the GNU General Public License as published by | |
| // the Free Software Foundation, version 2.0 or later versions. | |
| // This program is distributed in the hope that it will be useful, | |
| // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| // GNU General Public License 2.0 for more details. | |
| //============================================================================== | |
| // NOTE: adapted from Core/MIPS/MIPSVFPUUtils.cpp | |
| // in PPSSPP codebase (https://github.com/hrydgard/ppsspp), | |
| // used with permission (https://github.com/hrydgard/ppsspp/issues/21070#issuecomment-3899222722). | |
| // psp_vfpu.h - software implementations of several | |
| // PSP vfpu functions, assumed to be bitwise-exact | |
| // to PSP hardware. | |
| // NOTE: beside this file you also need data tables from | |
| // https://github.com/hrydgard/ppsspp/tree/master/assets/vfpu | |
| // Currently the code only compiles as C++, though | |
| // converting to C should be doable. | |
| /* | |
| // Usage example. | |
| // Instantiate an implementation (in exactly ONE translation unit): | |
| #define PSP_VFPU_IMPLEMENTATION | |
| #include "psp_vfpu.h" | |
| #include <stdio.h> | |
| int main() | |
| { | |
| printf("%.9g\n",vfpu_sin(0.5f)); // prints 0.70710659. | |
| return 0; | |
| } | |
| */ | |
| //============================================================================== | |
| // Header section. | |
| #ifndef PSP_VFPU_H | |
| #define PSP_VFPU_H | |
| #ifdef PSP_VFPU_STATIC | |
| #define PSP_VFPU_DEF static | |
| #else | |
| #define PSP_VFPU_DEF extern | |
| #endif | |
| #include <stdint.h> | |
| #ifdef __cplusplus | |
| extern "C" { | |
| #endif | |
| PSP_VFPU_DEF void psp_vfpu_init(void); | |
| PSP_VFPU_DEF void vrnd_init_default(uint32_t *rcx); | |
| PSP_VFPU_DEF void vrnd_init(uint32_t seed, uint32_t *rcx); | |
| PSP_VFPU_DEF uint32_t vrnd_generate(uint32_t *rcx); | |
| PSP_VFPU_DEF float vfpu_sin(float x); | |
| PSP_VFPU_DEF float vfpu_cos(float x); | |
| PSP_VFPU_DEF void vfpu_sincos(float a, float &s, float &c); | |
| PSP_VFPU_DEF float vfpu_sqrt(float x); | |
| PSP_VFPU_DEF float vfpu_rsqrt(float x); | |
| PSP_VFPU_DEF float vfpu_asin(float x); | |
| PSP_VFPU_DEF float vfpu_exp2(float x); | |
| PSP_VFPU_DEF float vfpu_rexp2(float x); | |
| PSP_VFPU_DEF float vfpu_log2(float x); | |
| PSP_VFPU_DEF float vfpu_rcp(float x); | |
| #ifdef __cplusplus | |
| } | |
| #endif | |
| #endif // PSP_VFPU_H | |
| //============================================================================== | |
| // Implementation section. | |
| #ifdef PSP_VFPU_IMPLEMENTATION | |
| #include <string.h> // for memcpy() | |
| #include <math.h> // for sqrt(double) | |
| //============================================================================== | |
| // The code below attempts to exactly match behaviour of | |
| // PSP's vrnd instructions. See investigation starting around | |
| // https://github.com/hrydgard/ppsspp/issues/16946#issuecomment-1467261209 | |
| // for details. | |
| // Redundant currently, since MIPSState::Init() already | |
| // does this on its own, but left as-is to be self-contained. | |
| void vrnd_init_default(uint32_t *rcx) { | |
| rcx[0] = 0x00000001; | |
| rcx[1] = 0x00000002; | |
| rcx[2] = 0x00000004; | |
| rcx[3] = 0x00000008; | |
| rcx[4] = 0x00000000; | |
| rcx[5] = 0x00000000; | |
| rcx[6] = 0x00000000; | |
| rcx[7] = 0x00000000; | |
| } | |
| void vrnd_init(uint32_t seed, uint32_t *rcx) { | |
| for(int i = 0; i < 8; ++i) rcx[i] = | |
| 0x3F800000u | // 1.0f mask. | |
| ((seed >> ((i / 4) * 16)) & 0xFFFFu) | // lower or upper half of the seed. | |
| (((seed >> (4 * i)) & 0xF) << 16); // remaining nibble. | |
| } | |
| uint32_t vrnd_generate(uint32_t *rcx) { | |
| // The actual RNG state appears to be 5 parts | |
| // (32-bit each) stored into the registers as follows: | |
| uint32_t A = (rcx[0] & 0xFFFFu) | (rcx[4] << 16); | |
| uint32_t B = (rcx[1] & 0xFFFFu) | (rcx[5] << 16); | |
| uint32_t C = (rcx[2] & 0xFFFFu) | (rcx[6] << 16); | |
| uint32_t D = (rcx[3] & 0xFFFFu) | (rcx[7] << 16); | |
| uint32_t E = (((rcx[0] >> 16) & 0xF) << 0) | | |
| (((rcx[1] >> 16) & 0xF) << 4) | | |
| (((rcx[2] >> 16) & 0xF) << 8) | | |
| (((rcx[3] >> 16) & 0xF) << 12) | | |
| (((rcx[4] >> 16) & 0xF) << 16) | | |
| (((rcx[5] >> 16) & 0xF) << 20) | | |
| (((rcx[6] >> 16) & 0xF) << 24) | | |
| (((rcx[7] >> 16) & 0xF) << 28); | |
| // Update. | |
| // LCG with classic parameters. | |
| A = 69069u * A + 1u; // NOTE: decimal constants. | |
| // Xorshift, with classic parameters. Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs". | |
| B ^= B << 13; | |
| B ^= B >> 17; | |
| B ^= B << 5; | |
| // Sequence similar to Pell numbers ( https://en.wikipedia.org/wiki/Pell_number ), | |
| // except with different starting values, and an occasional increment (E). | |
| uint32_t t= 2u * D + C + E; | |
| // NOTE: the details of how E-part is set are somewhat of a guess | |
| // at the moment. The expression below looks weird, but does match | |
| // the available test data. | |
| E = uint32_t((uint64_t(C) + uint64_t(D >> 1) + uint64_t(E)) >> 32); | |
| C = D; | |
| D = t; | |
| // Store. | |
| rcx[0] = 0x3F800000u | (((E >> 0) & 0xF) << 16) | (A & 0xFFFFu); | |
| rcx[1] = 0x3F800000u | (((E >> 4) & 0xF) << 16) | (B & 0xFFFFu); | |
| rcx[2] = 0x3F800000u | (((E >> 8) & 0xF) << 16) | (C & 0xFFFFu); | |
| rcx[3] = 0x3F800000u | (((E >> 12) & 0xF) << 16) | (D & 0xFFFFu); | |
| rcx[4] = 0x3F800000u | (((E >> 16) & 0xF) << 16) | (A >> 16); | |
| rcx[5] = 0x3F800000u | (((E >> 20) & 0xF) << 16) | (B >> 16); | |
| rcx[6] = 0x3F800000u | (((E >> 24) & 0xF) << 16) | (C >> 16); | |
| rcx[7] = 0x3F800000u | (((E >> 28) & 0xF) << 16) | (D >> 16); | |
| // Return value. | |
| return A + B + D; | |
| } | |
| //============================================================================== | |
| // The code below attempts to exactly match the output of | |
| // several PSP's VFPU functions. For the sake of | |
| // making lookup tables smaller the code is | |
| // somewhat gnarly. | |
| // Lookup tables sometimes store deltas from (explicitly computable) | |
| // estimations, to allow to store them in smaller types. | |
| // See https://github.com/hrydgard/ppsspp/issues/16946 for details. | |
| // Lookup tables. | |
| // Note: these are never unloaded, and stay till program termination. | |
| static uint32_t *vfpu_sin_lut8192=nullptr; | |
| static int8_t (*vfpu_sin_lut_delta)[2]=nullptr; | |
| static int16_t *vfpu_sin_lut_interval_delta=nullptr; | |
| static uint8_t *vfpu_sin_lut_exceptions=nullptr; | |
| static int8_t (*vfpu_sqrt_lut)[2]=nullptr; | |
| static int8_t (*vfpu_rsqrt_lut)[2]=nullptr; | |
| static uint32_t *vfpu_exp2_lut65536=nullptr; | |
| static uint8_t (*vfpu_exp2_lut)[2]=nullptr; | |
| static uint32_t *vfpu_log2_lut65536=nullptr; | |
| static uint32_t *vfpu_log2_lut65536_quadratic=nullptr; | |
| static uint8_t (*vfpu_log2_lut)[131072][2]=nullptr; | |
| static int32_t (*vfpu_asin_lut65536)[3]=nullptr; | |
| static uint64_t *vfpu_asin_lut_deltas=nullptr; | |
| static uint16_t *vfpu_asin_lut_indices=nullptr; | |
| static int8_t (*vfpu_rcp_lut)[2]=nullptr; | |
| #ifndef PSP_VFPU_LOAD_TABLE | |
| #include <stdio.h> | |
| template<typename T> | |
| static inline bool psp_vfpu_load_table(T *&ptr, const char *filename, size_t expected_size) { | |
| if (ptr) return true; // Already loaded. | |
| fprintf(stderr, "Loading '%s'...",filename); | |
| if(expected_size % sizeof(T)) { | |
| fprintf(stderr, " FAIL (invalid expected size: %u)!\n", (unsigned)expected_size); | |
| return false; | |
| } | |
| FILE *f = fopen(filename, "rb"); | |
| if(!f) return false; | |
| ptr = new T[expected_size / sizeof(T)]; | |
| size_t size = fread(ptr, 1, expected_size, f); | |
| fclose(f); | |
| if (size != expected_size) { | |
| fprintf(stderr, " FAIL (size=%u, expected: %u)!\n", (unsigned)size, (unsigned)expected_size); | |
| delete[] ptr; | |
| ptr = nullptr; | |
| return false; | |
| } | |
| fprintf(stderr, " ok.\n"); | |
| return true; | |
| } | |
| #define PSP_VFPU_LOAD_TABLE(name, expected_size)\ | |
| psp_vfpu_load_table(name,#name ".dat",expected_size) | |
| #endif // PSP_VFPU_LOAD_TABLE | |
| #ifndef psp_vfpu_clz32 | |
| static int psp_vfpu_clz32(uint32_t x) | |
| { | |
| #ifdef __GNUC__ | |
| return x==0 ? 32 : __builtin_clz(x); | |
| #else | |
| // From Hacker's Delight. | |
| int n; | |
| if (x == 0) return(32); | |
| n = 0; | |
| if (x <= 0x0000FFFF) {n = n +16; x = x <<16;} | |
| if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;} | |
| if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;} | |
| if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;} | |
| if (x <= 0x7FFFFFFF) {n = n + 1;} | |
| return n; | |
| #endif | |
| } | |
| #endif // psp_vfpu_clz32 | |
| // Fallback functions. | |
| #ifndef PSP_VFPU_OWN_FALLBACKS | |
| static inline float vfpu_sin_fallback(float x) {return sinf(1.57079633f*x);} | |
| static inline float vfpu_cos_fallback(float x) {return cosf(1.57079633f*x);} | |
| static inline float vfpu_sqrt_fallback(float x) {return sqrtf(x);} | |
| static inline float vfpu_rsqrt_fallback(float x) {return 1.0f/sqrtf(x);} | |
| static inline float vfpu_asin_fallback(float x) {return 0.636619772f*asinf(x);} | |
| static inline float vfpu_exp2_fallback(float x) {return exp2f(x);} | |
| static inline float vfpu_log2_fallback(float x) {return log2f(x);} | |
| static inline float vfpu_rcp_fallback(float x) {return 1.0f/x;} | |
| #endif | |
| // Note: PSP sin/cos output only has 22 significant | |
| // binary digits. | |
| static inline uint32_t vfpu_sin_quantum(uint32_t x) { | |
| return x < 1u << 22? | |
| 1u: | |
| 1u << (32 - 22 - psp_vfpu_clz32(x)); | |
| } | |
| static inline uint32_t vfpu_sin_truncate_bits(uint32_t x) { | |
| return x & -vfpu_sin_quantum(x); | |
| } | |
| static inline uint32_t vfpu_sin_fixed(uint32_t arg) { | |
| // Handle endpoints. | |
| if(arg == 0u) return 0u; | |
| if(arg == 0x00800000) return 0x10000000; | |
| // Get endpoints for 8192-wide interval. | |
| uint32_t L = vfpu_sin_lut8192[(arg >> 13) + 0]; | |
| uint32_t H = vfpu_sin_lut8192[(arg >> 13) + 1]; | |
| // Approximate endpoints for 64-wide interval via lerp. | |
| uint32_t A = L+(((H - L)*(((arg >> 6) & 127) + 0)) >> 7); | |
| uint32_t B = L+(((H - L)*(((arg >> 6) & 127) + 1)) >> 7); | |
| // Adjust endpoints from deltas, and increase working precision. | |
| uint64_t a = (uint64_t(A) << 5) + uint64_t(vfpu_sin_lut_delta[arg >> 6][0]) * vfpu_sin_quantum(A); | |
| uint64_t b = (uint64_t(B) << 5) + uint64_t(vfpu_sin_lut_delta[arg >> 6][1]) * vfpu_sin_quantum(B); | |
| // Compute approximation via lerp. Is off by at most 1 quantum. | |
| uint32_t v = uint32_t(((a * (64 - (arg & 63)) + b * (arg & 63)) >> 6) >> 5); | |
| v=vfpu_sin_truncate_bits(v); | |
| // Look up exceptions via binary search. | |
| // Note: vfpu_sin_lut_interval_delta stores | |
| // deltas from interval estimation. | |
| uint32_t lo = ((169u * ((arg >> 7) + 0)) >> 7)+uint32_t(vfpu_sin_lut_interval_delta[(arg >> 7) + 0]) + 16384u; | |
| uint32_t hi = ((169u * ((arg >> 7) + 1)) >> 7)+uint32_t(vfpu_sin_lut_interval_delta[(arg >> 7) + 1]) + 16384u; | |
| while(lo < hi) { | |
| uint32_t m = (lo + hi) / 2; | |
| // Note: vfpu_sin_lut_exceptions stores | |
| // index&127 (for each initial interval the | |
| // upper bits of index are the same, namely | |
| // arg&-128), plus direction (0 for +1, and | |
| // 128 for -1). | |
| uint32_t b = vfpu_sin_lut_exceptions[m]; | |
| uint32_t e = (arg & -128u)+(b & 127u); | |
| if(e == arg) { | |
| v += vfpu_sin_quantum(v) * (b >> 7 ? -1u : +1u); | |
| break; | |
| } | |
| else if(e < arg) lo = m + 1; | |
| else hi = m; | |
| } | |
| return v; | |
| } | |
| float vfpu_sin(float x) { | |
| static bool loaded = | |
| PSP_VFPU_LOAD_TABLE(vfpu_sin_lut8192, 4100)&& | |
| PSP_VFPU_LOAD_TABLE(vfpu_sin_lut_delta, 262144)&& | |
| PSP_VFPU_LOAD_TABLE(vfpu_sin_lut_interval_delta, 131074)&& | |
| PSP_VFPU_LOAD_TABLE(vfpu_sin_lut_exceptions, 86938); | |
| if (!loaded) | |
| return vfpu_sin_fallback(x); | |
| uint32_t bits; | |
| memcpy(&bits, &x, sizeof(x)); | |
| uint32_t sign = bits & 0x80000000u; | |
| uint32_t exponent = (bits >> 23) & 0xFFu; | |
| uint32_t significand = (bits & 0x007FFFFFu) | 0x00800000u; | |
| if(exponent == 0xFFu) { | |
| // NOTE: this bitpattern is a signaling | |
| // NaN on x86, so maybe just return | |
| // a normal qNaN? | |
| float y; | |
| bits=sign ^ 0x7F800001u; | |
| memcpy(&y, &bits, sizeof(y)); | |
| return y; | |
| } | |
| if(exponent < 0x7Fu) { | |
| if(exponent < 0x7Fu-23u) significand = 0u; | |
| else significand >>= (0x7F - exponent); | |
| } | |
| else if(exponent > 0x7Fu) { | |
| // There is weirdness for large exponents. | |
| if(exponent - 0x7Fu >= 25u && exponent - 0x7Fu < 32u) significand = 0u; | |
| else if((exponent & 0x9Fu) == 0x9Fu) significand = 0u; | |
| else significand <<= ((exponent - 0x7Fu) & 31); | |
| } | |
| sign ^= ((significand << 7) & 0x80000000u); | |
| significand &= 0x00FFFFFFu; | |
| if(significand > 0x00800000u) significand = 0x01000000u - significand; | |
| uint32_t ret = vfpu_sin_fixed(significand); | |
| return (sign ? -1.0f : +1.0f) * float(int32_t(ret)) * 3.7252903e-09f; // 0x1p-28f | |
| } | |
| float vfpu_cos(float x) { | |
| static bool loaded = | |
| PSP_VFPU_LOAD_TABLE(vfpu_sin_lut8192, 4100)&& | |
| PSP_VFPU_LOAD_TABLE(vfpu_sin_lut_delta, 262144)&& | |
| PSP_VFPU_LOAD_TABLE(vfpu_sin_lut_interval_delta, 131074)&& | |
| PSP_VFPU_LOAD_TABLE(vfpu_sin_lut_exceptions, 86938); | |
| if (!loaded) | |
| return vfpu_cos_fallback(x); | |
| uint32_t bits; | |
| memcpy(&bits, &x, sizeof(x)); | |
| bits &= 0x7FFFFFFFu; | |
| uint32_t sign = 0u; | |
| uint32_t exponent = (bits >> 23) & 0xFFu; | |
| uint32_t significand = (bits & 0x007FFFFFu) | 0x00800000u; | |
| if(exponent == 0xFFu) { | |
| // NOTE: this bitpattern is a signaling | |
| // NaN on x86, so maybe just return | |
| // a normal qNaN? | |
| float y; | |
| bits = sign ^ 0x7F800001u; | |
| memcpy(&y, &bits, sizeof(y)); | |
| return y; | |
| } | |
| if(exponent < 0x7Fu) { | |
| if(exponent < 0x7Fu - 23u) significand = 0u; | |
| else significand >>= (0x7F - exponent); | |
| } | |
| else if(exponent > 0x7Fu) { | |
| // There is weirdness for large exponents. | |
| if(exponent - 0x7Fu >= 25u && exponent - 0x7Fu < 32u) significand = 0u; | |
| else if((exponent & 0x9Fu) == 0x9Fu) significand = 0u; | |
| else significand <<= ((exponent - 0x7Fu) & 31); | |
| } | |
| sign ^= ((significand << 7) & 0x80000000u); | |
| significand &= 0x00FFFFFFu; | |
| if(significand >= 0x00800000u) { | |
| significand = 0x01000000u - significand; | |
| sign ^= 0x80000000u; | |
| } | |
| uint32_t ret = vfpu_sin_fixed(0x00800000u - significand); | |
| return (sign ? -1.0f : +1.0f) * float(int32_t(ret)) * 3.7252903e-09f; // 0x1p-28f | |
| } | |
| void vfpu_sincos(float a, float &s, float &c) { | |
| // Just invoke both sin and cos. | |
| // Suboptimal but whatever. | |
| s = vfpu_sin(a); | |
| c = vfpu_cos(a); | |
| } | |
| // Integer square root of 2^23*x (rounded to zero). | |
| // Input is in 2^23 <= x < 2^25, and representable in float. | |
| static inline uint32_t isqrt23(uint32_t x) { | |
| #if 0 | |
| // Reference version. | |
| int dir=fesetround(FE_TOWARDZERO); | |
| uint32_t ret=uint32_t(int32_t(sqrtf(float(int32_t(x)) * 8388608.0f))); | |
| fesetround(dir); | |
| return ret; | |
| #elif 1 | |
| // Double version. | |
| // Verified to produce correct result for all valid inputs, | |
| // in all rounding modes, both in double and double-extended (x87) | |
| // precision. | |
| // Requires correctly-rounded sqrt (which on any IEEE-754 system | |
| // it should be). | |
| return uint32_t(int32_t(sqrt(double(x) * 8388608.0))); | |
| #else | |
| // Pure integer version, if you don't like floating point. | |
| // Based on code from Hacker's Delight. See isqrt4 in | |
| // https://github.com/hcs0/Hackers-Delight/blob/master/isqrt.c.txt | |
| // Relatively slow. | |
| uint64_t t=uint64_t(x) << 23, m, y, b; | |
| m=0x4000000000000000ull; | |
| y=0; | |
| while(m != 0) // Do 32 times. | |
| { | |
| b=y | m; | |
| y=y >> 1; | |
| if(t >= b) | |
| { | |
| t = t - b; | |
| y = y | m; | |
| } | |
| m = m >> 2; | |
| } | |
| return y; | |
| #endif | |
| } | |
| // Returns floating-point bitpattern. | |
| static inline uint32_t vfpu_sqrt_fixed(uint32_t x) { | |
| // Endpoints of input. | |
| uint32_t lo =(x + 0u) & -64u; | |
| uint32_t hi = (x + 64u) & -64u; | |
| // Convert input to 9.23 fixed-point. | |
| lo = (lo >= 0x00400000u ? 4u * lo : 0x00800000u + 2u * lo); | |
| hi = (hi >= 0x00400000u ? 4u * hi : 0x00800000u + 2u * hi); | |
| // Estimate endpoints of output. | |
| uint32_t A = 0x3F000000u + isqrt23(lo); | |
| uint32_t B = 0x3F000000u + isqrt23(hi); | |
| // Apply deltas, and increase the working precision. | |
| uint64_t a = (uint64_t(A) << 4) + uint64_t(vfpu_sqrt_lut[x >> 6][0]); | |
| uint64_t b = (uint64_t(B) << 4) + uint64_t(vfpu_sqrt_lut[x >> 6][1]); | |
| uint32_t ret = uint32_t((a + (((b - a) * (x & 63)) >> 6)) >> 4); | |
| // Truncate lower 2 bits. | |
| ret &= -4u; | |
| return ret; | |
| } | |
| float vfpu_sqrt(float x) { | |
| static bool loaded = | |
| PSP_VFPU_LOAD_TABLE(vfpu_sqrt_lut, 262144); | |
| if (!loaded) | |
| return vfpu_sqrt_fallback(x); | |
| uint32_t bits; | |
| memcpy(&bits, &x, sizeof(bits)); | |
| if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) { | |
| // Denormals (and zeroes) get +0, regardless | |
| // of sign. | |
| return +0.0f; | |
| } | |
| if(bits >> 31) { | |
| // Other negatives get NaN. | |
| bits = 0x7F800001u; | |
| memcpy(&x, &bits, sizeof(x)); | |
| return x; | |
| } | |
| if((bits >> 23) == 255u) { | |
| // Inf/NaN gets Inf/NaN. | |
| bits = 0x7F800000u + ((bits & 0x007FFFFFu) != 0u); | |
| memcpy(&x, &bits, sizeof(bits)); | |
| return x; | |
| } | |
| int32_t exponent = int32_t(bits >> 23) - 127; | |
| // Bottom bit of exponent (inverted) + significand (except bottom bit). | |
| uint32_t index = ((bits + 0x00800000u) >> 1) & 0x007FFFFFu; | |
| bits = vfpu_sqrt_fixed(index); | |
| bits += uint32_t(exponent >> 1) << 23; | |
| memcpy(&x, &bits, sizeof(bits)); | |
| return x; | |
| } | |
| // Returns floor(2^33/sqrt(x)), for 2^22 <= x < 2^24. | |
| static inline uint32_t rsqrt_floor22(uint32_t x) { | |
| #if 1 | |
| // Verified correct in all rounding directions, | |
| // by exhaustive search. | |
| return uint32_t(8589934592.0 / sqrt(double(x))); // 0x1p33 | |
| #else | |
| // Pure integer version, if you don't like floating point. | |
| // Based on code from Hacker's Delight. See isqrt4 in | |
| // https://github.com/hcs0/Hackers-Delight/blob/master/isqrt.c.txt | |
| // Relatively slow. | |
| uint64_t t=uint64_t(x) << 22, m, y, b; | |
| m = 0x4000000000000000ull; | |
| y = 0; | |
| while(m != 0) // Do 32 times. | |
| { | |
| b = y | m; | |
| y = y >> 1; | |
| if(t >= b) | |
| { | |
| t = t - b; | |
| y = y | m; | |
| } | |
| m = m >> 2; | |
| } | |
| y = (1ull << 44) / y; | |
| // Decrement if y > floor(2^33 / sqrt(x)). | |
| // This hack works because exhaustive | |
| // search (on [2^22;2^24]) says it does. | |
| if((y * y >> 3) * x > (1ull << 63) - 3ull * (((y & 7) == 6) << 21)) --y; | |
| return uint32_t(y); | |
| #endif | |
| } | |
| // Returns floating-point bitpattern. | |
| static inline uint32_t vfpu_rsqrt_fixed(uint32_t x) { | |
| // Endpoints of input. | |
| uint32_t lo = (x + 0u) & -64u; | |
| uint32_t hi = (x + 64u) & -64u; | |
| // Convert input to 10.22 fixed-point. | |
| lo = (lo >= 0x00400000u ? 2u * lo : 0x00400000u + lo); | |
| hi = (hi >= 0x00400000u ? 2u * hi : 0x00400000u + hi); | |
| // Estimate endpoints of output. | |
| uint32_t A = 0x3E800000u + 4u * rsqrt_floor22(lo); | |
| uint32_t B = 0x3E800000u + 4u * rsqrt_floor22(hi); | |
| // Apply deltas, and increase the working precision. | |
| uint64_t a = (uint64_t(A) << 4) + uint64_t(vfpu_rsqrt_lut[x >> 6][0]); | |
| uint64_t b = (uint64_t(B) << 4) + uint64_t(vfpu_rsqrt_lut[x >> 6][1]); | |
| // Evaluate via lerp. | |
| uint32_t ret = uint32_t((a + (((b - a) * (x & 63)) >> 6)) >> 4); | |
| // Truncate lower 2 bits. | |
| ret &= -4u; | |
| return ret; | |
| } | |
| float vfpu_rsqrt(float x) { | |
| static bool loaded = | |
| PSP_VFPU_LOAD_TABLE(vfpu_rsqrt_lut, 262144); | |
| if (!loaded) | |
| return vfpu_rsqrt_fallback(x); | |
| uint32_t bits; | |
| memcpy(&bits, &x, sizeof(bits)); | |
| if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) { | |
| // Denormals (and zeroes) get inf of the same sign. | |
| bits = 0x7F800000u | (bits & 0x80000000u); | |
| memcpy(&x, &bits, sizeof(x)); | |
| return x; | |
| } | |
| if(bits >> 31) { | |
| // Other negatives get negative NaN. | |
| bits = 0xFF800001u; | |
| memcpy(&x, &bits, sizeof(x)); | |
| return x; | |
| } | |
| if((bits >> 23) == 255u) { | |
| // inf gets 0, NaN gets NaN. | |
| bits = ((bits & 0x007FFFFFu) ? 0x7F800001u : 0u); | |
| memcpy(&x, &bits, sizeof(bits)); | |
| return x; | |
| } | |
| int32_t exponent = int32_t(bits >> 23) - 127; | |
| // Bottom bit of exponent (inverted) + significand (except bottom bit). | |
| uint32_t index = ((bits + 0x00800000u) >> 1) & 0x007FFFFFu; | |
| bits = vfpu_rsqrt_fixed(index); | |
| bits -= uint32_t(exponent >> 1) << 23; | |
| memcpy(&x, &bits, sizeof(bits)); | |
| return x; | |
| } | |
| static inline uint32_t vfpu_asin_quantum(uint32_t x) { | |
| return x<1u<<23? | |
| 1u: | |
| 1u<<(32-23-psp_vfpu_clz32(x)); | |
| } | |
| static inline uint32_t vfpu_asin_truncate_bits(uint32_t x) { | |
| return x & -vfpu_asin_quantum(x); | |
| } | |
| // Input is fixed 9.23, output is fixed 2.30. | |
| static inline uint32_t vfpu_asin_approx(uint32_t x) { | |
| const int32_t *C = vfpu_asin_lut65536[x >> 16]; | |
| x &= 0xFFFFu; | |
| return vfpu_asin_truncate_bits(uint32_t((((((int64_t(C[2]) * x) >> 16) + int64_t(C[1])) * x) >> 16) + C[0])); | |
| } | |
| // Input is fixed 9.23, output is fixed 2.30. | |
| static uint32_t vfpu_asin_fixed(uint32_t x) { | |
| if(x == 0u) return 0u; | |
| if(x == 1u << 23) return 1u << 30; | |
| uint32_t ret = vfpu_asin_approx(x); | |
| uint32_t index = vfpu_asin_lut_indices[x / 21u]; | |
| uint64_t deltas = vfpu_asin_lut_deltas[index]; | |
| return ret + (3u - uint32_t((deltas >> (3u * (x % 21u))) & 7u)) * vfpu_asin_quantum(ret); | |
| } | |
| float vfpu_asin(float x) { | |
| static bool loaded = | |
| PSP_VFPU_LOAD_TABLE(vfpu_asin_lut65536, 1536)&& | |
| PSP_VFPU_LOAD_TABLE(vfpu_asin_lut_indices, 798916)&& | |
| PSP_VFPU_LOAD_TABLE(vfpu_asin_lut_deltas, 517448); | |
| if (!loaded) | |
| return vfpu_asin_fallback(x); | |
| uint32_t bits; | |
| memcpy(&bits, &x, sizeof(x)); | |
| uint32_t sign = bits & 0x80000000u; | |
| bits = bits & 0x7FFFFFFFu; | |
| if(bits > 0x3F800000u) { | |
| bits = 0x7F800001u ^ sign; | |
| memcpy(&x, &bits, sizeof(x)); | |
| return x; | |
| } | |
| bits = vfpu_asin_fixed(uint32_t(int32_t(fabsf(x) * 8388608.0f))); // 0x1p23 | |
| x=float(int32_t(bits)) * 9.31322574615478515625e-10f; // 0x1p-30 | |
| if(sign) x = -x; | |
| return x; | |
| } | |
| static inline uint32_t vfpu_exp2_approx(uint32_t x) { | |
| if(x == 0x00800000u) return 0x00800000u; | |
| uint32_t a=vfpu_exp2_lut65536[x >> 16]; | |
| x &= 0x0000FFFFu; | |
| uint32_t b = uint32_t(((2977151143ull * x) >> 23) + ((1032119999ull * (x * x)) >> 46)); | |
| return (a + uint32_t((uint64_t(a + (1u << 23)) * uint64_t(b)) >> 32)) & -4u; | |
| } | |
| static inline uint32_t vfpu_exp2_fixed(uint32_t x) { | |
| if(x == 0u) return 0u; | |
| if(x == 0x00800000u) return 0x00800000u; | |
| uint32_t A = vfpu_exp2_approx((x ) & -64u); | |
| uint32_t B = vfpu_exp2_approx((x + 64) & -64u); | |
| uint64_t a = (A<<4)+vfpu_exp2_lut[x >> 6][0]-64u; | |
| uint64_t b = (B<<4)+vfpu_exp2_lut[x >> 6][1]-64u; | |
| uint32_t y = uint32_t((a + (((b - a) * (x & 63)) >> 6)) >> 4); | |
| y &= -4u; | |
| return y; | |
| } | |
| float vfpu_exp2(float x) { | |
| static bool loaded = | |
| PSP_VFPU_LOAD_TABLE(vfpu_exp2_lut65536, 512)&& | |
| PSP_VFPU_LOAD_TABLE(vfpu_exp2_lut, 262144); | |
| if (!loaded) | |
| return vfpu_exp2_fallback(x); | |
| int32_t bits; | |
| memcpy(&bits, &x, sizeof(bits)); | |
| if((bits & 0x7FFFFFFF) <= 0x007FFFFF) { | |
| // Denormals are treated as 0. | |
| return 1.0f; | |
| } | |
| if(x != x) { | |
| // NaN gets NaN. | |
| bits = 0x7F800001u; | |
| memcpy(&x, &bits, sizeof(x)); | |
| return x; | |
| } | |
| if(x <= -126.0f) { | |
| // Small numbers get 0 (exp2(-126) is smallest positive non-denormal). | |
| // But yes, -126.0f produces +0.0f. | |
| return 0.0f; | |
| } | |
| if(x >= +128.0f) { | |
| // Large numbers get infinity. | |
| bits = 0x7F800000u; | |
| memcpy(&x, &bits, sizeof(x)); | |
| return x; | |
| } | |
| bits = int32_t(x * 0x1p23f); | |
| if(x < 0.0f) --bits; // Yes, really. | |
| bits = int32_t(0x3F800000) + (bits & int32_t(0xFF800000)) + int32_t(vfpu_exp2_fixed(bits & int32_t(0x007FFFFF))); | |
| memcpy(&x, &bits, sizeof(bits)); | |
| return x; | |
| } | |
| float vfpu_rexp2(float x) { | |
| return vfpu_exp2(-x); | |
| } | |
| // Input fixed 9.23, output fixed 10.22. | |
| // Returns log2(1+x). | |
| static inline uint32_t vfpu_log2_approx(uint32_t x) { | |
| uint32_t a = vfpu_log2_lut65536[(x >> 16) + 0]; | |
| uint32_t b = vfpu_log2_lut65536[(x >> 16) + 1]; | |
| uint32_t c = vfpu_log2_lut65536_quadratic[x >> 16]; | |
| x &= 0xFFFFu; | |
| uint64_t ret = uint64_t(a) * (0x10000u - x) + uint64_t(b) * x; | |
| uint64_t d = (uint64_t(c) * x * (0x10000u-x)) >> 40; | |
| ret += d; | |
| return uint32_t(ret >> 16); | |
| } | |
| // Matches PSP output on all known values. | |
| float vfpu_log2(float x) { | |
| static bool loaded = | |
| PSP_VFPU_LOAD_TABLE(vfpu_log2_lut65536, 516)&& | |
| PSP_VFPU_LOAD_TABLE(vfpu_log2_lut65536_quadratic, 512)&& | |
| PSP_VFPU_LOAD_TABLE(vfpu_log2_lut, 2097152); | |
| if (!loaded) | |
| return vfpu_log2_fallback(x); | |
| uint32_t bits; | |
| memcpy(&bits, &x, sizeof(bits)); | |
| if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) { | |
| // Denormals (and zeroes) get -inf. | |
| bits = 0xFF800000u; | |
| memcpy(&x, &bits, sizeof(x)); | |
| return x; | |
| } | |
| if(bits & 0x80000000u) { | |
| // Other negatives get NaN. | |
| bits = 0x7F800001u; | |
| memcpy(&x, &bits, sizeof(x)); | |
| return x; | |
| } | |
| if((bits >> 23) == 255u) { | |
| // NaN gets NaN, +inf gets +inf. | |
| bits = 0x7F800000u + ((bits & 0x007FFFFFu) != 0); | |
| memcpy(&x, &bits, sizeof(x)); | |
| return x; | |
| } | |
| uint32_t e = (bits & 0x7F800000u) - 0x3F800000u; | |
| uint32_t i = bits & 0x007FFFFFu; | |
| if(e >> 31 && i >= 0x007FFE00u) { | |
| // Process 1-2^{-14}<=x*2^n<1 (for n>0) separately, | |
| // since the table doesn't give the right answer. | |
| float c = float(int32_t(~e) >> 23); | |
| // Note: if c is 0 the sign of -0 output is correct. | |
| return i < 0x007FFEF7u ? // 1-265*2^{-24} | |
| -3.05175781e-05f - c: | |
| -0.0f - c; | |
| } | |
| int d = (e < 0x01000000u ? 0 : 8 - psp_vfpu_clz32(e) - int(e >> 31)); | |
| //assert(d >= 0 && d < 8); | |
| uint32_t q = 1u << d; | |
| uint32_t A = vfpu_log2_approx((i ) & -64u) & -q; | |
| uint32_t B = vfpu_log2_approx((i + 64) & -64u) & -q; | |
| uint64_t a = (A << 6)+(uint64_t(vfpu_log2_lut[d][i >> 6][0]) - 80ull) * q; | |
| uint64_t b = (B << 6)+(uint64_t(vfpu_log2_lut[d][i >> 6][1]) - 80ull) * q; | |
| uint32_t v = uint32_t((a +(((b - a) * (i & 63)) >> 6)) >> 6); | |
| v &= -q; | |
| bits = e ^ (2u * v); | |
| x = float(int32_t(bits)) * 1.1920928955e-7f; // 0x1p-23f | |
| return x; | |
| } | |
| static inline uint32_t vfpu_rcp_approx(uint32_t i) { | |
| return 0x3E800000u + (uint32_t((1ull << 47) / ((1ull << 23) + i)) & -4u); | |
| } | |
| float vfpu_rcp(float x) { | |
| static bool loaded = | |
| PSP_VFPU_LOAD_TABLE(vfpu_rcp_lut, 262144); | |
| if (!loaded) | |
| return vfpu_rcp_fallback(x); | |
| uint32_t bits; | |
| memcpy(&bits, &x, sizeof(bits)); | |
| uint32_t s = bits & 0x80000000u; | |
| uint32_t e = bits & 0x7F800000u; | |
| uint32_t i = bits & 0x007FFFFFu; | |
| if((bits & 0x7FFFFFFFu) > 0x7E800000u) { | |
| bits = (e == 0x7F800000u && i ? s ^ 0x7F800001u : s); | |
| memcpy(&x, &bits, sizeof(x)); | |
| return x; | |
| } | |
| if(e==0u) { | |
| bits = s^0x7F800000u; | |
| memcpy(&x, &bits, sizeof(x)); | |
| return x; | |
| } | |
| uint32_t A = vfpu_rcp_approx((i ) & -64u); | |
| uint32_t B = vfpu_rcp_approx((i + 64) & -64u); | |
| uint64_t a = (uint64_t(A) << 6) + uint64_t(vfpu_rcp_lut[i >> 6][0]) * 4u; | |
| uint64_t b = (uint64_t(B) << 6) + uint64_t(vfpu_rcp_lut[i >> 6][1]) * 4u; | |
| uint32_t v = uint32_t((a+(((b-a)*(i&63))>>6))>>6); | |
| v &= -4u; | |
| bits = s + (0x3F800000u - e) + v; | |
| memcpy(&x, &bits, sizeof(x)); | |
| return x; | |
| } | |
| //============================================================================== | |
| void psp_vfpu_init(void) { | |
| // Load all in advance. | |
| PSP_VFPU_LOAD_TABLE(vfpu_asin_lut65536 , 1536); | |
| PSP_VFPU_LOAD_TABLE(vfpu_asin_lut_deltas , 517448); | |
| PSP_VFPU_LOAD_TABLE(vfpu_asin_lut_indices , 798916); | |
| PSP_VFPU_LOAD_TABLE(vfpu_exp2_lut65536 , 512); | |
| PSP_VFPU_LOAD_TABLE(vfpu_exp2_lut , 262144); | |
| PSP_VFPU_LOAD_TABLE(vfpu_log2_lut65536 , 516); | |
| PSP_VFPU_LOAD_TABLE(vfpu_log2_lut65536_quadratic, 512); | |
| PSP_VFPU_LOAD_TABLE(vfpu_log2_lut , 2097152); | |
| PSP_VFPU_LOAD_TABLE(vfpu_rcp_lut , 262144); | |
| PSP_VFPU_LOAD_TABLE(vfpu_rsqrt_lut , 262144); | |
| PSP_VFPU_LOAD_TABLE(vfpu_sin_lut8192 , 4100); | |
| PSP_VFPU_LOAD_TABLE(vfpu_sin_lut_delta , 262144); | |
| PSP_VFPU_LOAD_TABLE(vfpu_sin_lut_exceptions , 86938); | |
| PSP_VFPU_LOAD_TABLE(vfpu_sin_lut_interval_delta , 131074); | |
| PSP_VFPU_LOAD_TABLE(vfpu_sqrt_lut , 262144); | |
| } | |
| #endif // PSP_VFPU_IMPLEMENTATION |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment