Skip to content

Instantly share code, notes, and snippets.

@diaphragm-workplace
Created September 21, 2024 21:27
Show Gist options
  • Select an option

  • Save diaphragm-workplace/6b73bb65b888d72c966839d43f295a3f to your computer and use it in GitHub Desktop.

Select an option

Save diaphragm-workplace/6b73bb65b888d72c966839d43f295a3f to your computer and use it in GitHub Desktop.
float16 golang test
package main
/*
#cgo CFLAGS: -march=native -mtune=native -Ofast -flto
#cgo LDFLAGS: -march=native -mtune=native -Ofast
#include <stdint.h>
// Compiles down to a single vector instruction on x86 compatibles and aarch64
// Narrows one float to _Float16 and returns the raw 16-bit pattern.
// The bits are read back through a union: dereferencing a uint16_t* that
// points at a _Float16 object breaks the strict-aliasing rule, which the
// optimizer is allowed to exploit at the -Ofast level used for this file.
static inline uint16_t f32_to_f16_bits(float value)
{
	union {
		_Float16 half;
		uint16_t bits;
	} pun;
	pun.half = (_Float16)value;
	return pun.bits;
}

// Converts eight floats (a..h) to half precision and stores each raw 16-bit
// pattern through the matching res_* pointer.
// Every res_* pointer is dereferenced unconditionally, so none may be null.
void float32_to_float16(
	float a, float b, float c, float d,
	float e, float f, float g, float h,
	uint16_t* res_a, uint16_t* res_b, uint16_t* res_c, uint16_t* res_d,
	uint16_t* res_e, uint16_t* res_f, uint16_t* res_g, uint16_t* res_h)
{
	*res_a = f32_to_f16_bits(a);
	*res_b = f32_to_f16_bits(b);
	*res_c = f32_to_f16_bits(c);
	*res_d = f32_to_f16_bits(d);
	*res_e = f32_to_f16_bits(e);
	*res_f = f32_to_f16_bits(f);
	*res_g = f32_to_f16_bits(g);
	*res_h = f32_to_f16_bits(h);
}
// Reinterprets a raw 16-bit pattern as _Float16 and widens it to float.
// The reinterpretation goes through a union: dereferencing a _Float16* that
// points at a uint16_t object breaks the strict-aliasing rule, which the
// optimizer is allowed to exploit at the -Ofast level used for this file.
static inline float f16_bits_to_f32(uint16_t bits)
{
	union {
		uint16_t bits;
		_Float16 half;
	} pun;
	pun.bits = bits;
	return (float)pun.half;
}

// Converts eight raw half-precision bit patterns (a..h) to float and stores
// each result through the matching res_* pointer.
// Every res_* pointer is dereferenced unconditionally, so none may be null.
void float16_to_float32(
	uint16_t a, uint16_t b, uint16_t c, uint16_t d,
	uint16_t e, uint16_t f, uint16_t g, uint16_t h,
	float* res_a, float* res_b, float* res_c, float* res_d,
	float* res_e, float* res_f, float* res_g, float* res_h)
{
	*res_a = f16_bits_to_f32(a);
	*res_b = f16_bits_to_f32(b);
	*res_c = f16_bits_to_f32(c);
	*res_d = f16_bits_to_f32(d);
	*res_e = f16_bits_to_f32(e);
	*res_f = f16_bits_to_f32(f);
	*res_g = f16_bits_to_f32(g);
	*res_h = f16_bits_to_f32(h);
}
*/
import "C"
// No need to import the entire fmt and math packages just to prove a point...
// Not a big performance impact, just me hating myself.
import "os"
import "time"
import "strconv"
// Software implementation for comparison, see comment near the end
// Type defined by this package is also equivalent to uint16
import "github.com/x448/float16"
// Sample inputs for the conversion demo. They are untyped constants, so they
// are rounded to float32 only at the point where they are passed to a
// float32 parameter.
const (
	E       = 2.71828182845904523536028747135266249775724709369995957496696763 // Euler's number e
	Pi      = 3.14159265358979323846264338327950288419716939937510582097494459 // pi
	Phi     = 1.61803398874989484820458683436563811772030917980576286213544862 // golden ratio
	Sqrt2   = 1.41421356237309504880168872420969807856967187537694807317667974 // square root of 2
	SqrtE   = 1.64872127070012814684865078781416357165377610071014801157507931 // square root of e
	SqrtPi  = 1.77245385090551602729816748334114518279754945612238712821380779 // square root of pi
	SqrtPhi = 1.27201964951406896425242246173749149171560804184009624861664038 // square root of the golden ratio
	Ln2     = 0.693147180559945309417232121458176568075500134360255254120680009 // natural logarithm of 2
)
// log writes message to standard output followed by a single newline.
// The write result is deliberately not checked.
func log(message string) {
	line := append([]byte(message), '\n')
	os.Stdout.Write(line)
}
// printSince logs how much time has passed since a, labelled with name, in
// the form "<name> took <duration>". It is meant to be used as
// `defer printSince(time.Now(), "label")` so the start time is captured when
// the defer statement runs.
func printSince(a time.Time, name string) {
	elapsed := time.Since(a)
	log(name + " took " + elapsed.String())
}
// fF32 formats a float32 in plain decimal notation (no exponent) using the
// fewest digits that still identify the float32 value uniquely.
func fF32(a float32) string {
	digits := strconv.AppendFloat(nil, float64(a), 'f', -1, 32)
	return string(digits)
}
// format8float32 renders its eight arguments with fF32, in order, separated
// by single spaces.
func format8float32(a, b, c, d, e, f, g, h float32) string {
	lanes := [...]float32{a, b, c, d, e, f, g, h}
	out := fF32(lanes[0])
	for _, v := range lanes[1:] {
		out += " " + fF32(v)
	}
	return out
}
// f32tof16 converts eight float32 values to half precision through the C
// helper float32_to_float16 and stores the raw 16-bit patterns through
// resA..resH. The elapsed time of the call is logged on return.
// The C side writes through every result pointer, so none may be nil.
func f32tof16(
	a, b, c, d, e, f, g, h float32,
	resA, resB, resC, resD, resE, resF, resG, resH *uint16) {
	started := time.Now()
	defer func() { printSince(started, "f32tof16") }()

	// View the Go result slots as C uint16_t pointers.
	outA, outB, outC, outD := (*C.uint16_t)(resA), (*C.uint16_t)(resB), (*C.uint16_t)(resC), (*C.uint16_t)(resD)
	outE, outF, outG, outH := (*C.uint16_t)(resE), (*C.uint16_t)(resF), (*C.uint16_t)(resG), (*C.uint16_t)(resH)

	C.float32_to_float16(
		C.float(a), C.float(b), C.float(c), C.float(d),
		C.float(e), C.float(f), C.float(g), C.float(h),
		outA, outB, outC, outD,
		outE, outF, outG, outH,
	)
}
// f16tof32 converts eight raw half-precision bit patterns to float32 and
// stores the results through resA..resH, logging the elapsed time on return.
// The conversion itself is delegated to f16tof32_without_log so the cgo call
// lives in exactly one place.
// The C side writes through every result pointer, so none may be nil.
func f16tof32(
	a, b, c, d, e, f, g, h uint16,
	resA, resB, resC, resD, resE, resF, resG, resH *float32) {
	defer printSince(time.Now(), "f16tof32")
	f16tof32_without_log(
		a, b, c, d, e, f, g, h,
		resA, resB, resC, resD, resE, resF, resG, resH,
	)
}
// f16tof32_without_log converts eight raw half-precision bit patterns to
// float32 through the C helper float16_to_float32 and stores the results
// through resA..resH. It does no timing or logging, which makes it suitable
// for calling in a tight loop.
// The C side writes through every result pointer, so none may be nil.
func f16tof32_without_log(
	a, b, c, d, e, f, g, h uint16,
	resA, resB, resC, resD, resE, resF, resG, resH *float32) {
	// View the Go result slots as C float pointers.
	outA, outB, outC, outD := (*C.float)(resA), (*C.float)(resB), (*C.float)(resC), (*C.float)(resD)
	outE, outF, outG, outH := (*C.float)(resE), (*C.float)(resF), (*C.float)(resG), (*C.float)(resH)

	C.float16_to_float32(
		C.uint16_t(a), C.uint16_t(b), C.uint16_t(c), C.uint16_t(d),
		C.uint16_t(e), C.uint16_t(f), C.uint16_t(g), C.uint16_t(h),
		outA, outB, outC, outD,
		outE, outF, outG, outH,
	)
}
// main runs the demo end to end:
//  1. round-trips eight constants float32 -> float16 -> float32 through the
//     cgo helpers, timing each direction;
//  2. builds a lookup table holding the float32 value of every float16 bit
//     pattern, timing the whole build;
//  3. converts the round-tripped values to float16 with the pure-Go
//     github.com/x448/float16 package and maps them back to float32 through
//     the lookup table, timing both steps.
func main() {
	// f16f32[bits] is the float32 value of the float16 whose raw pattern is bits.
	var f16f32 [65536]float32

	// The cgo wrappers write their results through pointers, so every lane
	// needs real backing storage (nil pointers crash inside the C code).
	a, b, c, d, e, f, g, h := new(uint16), new(uint16), new(uint16), new(uint16), new(uint16), new(uint16), new(uint16), new(uint16)
	i, j, k, l, m, n, o, p := new(float32), new(float32), new(float32), new(float32), new(float32), new(float32), new(float32), new(float32)

	log("Originals: " + format8float32(E, Pi, Phi, Sqrt2, SqrtE, SqrtPi, SqrtPhi, Ln2))
	f32tof16(E, Pi, Phi, Sqrt2, SqrtE, SqrtPi, SqrtPhi, Ln2,
		a, b, c, d, e, f, g, h)
	f16tof32(*a, *b, *c, *d, *e, *f, *g, *h, i, j, k, l, m, n, o, p)
	// Eight separate arguments per direction is hard to read, but it shows the
	// potential gain if the language did this conversion natively and could
	// vectorize a plain loop the same way.
	log("Converted: " + format8float32(*i, *j, *k, *l, *m, *n, *o, *p) +
		"\n Making a lookup table using it (also demostrates the speed of looping through 65535 values)...")

	func() {
		defer printSince(time.Now(), "Generation of little bobby tables")
		// Eight patterns per cgo call. The index is an int so the loop ends
		// with an ordinary bound check instead of relying on uint16
		// wrap-around; len(f16f32) is a multiple of 8, so base+7 always stays
		// inside the table and bits+7 never overflows.
		for base := 0; base < len(f16f32); base += 8 {
			bits := uint16(base)
			f16tof32_without_log(
				bits, bits+1, bits+2, bits+3, bits+4, bits+5, bits+6, bits+7,
				&f16f32[base], &f16f32[base+1], &f16f32[base+2], &f16f32[base+3],
				&f16f32[base+4], &f16f32[base+5], &f16f32[base+6], &f16f32[base+7],
			)
		}
	}()

	// A full dump of the table used to live here. It built one string with +=
	// across all entries and logged every index, which was too slow to keep.

	log("Originals: " + format8float32(E, Pi, Phi, Sqrt2, SqrtE, SqrtPi, SqrtPhi, Ln2))

	func() {
		defer printSince(time.Now(), "float32 >> float16 in software")
		// The table only maps float16 -> float32, so the reverse direction
		// uses the pure-Go float16 package. Results overwrite the lanes
		// filled by the cgo conversion earlier.
		*a = uint16(float16.Fromfloat32(*i))
		*b = uint16(float16.Fromfloat32(*j))
		*c = uint16(float16.Fromfloat32(*k))
		*d = uint16(float16.Fromfloat32(*l))
		*e = uint16(float16.Fromfloat32(*m))
		*f = uint16(float16.Fromfloat32(*n))
		*g = uint16(float16.Fromfloat32(*o))
		*h = uint16(float16.Fromfloat32(*p))
	}()

	func() {
		defer printSince(time.Now(), "Lookup of float16 >> float32")
		*i, *j, *k, *l = f16f32[*a], f16f32[*b], f16f32[*c], f16f32[*d]
		*m, *n, *o, *p = f16f32[*e], f16f32[*f], f16f32[*g], f16f32[*h]
	}()
	log("Converted: " + format8float32(*i, *j, *k, *l, *m, *n, *o, *p))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment