diaphragm-workplace/float16.go

## float16.go
package main

/*
#cgo CFLAGS: -march=native -mtune=native -Ofast -flto
#cgo LDFLAGS: -march=native -mtune=native -Ofast
#include <stdint.h>

// Intended to compile down to a single vector instruction on x86-compatible and aarch64 targets

// Returns the raw 16-bit pattern stored in a half-precision value.
// The bytes are copied rather than read through a casted pointer: reading a
// _Float16 object through a uint16_t lvalue breaks the strict-aliasing rule,
// which the optimizer is allowed to exploit at -Ofast.
static inline uint16_t float16_bits(_Float16 value)
{
    uint16_t bits;
    __builtin_memcpy(&bits, &value, sizeof bits);
    return bits;
}

// Converts the eight floats a..h to _Float16 and stores the bit pattern of
// each result through the matching res_a..res_h pointer.
// Every res_* pointer is written through, so none of them may be NULL.
void float32_to_float16(
    float a, float b, float c, float d,
    float e, float f, float g, float h,
    uint16_t* res_a, uint16_t* res_b, uint16_t* res_c, uint16_t* res_d,
    uint16_t* res_e, uint16_t* res_f, uint16_t* res_g, uint16_t* res_h)
{
    *res_a = float16_bits((_Float16)a);
    *res_b = float16_bits((_Float16)b);
    *res_c = float16_bits((_Float16)c);
    *res_d = float16_bits((_Float16)d);
    *res_e = float16_bits((_Float16)e);
    *res_f = float16_bits((_Float16)f);
    *res_g = float16_bits((_Float16)g);
    *res_h = float16_bits((_Float16)h);
}

// Builds a half-precision value from its raw 16-bit pattern.
// The bytes are copied rather than read through a casted pointer: reading a
// uint16_t object through a _Float16 lvalue breaks the strict-aliasing rule,
// which the optimizer is allowed to exploit at -Ofast.
static inline _Float16 float16_from_bits(uint16_t bits)
{
    _Float16 value;
    __builtin_memcpy(&value, &bits, sizeof value);
    return value;
}

// Interprets a..h as _Float16 bit patterns, widens each one to float and
// stores it through the matching res_a..res_h pointer.
// Every res_* pointer is written through, so none of them may be NULL.
void float16_to_float32(
    uint16_t a, uint16_t b, uint16_t c, uint16_t d,
    uint16_t e, uint16_t f, uint16_t g, uint16_t h,
    float* res_a, float* res_b, float* res_c, float* res_d,
    float* res_e, float* res_f, float* res_g, float* res_h)
{
    *res_a = (float)float16_from_bits(a);
    *res_b = (float)float16_from_bits(b);
    *res_c = (float)float16_from_bits(c);
    *res_d = (float)float16_from_bits(d);
    *res_e = (float)float16_from_bits(e);
    *res_f = (float)float16_from_bits(f);
    *res_g = (float)float16_from_bits(g);
    *res_h = (float)float16_from_bits(h);
}
*/
import "C"

// No need to import the entire fmt and math packages just to prove a point...
// Not a big performance impact, just me hating myself.
import "os"
import "time"
import "strconv"

// Software implementation for comparison, see comment near the end
// Type defined by this package is also equivalent to uint16
import "github.com/x448/float16"

// Mathematical constants used as the sample inputs for the conversions in
// main. They are declared locally as untyped constants, so the math package
// is not imported for them.
const (
	// E is Euler's number, the base of the natural logarithm.
	E       = 2.71828182845904523536028747135266249775724709369995957496696763
	// Pi is the ratio of a circle's circumference to its diameter.
	Pi      = 3.14159265358979323846264338327950288419716939937510582097494459
	// Phi is the golden ratio.
	Phi     = 1.61803398874989484820458683436563811772030917980576286213544862
	// Sqrt2 is the square root of 2.
	Sqrt2   = 1.41421356237309504880168872420969807856967187537694807317667974
	// SqrtE is the square root of E.
	SqrtE   = 1.64872127070012814684865078781416357165377610071014801157507931
	// SqrtPi is the square root of Pi.
	SqrtPi  = 1.77245385090551602729816748334114518279754945612238712821380779
	// SqrtPhi is the square root of Phi.
	SqrtPhi = 1.27201964951406896425242246173749149171560804184009624861664038
	// Ln2 is the natural logarithm of 2.
	Ln2     = 0.693147180559945309417232121458176568075500134360255254120680009
)

// log writes message to standard output, terminated by a newline, in a
// single write. The result of the write is not checked.
func log(message string) {
	line := message + "\n"
	os.Stdout.WriteString(line)
}

// printSince logs how much time has passed since a, in the form
// "<name> took <duration>".
func printSince(a time.Time, name string) {
	elapsed := time.Since(a)
	log(name + " took " + elapsed.String())
}

// fF32 formats a as plain decimal text (no exponent) using the fewest digits
// that still identify the float32 value uniquely.
func fF32(a float32) string {
	const (
		format    = 'f' // decimal notation without an exponent
		precision = -1  // shortest representation that round-trips
		bitSize   = 32  // round as a float32, not a float64
	)
	return strconv.FormatFloat(float64(a), format, precision, bitSize)
}

// format8float32 renders the eight values with fF32, in argument order,
// separated by single spaces.
func format8float32(a, b, c, d, e, f, g, h float32) string {
	firstHalf := fF32(a) + " " + fF32(b) + " " + fF32(c) + " " + fF32(d)
	secondHalf := fF32(e) + " " + fF32(f) + " " + fF32(g) + " " + fF32(h)
	return firstHalf + " " + secondHalf
}

// f32tof16 converts eight float32 values to float16 bit patterns by calling
// the C helper float32_to_float16. Each result is stored through the matching
// res pointer, so none of them may be nil. The elapsed time is logged under
// the name "f32tof16".
func f32tof16(
	a, b, c, d, e, f, g, h float32,
	resA, resB, resC, resD, resE, resF, resG, resH *uint16) {
	started := time.Now()
	// Deferred so the report is printed once the C call has returned.
	defer printSince(started, "f32tof16")

	C.float32_to_float16(
		C.float(a),
		C.float(b),
		C.float(c),
		C.float(d),
		C.float(e),
		C.float(f),
		C.float(g),
		C.float(h),
		(*C.uint16_t)(resA),
		(*C.uint16_t)(resB),
		(*C.uint16_t)(resC),
		(*C.uint16_t)(resD),
		(*C.uint16_t)(resE),
		(*C.uint16_t)(resF),
		(*C.uint16_t)(resG),
		(*C.uint16_t)(resH),
	)
}

// f16tof32 widens eight float16 bit patterns to float32 by calling the C
// helper float16_to_float32. Each result is stored through the matching res
// pointer, so none of them may be nil. The elapsed time is logged under the
// name "f16tof32".
func f16tof32(
	a, b, c, d, e, f, g, h uint16,
	resA, resB, resC, resD, resE, resF, resG, resH *float32) {
	started := time.Now()
	// Deferred so the report is printed once the C call has returned.
	defer printSince(started, "f16tof32")

	C.float16_to_float32(
		C.uint16_t(a),
		C.uint16_t(b),
		C.uint16_t(c),
		C.uint16_t(d),
		C.uint16_t(e),
		C.uint16_t(f),
		C.uint16_t(g),
		C.uint16_t(h),
		(*C.float)(resA),
		(*C.float)(resB),
		(*C.float)(resC),
		(*C.float)(resD),
		(*C.float)(resE),
		(*C.float)(resF),
		(*C.float)(resG),
		(*C.float)(resH),
	)
}

// f16tof32_without_log widens eight float16 bit patterns to float32 by
// calling the C helper float16_to_float32, without printing any timing.
// Each result is stored through the matching res pointer, so none of them
// may be nil.
func f16tof32_without_log(
	a, b, c, d, e, f, g, h uint16,
	resA, resB, resC, resD, resE, resF, resG, resH *float32) {
	C.float16_to_float32(
		C.uint16_t(a),
		C.uint16_t(b),
		C.uint16_t(c),
		C.uint16_t(d),
		C.uint16_t(e),
		C.uint16_t(f),
		C.uint16_t(g),
		C.uint16_t(h),
		(*C.float)(resA),
		(*C.float)(resB),
		(*C.float)(resC),
		(*C.float)(resD),
		(*C.float)(resE),
		(*C.float)(resF),
		(*C.float)(resG),
		(*C.float)(resH),
	)
}

// main demonstrates the cgo float16 helpers. It round-trips eight constants
// through the C conversions, fills a 65536-entry float16 -> float32 lookup
// table with the same C routine, and then repeats the round trip using the
// x448/float16 software encoder together with that table. Each step prints
// how long it took.
func main() {
	var f16f32 [65536]float32
	a, b, c, d, e, f, g, h := new(uint16), new(uint16), new(uint16), new(uint16), new(uint16), new(uint16), new(uint16), new(uint16)
	i, j, k, l, m, n, o, p := new(float32), new(float32), new(float32), new(float32), new(float32), new(float32), new(float32), new(float32)

	// An earlier attempt with plain *uint16 variables segfaulted; allocating
	// every result slot with new was the quickest fix.

	log("Originals: " + format8float32(E, Pi, Phi, Sqrt2, SqrtE, SqrtPi, SqrtPhi, Ln2))

	f32tof16(E, Pi, Phi, Sqrt2, SqrtE, SqrtPi, SqrtPhi, Ln2,
		a, b, c, d, e, f, g, h)
	f16tof32(*a, *b, *c, *d, *e, *f, *g, *h, i, j, k, l, m, n, o, p)
	// That is a lot of arguments and barely readable, but it demonstrates the
	// hypothetical performance benefit of having this implemented at the
	// language level with a loop being optimized this way.

	log("Converted: " + format8float32(*i, *j, *k, *l, *m, *n, *o, *p) +
		"\n Making a lookup table using it (also demonstrates the speed of looping through 65536 values)...")

	func() {
		defer printSince(time.Now(), "Generation of little bobby tables")
		// Scratch slots the C routine writes into, reused for every group.
		var lane [8]float32
		// Walk the table in groups of eight. The index is an int so the loop
		// ends on a plain bound check instead of relying on uint16
		// wrap-around, and so it does not shadow the outer pointer c.
		for base := 0; base < len(f16f32); base += len(lane) {
			bits := uint16(base)
			f16tof32_without_log(
				bits, bits+1, bits+2, bits+3, bits+4, bits+5, bits+6, bits+7,
				&lane[0], &lane[1], &lane[2], &lane[3], &lane[4], &lane[5], &lane[6], &lane[7])
			copy(f16f32[base:], lane[:])
		}
	}()
	// Dumping the whole table was tried and dropped because it was too slow:
	/*
		func() {
			var tmp_str string
			for i, j := range f16f32 {
				tmp_str += strconv.FormatUint(uint64(i), 10) + ": " + fF32(j)
				log(strconv.FormatUint(uint64(i), 10)) // the debug
			}
			log(tmp_str)
		}()
	*/
	log("Originals: " + format8float32(E, Pi, Phi, Sqrt2, SqrtE, SqrtPi, SqrtPhi, Ln2))

	func() {
		defer printSince(time.Now(), "float32 >> float16 in software")
		// The table cannot be searched backwards unless the float32 is exactly
		// a float16 value, so the x448/float16 module does the encoding.
		// NOTE(review): the inputs are *i..*p, i.e. the values that already
		// went through the round trip above, not the original constants
		// printed just before. Confirm that this is intended.
		var ta, tb, tc, td, te, tf, tg, th uint16
		ta, tb, tc, td, te, tf, tg, th = uint16(float16.Fromfloat32(*i)), uint16(float16.Fromfloat32(*j)), uint16(float16.Fromfloat32(*k)), uint16(float16.Fromfloat32(*l)), uint16(float16.Fromfloat32(*m)), uint16(float16.Fromfloat32(*n)), uint16(float16.Fromfloat32(*o)), uint16(float16.Fromfloat32(*p))
		a, b, c, d, e, f, g, h = &ta, &tb, &tc, &td, &te, &tf, &tg, &th
	}()

	func() {
		defer printSince(time.Now(), "Lookup of float16 >> float32")
		// Point each result at its table entry instead of copying the value.
		i, j, k, l, m, n, o, p = &f16f32[*a], &f16f32[*b], &f16f32[*c], &f16f32[*d], &f16f32[*e], &f16f32[*f], &f16f32[*g], &f16f32[*h]
	}()
	log("Converted: " + format8float32(*i, *j, *k, *l, *m, *n, *o, *p))

}
	package main

	/*
	#cgo CFLAGS: -march=native -mtune=native -Ofast -flto
	#cgo LDFLAGS: -march=native -mtune=native -Ofast
	#include <stdint.h>

	// Intended to compile down to a single vector instruction on x86-compatible and aarch64 targets

	// Returns the raw 16-bit pattern stored in a half-precision value.
	// The bytes are copied rather than read through a casted pointer: reading a
	// _Float16 object through a uint16_t lvalue breaks the strict-aliasing rule,
	// which the optimizer is allowed to exploit at -Ofast.
	static inline uint16_t float16_bits(_Float16 value)
	{
		uint16_t bits;
		__builtin_memcpy(&bits, &value, sizeof bits);
		return bits;
	}

	// Converts the eight floats a..h to _Float16 and stores the bit pattern of
	// each result through the matching res_a..res_h pointer.
	// Every res_* pointer is written through, so none of them may be NULL.
	void float32_to_float16(
		float a, float b, float c, float d,
		float e, float f, float g, float h,
		uint16_t* res_a, uint16_t* res_b, uint16_t* res_c, uint16_t* res_d,
		uint16_t* res_e, uint16_t* res_f, uint16_t* res_g, uint16_t* res_h)
	{
		// Store through the pointers. Assigning to res_* itself would only
		// change the local parameter and the caller would never see a result.
		*res_a = float16_bits((_Float16)a);
		*res_b = float16_bits((_Float16)b);
		*res_c = float16_bits((_Float16)c);
		*res_d = float16_bits((_Float16)d);
		*res_e = float16_bits((_Float16)e);
		*res_f = float16_bits((_Float16)f);
		*res_g = float16_bits((_Float16)g);
		*res_h = float16_bits((_Float16)h);
	}

	// Builds a half-precision value from its raw 16-bit pattern.
	// The bytes are copied rather than read through a casted pointer: reading a
	// uint16_t object through a _Float16 lvalue breaks the strict-aliasing rule,
	// which the optimizer is allowed to exploit at -Ofast.
	static inline _Float16 float16_from_bits(uint16_t bits)
	{
		_Float16 value;
		__builtin_memcpy(&value, &bits, sizeof value);
		return value;
	}

	// Interprets a..h as _Float16 bit patterns, widens each one to float and
	// stores it through the matching res_a..res_h pointer.
	// Every res_* pointer is written through, so none of them may be NULL.
	void float16_to_float32(
		uint16_t a, uint16_t b, uint16_t c, uint16_t d,
		uint16_t e, uint16_t f, uint16_t g, uint16_t h,
		float* res_a, float* res_b, float* res_c, float* res_d,
		float* res_e, float* res_f, float* res_g, float* res_h)
	{
		// The bits are reinterpreted, not converted: casting the address of a
		// parameter to _Float16 is not valid C.
		*res_a = (float)float16_from_bits(a);
		*res_b = (float)float16_from_bits(b);
		*res_c = (float)float16_from_bits(c);
		*res_d = (float)float16_from_bits(d);
		*res_e = (float)float16_from_bits(e);
		*res_f = (float)float16_from_bits(f);
		*res_g = (float)float16_from_bits(g);
		*res_h = (float)float16_from_bits(h);
	}
	*/
	import "C"

	// No need to import the entire fmt and math packages just to prove a point...
	// Not a big performance impact, just me hating myself.
	import "os"
	import "time"
	import "strconv"

	// Software implementation for comparison, see comment near the end
	// Type defined by this package is also equivalent to uint16
	import "github.com/x448/float16"

	// Mathematical constants used as the sample inputs for the conversions in
	// main. They are declared locally as untyped constants, so the math package
	// is not imported for them.
	const (
	// E is Euler's number, the base of the natural logarithm.
	E = 2.71828182845904523536028747135266249775724709369995957496696763
	// Pi is the ratio of a circle's circumference to its diameter.
	Pi = 3.14159265358979323846264338327950288419716939937510582097494459
	// Phi is the golden ratio.
	Phi = 1.61803398874989484820458683436563811772030917980576286213544862
	// Sqrt2 is the square root of 2.
	Sqrt2 = 1.41421356237309504880168872420969807856967187537694807317667974
	// SqrtE is the square root of E.
	SqrtE = 1.64872127070012814684865078781416357165377610071014801157507931
	// SqrtPi is the square root of Pi.
	SqrtPi = 1.77245385090551602729816748334114518279754945612238712821380779
	// SqrtPhi is the square root of Phi.
	SqrtPhi = 1.27201964951406896425242246173749149171560804184009624861664038
	// Ln2 is the natural logarithm of 2.
	Ln2 = 0.693147180559945309417232121458176568075500134360255254120680009
	)

	// log writes message to standard output, terminated by a newline, in a
	// single write. The result of the write is not checked.
	func log(message string) {
		line := message + "\n"
		os.Stdout.WriteString(line)
	}

	// printSince logs how much time has passed since a, in the form
	// "<name> took <duration>".
	func printSince(a time.Time, name string) {
		elapsed := time.Since(a)
		log(name + " took " + elapsed.String())
	}

	// fF32 formats a as plain decimal text (no exponent) using the fewest digits
	// that still identify the float32 value uniquely.
	func fF32(a float32) string {
		const (
			format    = 'f' // decimal notation without an exponent
			precision = -1  // shortest representation that round-trips
			bitSize   = 32  // round as a float32, not a float64
		)
		return strconv.FormatFloat(float64(a), format, precision, bitSize)
	}

	// format8float32 renders the eight values with fF32, in argument order,
	// separated by single spaces.
	func format8float32(a, b, c, d, e, f, g, h float32) string {
		firstHalf := fF32(a) + " " + fF32(b) + " " + fF32(c) + " " + fF32(d)
		secondHalf := fF32(e) + " " + fF32(f) + " " + fF32(g) + " " + fF32(h)
		return firstHalf + " " + secondHalf
	}

	// f32tof16 converts eight float32 values to float16 bit patterns by calling
	// the C helper float32_to_float16. Each result is stored through the matching
	// res pointer, so none of them may be nil. The elapsed time is logged under
	// the name "f32tof16".
	func f32tof16(
		a, b, c, d, e, f, g, h float32,
		resA, resB, resC, resD, resE, resF, resG, resH *uint16) {
		defer printSince(time.Now(), "f32tof16")

		// The result arguments are converted to *C.uint16_t. Converting a Go
		// pointer to the integer type C.uint16_t does not compile.
		C.float32_to_float16(
			C.float(a), C.float(b), C.float(c), C.float(d),
			C.float(e), C.float(f), C.float(g), C.float(h),
			(*C.uint16_t)(resA), (*C.uint16_t)(resB), (*C.uint16_t)(resC), (*C.uint16_t)(resD),
			(*C.uint16_t)(resE), (*C.uint16_t)(resF), (*C.uint16_t)(resG), (*C.uint16_t)(resH),
		)
	}

	// f16tof32 widens eight float16 bit patterns to float32 by calling the C
	// helper float16_to_float32. Each result is stored through the matching res
	// pointer, so none of them may be nil. The elapsed time is logged under the
	// name "f16tof32".
	func f16tof32(
		a, b, c, d, e, f, g, h uint16,
		resA, resB, resC, resD, resE, resF, resG, resH *float32) {
		defer printSince(time.Now(), "f16tof32")

		// The result arguments are converted to *C.float. Converting a Go
		// pointer to the numeric type C.float does not compile.
		C.float16_to_float32(
			C.uint16_t(a), C.uint16_t(b), C.uint16_t(c), C.uint16_t(d),
			C.uint16_t(e), C.uint16_t(f), C.uint16_t(g), C.uint16_t(h),
			(*C.float)(resA), (*C.float)(resB), (*C.float)(resC), (*C.float)(resD),
			(*C.float)(resE), (*C.float)(resF), (*C.float)(resG), (*C.float)(resH),
		)
	}

	// f16tof32_without_log widens eight float16 bit patterns to float32 by
	// calling the C helper float16_to_float32, without printing any timing.
	// Each result is stored through the matching res pointer, so none of them
	// may be nil.
	func f16tof32_without_log(
		a, b, c, d, e, f, g, h uint16,
		resA, resB, resC, resD, resE, resF, resG, resH *float32) {
		// The result arguments are converted to *C.float. Converting a Go
		// pointer to the numeric type C.float does not compile.
		C.float16_to_float32(
			C.uint16_t(a), C.uint16_t(b), C.uint16_t(c), C.uint16_t(d),
			C.uint16_t(e), C.uint16_t(f), C.uint16_t(g), C.uint16_t(h),
			(*C.float)(resA), (*C.float)(resB), (*C.float)(resC), (*C.float)(resD),
			(*C.float)(resE), (*C.float)(resF), (*C.float)(resG), (*C.float)(resH),
		)
	}

	func main() {
	var f16f32 [65536]float32
	a, b, c, d, e, f, g, h := new(uint16), new(uint16), new(uint16), new(uint16), new(uint16), new(uint16), new(uint16), new(uint16)
	i, j, k, l, m, n, o, p := new(float32), new(float32), new(float32), new(float32), new(float32), new(float32), new(float32), new(float32)

	// originally used *uint16, sigsegv, fastest solution is this

	log("Originals: " + format8float32(E, Pi, Phi, Sqrt2, SqrtE, SqrtPi, SqrtPhi, Ln2))

	f32tof16(E, Pi, Phi, Sqrt2, SqrtE, SqrtPi, SqrtPhi, Ln2,
	a, b, c, d, e, f, g, h)
	f16tof32(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p)
	// That's a lotta arguements! And barely readable.
	// It's worth to demonstrate the hypothetical performance benefit of having this implemented on language level and a loop getting optimized this way, however.
	// I am not sure if it's spelled optimized or optimised, and at this point I am too afraid to ask...

	log("Converted: " + format8float32(i, j, k, l, m, n, o, p) +
	"\n Making a lookup table using it (also demostrates the speed of looping through 65535 values)...")

	func() {
	defer printSince(time.Now(), "Generation of little bobby tables")
	ti, tj, tk, tl, tm, tn, to, tp := new(float32), new(float32), new(float32), new(float32), new(float32), new(float32), new(float32), new(float32)
	past_one := false
	for c := uint16(0); c != 0 \|\| !past_one; c += 8 {
	past_one = true
	// Gawd, this looks so wrong...
	f16tof32_without_log(c, c+1, c+2, c+3, c+4, c+5, c+6, c+7, ti, tj, tk, tl, tm, tn, to, tp)
	f16f32[c], f16f32[c+1], f16f32[c+2], f16f32[c+3], f16f32[c+4], f16f32[c+5], f16f32[c+6], f16f32[c+7] = ti, tj, tk, tl, tm, tn, to, tp
	//log(strconv.FormatUint(uint64(c), 10)) // the debug
	}
	}()
	// I am too tired to scroll back and make those into actual functions...
	/*
	func() {
	var tmp_str string
	for i, j := range f16f32 {
	tmp_str += strconv.FormatUint(uint64(i), 10) + ": " + fF32(j)
	log(strconv.FormatUint(uint64(i), 10)) // the debug
	}
	log(tmp_str)
	}()
	*/ // was too slow
	log("Originals: " + format8float32(E, Pi, Phi, Sqrt2, SqrtE, SqrtPi, SqrtPhi, Ln2))

	// At this point I don't know what I am doing anymore...

	func() {
	defer printSince(time.Now(), "float32 >> float16 in software")
	// Nvm, I am stupid. I can't just lookup backwards (unless the float32 rounds up to a float16).
	// I used someone else's module.
	var ta, tb, tc, td, te, tf, tg, th uint16
	ta, tb, tc, td, te, tf, tg, th = uint16(float16.Fromfloat32(i)), uint16(float16.Fromfloat32(j)), uint16(float16.Fromfloat32(k)), uint16(float16.Fromfloat32(l)), uint16(float16.Fromfloat32(m)), uint16(float16.Fromfloat32(n)), uint16(float16.Fromfloat32(o)), uint16(float16.Fromfloat32(p))
	a, b, c, d, e, f, g, h = &ta, &tb, &tc, &td, &te, &tf, &tg, &th
	}()
	// Kill me.
	func() {
	defer printSince(time.Now(), "Lookup of float16 >> float32")
	i, j, k, l, m, n, o, p = &f16f32[a], &f16f32[b], &f16f32[c], &f16f32[d], &f16f32[e], &f16f32[f], &f16f32[g], &f16f32[h]
	}()
	log("Converted: " + format8float32(i, j, k, l, m, n, o, p))

	}
No results found