achille/gist:23680e9100db87565a8e67038797b27d

## gistfile1.txt
Context:
https://gist.github.com/achille/d1eadf82aa54056b9ded7706e8f56760
https://chatgpt.com/share/6959ed27-accc-800e-8e87-21aa81e93c07

Claude Benchmark:
Config: Moderate (150,100) - CORRECT SOLUTIONS ONLY
────────────────────────────────────────────────────────────────────────────────
  Method                  Time       max_err    wrong/20000
  ─────────────────────────────────────────────────────────────────────────────
  faded_opt (2 iter)      30.1 ns    0.18       0  ✅
  chatgpt_2iter           30.1 ns    0.18       0  ✅  (same as faded_opt 2)
  chatgpt_adaptive        35.5 ns    0.018      0  ✅
  chatgpt_hybrid (2+1)    41.4 ns    0.020      0  ✅
  faded_opt (3 iter)      47.6 ns    0.018      0  ✅

  claude_rot (2 iter)     21.2 ns    52         1018  ❌ BROKEN
  claude_rot (3 iter)     33.7 ns    43         909   ❌ BROKEN
  claude_rot (4 iter)     43.8 ns    44         891   ❌ BROKEN


Chatgpt Benchmark:
Ran it **in my environment** (g++ `-O3 -march=native -ffast-math`, `float`), measuring:

* **runtime:** `ns/call` (median over 9 runs, 50k points)
* **accuracy:** absolute **distance error vs a robust double-precision reference** (max / avg / p99), plus `maxEqErr = |(x/a)^2+(y/b)^2-1|` for the returned point.

### Moderate (150, 100)

| method             | ns/call | max dist err | avg dist err | p99 dist err |  maxEqErr |
| ------------------ | ------: | -----------: | -----------: | -----------: | --------: |
| yours (hybrid 2+1) |  56.295 |    1.080e-02 |    1.209e-05 |    3.053e-05 | 4.520e-07 |
| yours − 3rd step   |  38.502 |    1.879e-01 |    2.480e-04 |    1.169e-03 | 4.737e-07 |
| faded2 (opt)       |  38.899 |    1.879e-01 |    2.480e-04 |    1.169e-03 | 4.737e-07 |
| faded3 (opt)       |  68.299 |    8.955e-03 |    1.152e-05 |    3.146e-05 | 4.419e-07 |
| claude2 (rot)      |  36.109 |    5.183e+01 |    9.692e-01 |    3.275e+01 | 4.495e-07 |
| claude3 (rot)      |  49.230 |    4.905e+01 |    9.457e-01 |    3.277e+01 | 4.370e-07 |

### High ecc (200, 50)

| method             | ns/call | max dist err | avg dist err | p99 dist err |  maxEqErr |
| ------------------ | ------: | -----------: | -----------: | -----------: | --------: |
| yours (hybrid 2+1) |  59.147 |    5.636e-02 |    2.207e-05 |    1.830e-04 | 4.311e-07 |
| yours − 3rd step   |  38.197 |    5.382e-01 |    9.483e-03 |    2.230e-01 | 4.467e-07 |
| faded2 (opt)       |  39.917 |    5.382e-01 |    9.483e-03 |    2.230e-01 | 4.467e-07 |
| faded3 (opt)       |  74.481 |    3.022e-02 |    8.965e-05 |    2.498e-03 | 4.407e-07 |
| claude2 (rot)      |  33.235 |    1.357e+02 |    7.134e+00 |    1.224e+02 | 5.302e-07 |
| claude3 (rot)      |  49.514 |    1.375e+02 |    6.699e+00 |    1.232e+02 | 4.801e-07 |

### Extreme (100, 10)

| method             | ns/call | max dist err | avg dist err | p99 dist err |  maxEqErr |
| ------------------ | ------: | -----------: | -----------: | -----------: | --------: |
| yours (hybrid 2+1) |  58.911 |    1.057e-01 |    2.983e-05 |    3.744e-05 | 4.497e-07 |
| yours − 3rd step   |  38.725 |    5.581e-01 |    1.797e-03 |    4.816e-02 | 4.638e-07 |
| faded2 (opt)       |  40.019 |    5.581e-01 |    1.797e-03 |    4.816e-02 | 4.638e-07 |
| faded3 (opt)       |  71.495 |    1.985e-02 |    3.397e-05 |    7.202e-04 | 4.628e-07 |
| claude2 (rot)      |  33.035 |    8.500e+01 |    5.297e+00 |    7.538e+01 | 5.413e-07 |
| claude3 (rot)      |  49.750 |    8.260e+01 |    4.858e+00 |    7.578e+01 | 4.879e-07 |

### Circle (100, 100)

| method             | ns/call | max dist err | avg dist err | p99 dist err |  maxEqErr |
| ------------------ | ------: | -----------: | -----------: | -----------: | --------: |
| yours (hybrid 2+1) |  55.238 |    3.412e-05 |    5.876e-06 |    1.904e-05 | 4.422e-07 |
| yours − 3rd step   |  39.317 |    3.146e-05 |    3.811e-06 |    1.570e-05 | 3.235e-07 |
| faded2 (opt)       |  39.397 |    3.146e-05 |    3.811e-06 |    1.570e-05 | 3.235e-07 |
| faded3 (opt)       |  70.926 |    3.133e-05 |    4.978e-06 |    1.678e-05 | 3.460e-07 |
| claude2 (rot)      |  34.716 |    3.412e-05 |    5.096e-06 |    1.705e-05 | 4.538e-07 |
| claude3 (rot)      |  49.998 |    3.412e-05 |    5.313e-06 |    1.706e-05 | 4.521e-07 |

Notes you’ll notice from the table:

* **`yours − 3rd step` and `faded2 (opt)` are the same algorithmic idea** (2 iterations of the optimized curvature update), hence the near-identical stats.
* **`claude2/claude3` match the ellipse equation (low maxEqErr) but are not the closest point** for non-circles (huge distance error vs robust reference).

If you want to rerun this on your target machine:
[Download the benchmark I used](sandbox:/mnt/data/ellipse_compare_table.cpp)


#include <cmath>
#include <cstdio>
#include <chrono>
#include <random>
#include <vector>
#include <algorithm>
#include <cstdint>

using Clock = std::chrono::high_resolution_clock;

struct Pt { float x,y; };

static inline float clamp01(float v){ return std::fmin(1.0f, std::fmax(0.0f, v)); }

// -----------------------------------------------------------------------------
// Robust reference (double): Eberly-style monotone root for t, solved by bisection
// Ellipse: (x/a)^2 + (y/b)^2 = 1, centered at origin, axis-aligned.
// Returns closest point on ellipse.
// -----------------------------------------------------------------------------
static inline Pt reference_eberly(float a, float b, float px, float py){
    const double alpha = (double)a * (double)a;
    const double beta  = (double)b * (double)b;

    const double sx = (px < 0.f) ? -1.0 : 1.0;
    const double sy = (py < 0.f) ? -1.0 : 1.0;
    const double x = std::fabs((double)px);
    const double y = std::fabs((double)py);

    if (x == 0.0 && y == 0.0) {
        // Any point on ellipse is equally distant from origin; pick (0,b)
        return {0.0f, (float)(sy * (double)b)};
    }

    auto F = [&](double t){
        const double da = t + alpha;
        const double db = t + beta;
        const double termA = alpha * x * x / (da*da);
        const double termB = beta  * y * y / (db*db);
        return termA + termB - 1.0;
    };

    const double insideTest = (x*x)/alpha + (y*y)/beta;
    const bool outside = insideTest > 1.0;

    double lo, hi;
    if (outside) {
        lo = 0.0;
        hi = 1.0;
        // Expand until F(hi) < 0
        for (int i=0;i<80 && F(hi) > 0.0; ++i) hi *= 2.0;
    } else {
        const double m = std::min(alpha, beta);
        // Avoid singularity at -m
        lo = -m + std::numeric_limits<double>::epsilon()*m;
        hi = 0.0;
    }

    // Bisection
    double t = 0.5*(lo+hi);
    for (int i=0;i<64;++i) {
        t = 0.5*(lo+hi);
        const double f = F(t);
        if (f > 0.0) lo = t;
        else hi = t;
    }

    const double cx = (alpha * x) / (t + alpha);
    const double cy = (beta  * y) / (t + beta);

    return { (float)(sx*cx), (float)(sy*cy) };
}

// -----------------------------------------------------------------------------
// 0xfaded optimized curvature step (r/q combined) - parameterized iters
// -----------------------------------------------------------------------------
template<int ITERS>
static inline Pt faded_opt(float a, float b, float px, float py){
    float px_abs = std::fabs(px), py_abs = std::fabs(py);
    float tx = 0.70710678f, ty = 0.70710678f;
    float a2 = a*a, b2 = b*b;
    float ca = (a2-b2)/a, cb = (b2-a2)/b;

    for (int i=0;i<ITERS;++i){
        float x = a*tx, y = b*ty;
        float tx3 = tx*tx*tx, ty3 = ty*ty*ty;
        float ex = ca*tx3, ey = cb*ty3;
        float rx = x - ex, ry = y - ey;
        float qx = px_abs - ex, qy = py_abs - ey;
        float rq = std::sqrt((rx*rx + ry*ry) / (qx*qx + qy*qy + 1e-30f));
        tx = clamp01((qx * rq + ex) / a);
        ty = clamp01((qy * rq + ey) / b);
        float invt = 1.0f / std::sqrt(tx*tx + ty*ty);
        tx *= invt; ty *= invt;
    }
    return { std::copysign(a*tx, px), std::copysign(b*ty, py) };
}

// -----------------------------------------------------------------------------
// Claude rotation-trick Newton - parameterized iters
// -----------------------------------------------------------------------------
template<int ITERS>
static inline Pt claude_rot(float a, float b, float px, float py){
    float px_abs = std::fabs(px), py_abs = std::fabs(py);
    float a2mb2 = a*a - b*b;

    float nx = px_abs/a, ny = py_abs/b;
    float len = std::sqrt(nx*nx + ny*ny + 1e-10f);
    float c = nx/len, s = ny/len;

    for (int i=0;i<ITERS;++i){
        float f  = a2mb2*s*c - px_abs*a*s + py_abs*b*c;
        float fp = a2mb2*(c*c - s*s) - px_abs*a*c - py_abs*b*s;
        if (std::fabs(fp) < 1e-10f) break;
        float dt = f/fp;
        float nc = c + dt*s;
        float ns = s - dt*c;
        float inv = 1.0f / std::sqrt(nc*nc + ns*ns);
        c = nc*inv;
        s = ns*inv;
    }
    return { std::copysign(a*c, px), std::copysign(b*s, py) };
}

// -----------------------------------------------------------------------------
// "Yours" hybrid: 2 curvature steps + 1 Newton snap
// -----------------------------------------------------------------------------
static inline Pt yours_hybrid(float a, float b, float px, float py){
    float px_abs = std::fabs(px), py_abs = std::fabs(py);
    float a2 = a*a, b2 = b*b;
    float ca = (a2-b2)/a, cb = (b2-a2)/b;
    float a2mb2 = a2 - b2;

    float tx = 0.70710678f, ty = 0.70710678f;

    // 2 curvature iterations
    for (int i=0;i<2;++i){
        float x = a*tx, y = b*ty;
        float tx3 = tx*tx*tx, ty3 = ty*ty*ty;
        float ex = ca*tx3, ey = cb*ty3;
        float rx = x - ex, ry = y - ey;
        float qx = px_abs - ex, qy = py_abs - ey;
        float rq = std::sqrt((rx*rx + ry*ry) / (qx*qx + qy*qy + 1e-30f));
        tx = clamp01((qx * rq + ex) / a);
        ty = clamp01((qy * rq + ey) / b);
        float invt = 1.0f / std::sqrt(tx*tx + ty*ty);
        tx *= invt; ty *= invt;
    }

    // 1 Newton snap using (tx,ty) as (cos,sin)
    float c = tx, s = ty;
    float f  = a2mb2*s*c - px_abs*a*s + py_abs*b*c;
    float fp = a2mb2*(c*c - s*s) - px_abs*a*c - py_abs*b*s;
    if (std::fabs(fp) > 1e-10f) {
        float dt = f/fp;
        float nc = c + dt*s;
        float ns = s - dt*c;
        float inv = 1.0f / std::sqrt(nc*nc + ns*ns);
        c = nc*inv;
        s = ns*inv;
    }

    return { std::copysign(a*c, px), std::copysign(b*s, py) };
}

// "yours minus third step" = just 2 curvature (same as faded_opt<2>)
static inline Pt yours_minus_third(float a, float b, float px, float py){
    return faded_opt<2>(a,b,px,py);
}

volatile float sink;
static inline void escape(Pt p){ sink = p.x + p.y; }

struct Acc {
    double max_dist_err;
    double avg_dist_err;
    double p99_dist_err;
    double max_eq_err;
};

static inline double distf(float x, float y){ return std::sqrt((double)x*(double)x + (double)y*(double)y); }

template<typename F>
static Acc measure_accuracy(F fn, float a, float b, const std::vector<Pt>& pts){
    std::vector<double> errs;
    errs.reserve(pts.size());
    double max_eq = 0.0;

    const double invA = 1.0 / (double)a;
    const double invB = 1.0 / (double)b;

    for (const auto& p: pts){
        Pt got = fn(a,b,p.x,p.y);
        Pt ref = reference_eberly(a,b,p.x,p.y);

        // eq error of got
        double u = (double)got.x * invA;
        double v = (double)got.y * invB;
        double eq = std::fabs(u*u + v*v - 1.0);
        if (std::isfinite(eq)) max_eq = std::max(max_eq, eq);

        double dg = distf(p.x - got.x, p.y - got.y);
        double dr = distf(p.x - ref.x, p.y - ref.y);
        double de = std::fabs(dg - dr);
        if (std::isfinite(de)) errs.push_back(de);
    }

    std::sort(errs.begin(), errs.end());
    double sum = 0.0;
    for (double e: errs) sum += e;

    Acc out{};
    out.max_eq_err = max_eq;
    out.max_dist_err = errs.empty() ? 0.0 : errs.back();
    out.avg_dist_err = errs.empty() ? 0.0 : sum / (double)errs.size();
    out.p99_dist_err = errs.empty() ? 0.0 : errs[(size_t)((errs.size()-1) * 0.99)];
    return out;
}

template<typename F>
static double bench_ns(F fn, float a, float b, const std::vector<Pt>& pts, int runs=9){
    // warmup
    for(int w=0; w<3; ++w) for(const auto& p: pts) escape(fn(a,b,p.x,p.y));
    std::vector<double> times;
    times.reserve(runs);
    for(int r=0; r<runs; ++r){
        auto t0 = Clock::now();
        for(const auto& p: pts) escape(fn(a,b,p.x,p.y));
        auto t1 = Clock::now();
        double ns = std::chrono::duration<double,std::nano>(t1-t0).count();
        times.push_back(ns / (double)pts.size());
    }
    std::sort(times.begin(), times.end());
    return times[times.size()/2];
}

struct Row { const char* name; double ns; Acc acc; };

int main(){
    std::mt19937 rng(12345);
    std::uniform_real_distribution<float> ang(0.0f, 2.0f*(float)M_PI);
    std::uniform_real_distribution<float> rad(0.1f, 3.0f);

    const int N_time = 50000;
    const int N_acc  = 20000;

    struct Cfg { float a,b; const char* name; } cfgs[] = {
        {150,100,"Moderate (150,100)"},
        {200, 50,"High ecc (200,50)"},
        {100, 10,"Extreme (100,10)"},
        {100,100,"Circle (100,100)"},
    };

    auto gen_pts = [&](float a,float b,int N){
        std::vector<Pt> pts; pts.reserve(N);
        for(int i=0;i<N;++i){
            float t = ang(rng);
            float r = rad(rng);
            pts.push_back({a*r*std::cos(t), b*r*std::sin(t)});
        }
        return pts;
    };

    for(const auto& cfg: cfgs){
        auto pts_time = gen_pts(cfg.a,cfg.b,N_time);
        auto pts_acc  = gen_pts(cfg.a,cfg.b,N_acc);

        Row rows[6];

        rows[0] = {"yours (hybrid 2+1)", bench_ns(yours_hybrid, cfg.a,cfg.b, pts_time), measure_accuracy(yours_hybrid, cfg.a,cfg.b, pts_acc)};
        rows[1] = {"yours - 3rd step", bench_ns(yours_minus_third, cfg.a,cfg.b, pts_time), measure_accuracy(yours_minus_third, cfg.a,cfg.b, pts_acc)};
        rows[2] = {"faded2 (opt)", bench_ns(faded_opt<2>, cfg.a,cfg.b, pts_time), measure_accuracy(faded_opt<2>, cfg.a,cfg.b, pts_acc)};
        rows[3] = {"faded3 (opt)", bench_ns(faded_opt<3>, cfg.a,cfg.b, pts_time), measure_accuracy(faded_opt<3>, cfg.a,cfg.b, pts_acc)};
        rows[4] = {"claude2 (rot)", bench_ns(claude_rot<2>, cfg.a,cfg.b, pts_time), measure_accuracy(claude_rot<2>, cfg.a,cfg.b, pts_acc)};
        rows[5] = {"claude3 (rot)", bench_ns(claude_rot<3>, cfg.a,cfg.b, pts_time), measure_accuracy(claude_rot<3>, cfg.a,cfg.b, pts_acc)};

        std::printf("CONFIG %s\n", cfg.name);
        std::printf("name\t\tns_per_call\tmaxDistErr\tavgDistErr\tp99DistErr\tmaxEqErr\n");
        for(const auto& row: rows){
            std::printf("%s\t%.3f\t%.3e\t%.3e\t%.3e\t%.3e\n",
                row.name, row.ns, row.acc.max_dist_err, row.acc.avg_dist_err, row.acc.p99_dist_err, row.acc.max_eq_err);
        }
        std::printf("\n");
    }

    return 0;
}
	Context:
	https://gist.github.com/achille/d1eadf82aa54056b9ded7706e8f56760
	https://chatgpt.com/share/6959ed27-accc-800e-8e87-21aa81e93c07

	Claude Benchmark:
	Config: Moderate (150,100) - CORRECT SOLUTIONS ONLY
	────────────────────────────────────────────────────────────────────────────────
	Method Time max_err wrong/20000
	─────────────────────────────────────────────────────────────────────────────
	faded_opt (2 iter) 30.1 ns 0.18 0 ✅
	chatgpt_2iter 30.1 ns 0.18 0 ✅ (same as faded_opt 2)
	chatgpt_adaptive 35.5 ns 0.018 0 ✅
	chatgpt_hybrid (2+1) 41.4 ns 0.020 0 ✅
	faded_opt (3 iter) 47.6 ns 0.018 0 ✅

	claude_rot (2 iter) 21.2 ns 52 1018 ❌ BROKEN
	claude_rot (3 iter) 33.7 ns 43 909 ❌ BROKEN
	claude_rot (4 iter) 43.8 ns 44 891 ❌ BROKEN


	Chatgpt Benchmark:
	Ran it in my environment (g++ `-O3 -march=native -ffast-math`, `float`), measuring:

	* runtime: `ns/call` (median over 9 runs, 50k points)
	* accuracy: absolute distance error vs a robust double-precision reference (max / avg / p99), plus `maxEqErr = \|(x/a)^2+(y/b)^2-1\|` for the returned point.

	### Moderate (150, 100)

	\| method \| ns/call \| max dist err \| avg dist err \| p99 dist err \| maxEqErr \|
	\| ------------------ \| ------: \| -----------: \| -----------: \| -----------: \| --------: \|
	\| yours (hybrid 2+1) \| 56.295 \| 1.080e-02 \| 1.209e-05 \| 3.053e-05 \| 4.520e-07 \|
	\| yours − 3rd step \| 38.502 \| 1.879e-01 \| 2.480e-04 \| 1.169e-03 \| 4.737e-07 \|
	\| faded2 (opt) \| 38.899 \| 1.879e-01 \| 2.480e-04 \| 1.169e-03 \| 4.737e-07 \|
	\| faded3 (opt) \| 68.299 \| 8.955e-03 \| 1.152e-05 \| 3.146e-05 \| 4.419e-07 \|
	\| claude2 (rot) \| 36.109 \| 5.183e+01 \| 9.692e-01 \| 3.275e+01 \| 4.495e-07 \|
	\| claude3 (rot) \| 49.230 \| 4.905e+01 \| 9.457e-01 \| 3.277e+01 \| 4.370e-07 \|

	### High ecc (200, 50)

	\| method \| ns/call \| max dist err \| avg dist err \| p99 dist err \| maxEqErr \|
	\| ------------------ \| ------: \| -----------: \| -----------: \| -----------: \| --------: \|
	\| yours (hybrid 2+1) \| 59.147 \| 5.636e-02 \| 2.207e-05 \| 1.830e-04 \| 4.311e-07 \|
	\| yours − 3rd step \| 38.197 \| 5.382e-01 \| 9.483e-03 \| 2.230e-01 \| 4.467e-07 \|
	\| faded2 (opt) \| 39.917 \| 5.382e-01 \| 9.483e-03 \| 2.230e-01 \| 4.467e-07 \|
	\| faded3 (opt) \| 74.481 \| 3.022e-02 \| 8.965e-05 \| 2.498e-03 \| 4.407e-07 \|
	\| claude2 (rot) \| 33.235 \| 1.357e+02 \| 7.134e+00 \| 1.224e+02 \| 5.302e-07 \|
	\| claude3 (rot) \| 49.514 \| 1.375e+02 \| 6.699e+00 \| 1.232e+02 \| 4.801e-07 \|

	### Extreme (100, 10)

	\| method \| ns/call \| max dist err \| avg dist err \| p99 dist err \| maxEqErr \|
	\| ------------------ \| ------: \| -----------: \| -----------: \| -----------: \| --------: \|
	\| yours (hybrid 2+1) \| 58.911 \| 1.057e-01 \| 2.983e-05 \| 3.744e-05 \| 4.497e-07 \|
	\| yours − 3rd step \| 38.725 \| 5.581e-01 \| 1.797e-03 \| 4.816e-02 \| 4.638e-07 \|
	\| faded2 (opt) \| 40.019 \| 5.581e-01 \| 1.797e-03 \| 4.816e-02 \| 4.638e-07 \|
	\| faded3 (opt) \| 71.495 \| 1.985e-02 \| 3.397e-05 \| 7.202e-04 \| 4.628e-07 \|
	\| claude2 (rot) \| 33.035 \| 8.500e+01 \| 5.297e+00 \| 7.538e+01 \| 5.413e-07 \|
	\| claude3 (rot) \| 49.750 \| 8.260e+01 \| 4.858e+00 \| 7.578e+01 \| 4.879e-07 \|

	### Circle (100, 100)

	\| method \| ns/call \| max dist err \| avg dist err \| p99 dist err \| maxEqErr \|
	\| ------------------ \| ------: \| -----------: \| -----------: \| -----------: \| --------: \|
	\| yours (hybrid 2+1) \| 55.238 \| 3.412e-05 \| 5.876e-06 \| 1.904e-05 \| 4.422e-07 \|
	\| yours − 3rd step \| 39.317 \| 3.146e-05 \| 3.811e-06 \| 1.570e-05 \| 3.235e-07 \|
	\| faded2 (opt) \| 39.397 \| 3.146e-05 \| 3.811e-06 \| 1.570e-05 \| 3.235e-07 \|
	\| faded3 (opt) \| 70.926 \| 3.133e-05 \| 4.978e-06 \| 1.678e-05 \| 3.460e-07 \|
	\| claude2 (rot) \| 34.716 \| 3.412e-05 \| 5.096e-06 \| 1.705e-05 \| 4.538e-07 \|
	\| claude3 (rot) \| 49.998 \| 3.412e-05 \| 5.313e-06 \| 1.706e-05 \| 4.521e-07 \|

	Notes you’ll notice from the table:

	* `yours − 3rd step` and `faded2 (opt)` are the same algorithmic idea (2 iterations of the optimized curvature update), hence the near-identical stats.
	* `claude2/claude3` match the ellipse equation (low maxEqErr) but are not the closest point for non-circles (huge distance error vs robust reference).

	If you want to rerun this on your target machine:
	[Download the benchmark I used](sandbox:/mnt/data/ellipse_compare_table.cpp)


	#include <cmath>
	#include <cstdio>
	#include <chrono>
	#include <random>
	#include <vector>
	#include <algorithm>
	#include <cstdint>

	using Clock = std::chrono::high_resolution_clock;

	struct Pt { float x,y; };

	static inline float clamp01(float v){ return std::fmin(1.0f, std::fmax(0.0f, v)); }

	// -----------------------------------------------------------------------------
	// Robust reference (double): Eberly-style monotone root for t, solved by bisection
	// Ellipse: (x/a)^2 + (y/b)^2 = 1, centered at origin, axis-aligned.
	// Returns closest point on ellipse.
	// -----------------------------------------------------------------------------
	static inline Pt reference_eberly(float a, float b, float px, float py){
	const double alpha = (double)a * (double)a;
	const double beta = (double)b * (double)b;

	const double sx = (px < 0.f) ? -1.0 : 1.0;
	const double sy = (py < 0.f) ? -1.0 : 1.0;
	const double x = std::fabs((double)px);
	const double y = std::fabs((double)py);

	if (x == 0.0 && y == 0.0) {
	// Any point on ellipse is equally distant from origin; pick (0,b)
	return {0.0f, (float)(sy * (double)b)};
	}

	auto F = [&](double t){
	const double da = t + alpha;
	const double db = t + beta;
	const double termA = alpha * x * x / (da*da);
	const double termB = beta * y * y / (db*db);
	return termA + termB - 1.0;
	};

	const double insideTest = (xx)/alpha + (yy)/beta;
	const bool outside = insideTest > 1.0;

	double lo, hi;
	if (outside) {
	lo = 0.0;
	hi = 1.0;
	// Expand until F(hi) < 0
	for (int i=0;i<80 && F(hi) > 0.0; ++i) hi *= 2.0;
	} else {
	const double m = std::min(alpha, beta);
	// Avoid singularity at -m
	lo = -m + std::numeric_limits<double>::epsilon()*m;
	hi = 0.0;
	}

	// Bisection
	double t = 0.5*(lo+hi);
	for (int i=0;i<64;++i) {
	t = 0.5*(lo+hi);
	const double f = F(t);
	if (f > 0.0) lo = t;
	else hi = t;
	}

	const double cx = (alpha * x) / (t + alpha);
	const double cy = (beta * y) / (t + beta);

	return { (float)(sxcx), (float)(sycy) };
	}

	// -----------------------------------------------------------------------------
	// 0xfaded optimized curvature step (r/q combined) - parameterized iters
	// -----------------------------------------------------------------------------
	template<int ITERS>
	static inline Pt faded_opt(float a, float b, float px, float py){
	float px_abs = std::fabs(px), py_abs = std::fabs(py);
	float tx = 0.70710678f, ty = 0.70710678f;
	float a2 = aa, b2 = bb;
	float ca = (a2-b2)/a, cb = (b2-a2)/b;

	for (int i=0;i<ITERS;++i){
	float x = atx, y = bty;
	float tx3 = txtxtx, ty3 = tytyty;
	float ex = catx3, ey = cbty3;
	float rx = x - ex, ry = y - ey;
	float qx = px_abs - ex, qy = py_abs - ey;
	float rq = std::sqrt((rxrx + ryry) / (qxqx + qyqy + 1e-30f));
	tx = clamp01((qx * rq + ex) / a);
	ty = clamp01((qy * rq + ey) / b);
	float invt = 1.0f / std::sqrt(txtx + tyty);
	tx = invt; ty = invt;
	}
	return { std::copysign(atx, px), std::copysign(bty, py) };
	}

	// -----------------------------------------------------------------------------
	// Claude rotation-trick Newton - parameterized iters
	// -----------------------------------------------------------------------------
	template<int ITERS>
	static inline Pt claude_rot(float a, float b, float px, float py){
	float px_abs = std::fabs(px), py_abs = std::fabs(py);
	float a2mb2 = aa - bb;

	float nx = px_abs/a, ny = py_abs/b;
	float len = std::sqrt(nxnx + nyny + 1e-10f);
	float c = nx/len, s = ny/len;

	for (int i=0;i<ITERS;++i){
	float f = a2mb2sc - px_absas + py_absbc;
	float fp = a2mb2(cc - ss) - px_absac - py_absb*s;
	if (std::fabs(fp) < 1e-10f) break;
	float dt = f/fp;
	float nc = c + dt*s;
	float ns = s - dt*c;
	float inv = 1.0f / std::sqrt(ncnc + nsns);
	c = nc*inv;
	s = ns*inv;
	}
	return { std::copysign(ac, px), std::copysign(bs, py) };
	}

	// -----------------------------------------------------------------------------
	// "Yours" hybrid: 2 curvature steps + 1 Newton snap
	// -----------------------------------------------------------------------------
	static inline Pt yours_hybrid(float a, float b, float px, float py){
	float px_abs = std::fabs(px), py_abs = std::fabs(py);
	float a2 = aa, b2 = bb;
	float ca = (a2-b2)/a, cb = (b2-a2)/b;
	float a2mb2 = a2 - b2;

	float tx = 0.70710678f, ty = 0.70710678f;

	// 2 curvature iterations
	for (int i=0;i<2;++i){
	float x = atx, y = bty;
	float tx3 = txtxtx, ty3 = tytyty;
	float ex = catx3, ey = cbty3;
	float rx = x - ex, ry = y - ey;
	float qx = px_abs - ex, qy = py_abs - ey;
	float rq = std::sqrt((rxrx + ryry) / (qxqx + qyqy + 1e-30f));
	tx = clamp01((qx * rq + ex) / a);
	ty = clamp01((qy * rq + ey) / b);
	float invt = 1.0f / std::sqrt(txtx + tyty);
	tx = invt; ty = invt;
	}

	// 1 Newton snap using (tx,ty) as (cos,sin)
	float c = tx, s = ty;
	float f = a2mb2sc - px_absas + py_absbc;
	float fp = a2mb2(cc - ss) - px_absac - py_absb*s;
	if (std::fabs(fp) > 1e-10f) {
	float dt = f/fp;
	float nc = c + dt*s;
	float ns = s - dt*c;
	float inv = 1.0f / std::sqrt(ncnc + nsns);
	c = nc*inv;
	s = ns*inv;
	}

	return { std::copysign(ac, px), std::copysign(bs, py) };
	}

	// "yours minus third step" = just 2 curvature (same as faded_opt<2>)
	static inline Pt yours_minus_third(float a, float b, float px, float py){
	return faded_opt<2>(a,b,px,py);
	}

	volatile float sink;
	static inline void escape(Pt p){ sink = p.x + p.y; }

	struct Acc {
	double max_dist_err;
	double avg_dist_err;
	double p99_dist_err;
	double max_eq_err;
	};

	static inline double distf(float x, float y){ return std::sqrt((double)x(double)x + (double)y(double)y); }

	template<typename F>
	static Acc measure_accuracy(F fn, float a, float b, const std::vector<Pt>& pts){
	std::vector<double> errs;
	errs.reserve(pts.size());
	double max_eq = 0.0;

	const double invA = 1.0 / (double)a;
	const double invB = 1.0 / (double)b;

	for (const auto& p: pts){
	Pt got = fn(a,b,p.x,p.y);
	Pt ref = reference_eberly(a,b,p.x,p.y);

	// eq error of got
	double u = (double)got.x * invA;
	double v = (double)got.y * invB;
	double eq = std::fabs(uu + vv - 1.0);
	if (std::isfinite(eq)) max_eq = std::max(max_eq, eq);

	double dg = distf(p.x - got.x, p.y - got.y);
	double dr = distf(p.x - ref.x, p.y - ref.y);
	double de = std::fabs(dg - dr);
	if (std::isfinite(de)) errs.push_back(de);
	}

	std::sort(errs.begin(), errs.end());
	double sum = 0.0;
	for (double e: errs) sum += e;

	Acc out{};
	out.max_eq_err = max_eq;
	out.max_dist_err = errs.empty() ? 0.0 : errs.back();
	out.avg_dist_err = errs.empty() ? 0.0 : sum / (double)errs.size();
	out.p99_dist_err = errs.empty() ? 0.0 : errs[(size_t)((errs.size()-1) * 0.99)];
	return out;
	}

	template<typename F>
	static double bench_ns(F fn, float a, float b, const std::vector<Pt>& pts, int runs=9){
	// warmup
	for(int w=0; w<3; ++w) for(const auto& p: pts) escape(fn(a,b,p.x,p.y));
	std::vector<double> times;
	times.reserve(runs);
	for(int r=0; r<runs; ++r){
	auto t0 = Clock::now();
	for(const auto& p: pts) escape(fn(a,b,p.x,p.y));
	auto t1 = Clock::now();
	double ns = std::chrono::duration<double,std::nano>(t1-t0).count();
	times.push_back(ns / (double)pts.size());
	}
	std::sort(times.begin(), times.end());
	return times[times.size()/2];
	}

	struct Row { const char* name; double ns; Acc acc; };

	int main(){
	std::mt19937 rng(12345);
	std::uniform_real_distribution<float> ang(0.0f, 2.0f*(float)M_PI);
	std::uniform_real_distribution<float> rad(0.1f, 3.0f);

	const int N_time = 50000;
	const int N_acc = 20000;

	struct Cfg { float a,b; const char* name; } cfgs[] = {
	{150,100,"Moderate (150,100)"},
	{200, 50,"High ecc (200,50)"},
	{100, 10,"Extreme (100,10)"},
	{100,100,"Circle (100,100)"},
	};

	auto gen_pts = [&](float a,float b,int N){
	std::vector<Pt> pts; pts.reserve(N);
	for(int i=0;i<N;++i){
	float t = ang(rng);
	float r = rad(rng);
	pts.push_back({arstd::cos(t), brstd::sin(t)});
	}
	return pts;
	};

	for(const auto& cfg: cfgs){
	auto pts_time = gen_pts(cfg.a,cfg.b,N_time);
	auto pts_acc = gen_pts(cfg.a,cfg.b,N_acc);

	Row rows[6];

	rows[0] = {"yours (hybrid 2+1)", bench_ns(yours_hybrid, cfg.a,cfg.b, pts_time), measure_accuracy(yours_hybrid, cfg.a,cfg.b, pts_acc)};
	rows[1] = {"yours - 3rd step", bench_ns(yours_minus_third, cfg.a,cfg.b, pts_time), measure_accuracy(yours_minus_third, cfg.a,cfg.b, pts_acc)};
	rows[2] = {"faded2 (opt)", bench_ns(faded_opt<2>, cfg.a,cfg.b, pts_time), measure_accuracy(faded_opt<2>, cfg.a,cfg.b, pts_acc)};
	rows[3] = {"faded3 (opt)", bench_ns(faded_opt<3>, cfg.a,cfg.b, pts_time), measure_accuracy(faded_opt<3>, cfg.a,cfg.b, pts_acc)};
	rows[4] = {"claude2 (rot)", bench_ns(claude_rot<2>, cfg.a,cfg.b, pts_time), measure_accuracy(claude_rot<2>, cfg.a,cfg.b, pts_acc)};
	rows[5] = {"claude3 (rot)", bench_ns(claude_rot<3>, cfg.a,cfg.b, pts_time), measure_accuracy(claude_rot<3>, cfg.a,cfg.b, pts_acc)};

	std::printf("CONFIG %s\n", cfg.name);
	std::printf("name\t\tns_per_call\tmaxDistErr\tavgDistErr\tp99DistErr\tmaxEqErr\n");
	for(const auto& row: rows){
	std::printf("%s\t%.3f\t%.3e\t%.3e\t%.3e\t%.3e\n",
	row.name, row.ns, row.acc.max_dist_err, row.acc.avg_dist_err, row.acc.p99_dist_err, row.acc.max_eq_err);
	}
	std::printf("\n");
	}

	return 0;
	}
No results found