vurtun/anim_sample.c

## anim_sample.c
#include <immintrin.h>  // AVX, includes SSE2-SSE4.2
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>

// Number of animation instance slots; sizes every per-instance array in struct anm_sys.
#define MAX_ANIM            1024
// Presumably the maximum number of distinct clips.
// NOTE(review): the caches in struct anm_sys are sized by MAX_ANIM, not by this value.
#define MAX_ANM_CLIPS       256
// NOTE(review): not referenced by any code visible in this file.
#define BATCH_SIZE          4
// Joint capacity of one cache slot; each cache array holds 2 * MAX_JOINTS floats.
#define MAX_JOINTS          256
// Number of float streams anim_update writes per animation instance.
#define ANIM_TRK_ELM_CNT    7  // x, y, z, scl, rot_x, rot_y, rot_z

// GCC/Clang attribute requesting 32-byte alignment of the declared object,
// which the aligned 256-bit loads and stores in this file require.
#define align32 __attribute__((aligned(32)))

// Animation track types
/* Track kinds; the values index the off[] and scl[] arrays of struct anm_clip. */
enum anm_trks {
  ANM_TRK_POS = 0, /* position: three components (x, y, z) */
  ANM_TRK_ROT = 1, /* rotation: one quaternion packed into 32 bits */
  ANM_TRK_SCL = 2, /* scale: a single uniform factor */
  ANM_TRK_CNT = 3  /* number of track kinds */
};
// Immutable, quantized animation clip.
// Key data is stored as separate streams; the value for joint j of key k lives
// at index k * joint_cnt + j of the corresponding stream (see anim_update).
struct anm_clip {
  int joint_cnt;          // Joints per key; also the stride between consecutive keys
  int frame_cnt;          // Total number of frames
  float off[ANM_TRK_CNT]; // Dequantization offsets, indexed by enum anm_trks (value * scl + off)
  float scl[ANM_TRK_CNT]; // Dequantization scales, indexed by enum anm_trks
  // NOTE(review): align32 on the members below aligns the pointer variables
  // themselves, not the arrays they point to. The aligned loads in
  // anim_update rely on the pointed-to data being suitably aligned.
  struct {
    align32 unsigned short const *pos_x;   // 16-bit quantized position x per key/joint
    align32 unsigned short const *pos_y;   // 16-bit quantized position y per key/joint
    align32 unsigned short const *pos_z;   // 16-bit quantized position z per key/joint
    align32 unsigned const *rot_pos;       // 32-bit packed quaternions, decoded by qdec_avx
    align32 unsigned short const *scl_s;   // 16-bit quantized uniform scale per key/joint
  } keys;
  // Per-track lookup tables indexed by frame number; each entry is the key
  // index to read from the matching stream in `keys`.
  struct {
    align32 unsigned short const *frame_to_key_pos;
    align32 unsigned short const *frame_to_key_rot;
    align32 unsigned short const *frame_to_key_scl;
  } blks;
};
// Per-instance cache of dequantized keys for the two frames being blended.
// Every array holds two slots; slot s of a clip with joint_cnt joints occupies
// indices [s * joint_cnt, (s + 1) * joint_cnt).
struct anm_cache {
  align32 float pos_x[2 * MAX_JOINTS];
  align32 float pos_y[2 * MAX_JOINTS];
  align32 float pos_z[2 * MAX_JOINTS];
  align32 float scl_s[2 * MAX_JOINTS];
  align32 float rot_x[2 * MAX_JOINTS];
  align32 float rot_y[2 * MAX_JOINTS];
  align32 float rot_z[2 * MAX_JOINTS];
  align32 float rot_w[2 * MAX_JOINTS];

  int cached_anim;   // Compared against the instance's clip id in anim_update; anm_init writes -1
  int cached_frame;  // Frame number held in slot `prev_slot`; anim_update treats -1 as "nothing cached"
  int cached_next;   // Frame number held in the other slot (1 - prev_slot)
  int prev_slot;     // Slot (0 or 1) that holds `cached_frame`
};
// Animation system state: structure-of-arrays over MAX_ANIM instance slots
// plus a stack of free slot indices.
struct anm_sys {
  short free_idx_cnt;          // Number of valid entries in free_idx (stack height)
  short free_idx[MAX_ANIM];    // Stack of unused slot indices; anm_add pops, anm_del pushes

  align32 int anim[MAX_ANIM];          // Clip id (index into anm_clips) per slot
  align32 float exact_frame[MAX_ANIM]; // Fractional playback position in frames; read by anim_update
                                       // (presumably advanced by the caller - not written by visible code
                                       // other than anm_add/anm_del resetting it to 0)
  align32 int frame[MAX_ANIM];         // Written by anim_update: clamped integer frame
  align32 int frame_nxt[MAX_ANIM];     // Written by anim_update: frame blended towards
  align32 int frame_cnt[MAX_ANIM];     // Copy of the clip's frame_cnt taken in anm_add
  align32 float t_0[MAX_ANIM];         // Written by anim_update: weight of `frame` (1 - t)
  align32 float t_1[MAX_ANIM];         // Written by anim_update: weight of `frame_nxt` (t clamped to [0, 1])
  struct anm_cache caches[MAX_ANIM];   // Dequantized key cache per slot
};
// File-local instance of the animation system.
static struct anm_sys anm_sys;

/* Clip identifiers; each value is an index into the anm_clips table. */
enum anm_clip_id {
  ANM_CLIP_DEFAULT = 0, /* the built-in default clip */
  ANM_CLIP_CNT          /* number of clip identifiers */
};
// Built-in default clip: 8 joints, a single frame, zero position.
// Quantized position streams (all zero; with off = 0 and scl = 1 they decode to 0).
static const align32 unsigned short anm_def_pos_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const align32 unsigned short anm_def_pos_y[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const align32 unsigned short anm_def_pos_z[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// Packed quaternions: the top two bits are 3 and the three stored components
// are zero, which qdec_avx decodes to (0, 0, 0, 1).
static const align32 unsigned anm_def_rot[] = {
  0xc0000000, 0xc0000000, 0xc0000000, 0xc0000000,
  0xc0000000, 0xc0000000, 0xc0000000, 0xc0000000
};
// Quantized uniform scale of 1 (decodes to 1.0 with off = 0 and scl = 1).
static const align32 unsigned short anm_def_scl[] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
// Frame-to-key tables: every frame maps to key 0.
static const align32 unsigned short anm_def_frame_to_key_pos[] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
static const align32 unsigned short anm_def_frame_to_key_rot[] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
static const align32 unsigned short anm_def_frame_to_key_scl[] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
// Clip descriptor tying the tables above together.
static const struct anm_clip anm_def_clip = {
  .joint_cnt = 8,
  .frame_cnt = 1,
  .off = {0.0f, 0.0f, 0.0f},
  .scl = {1.0f, 1.0f, 1.0f},
  .keys = {
    .pos_x = anm_def_pos_x,
    .pos_y = anm_def_pos_y,
    .pos_z = anm_def_pos_z,
    .rot_pos = anm_def_rot,
    .scl_s = anm_def_scl,
  },
  .blks = {
    .frame_to_key_pos = anm_def_frame_to_key_pos,
    .frame_to_key_rot = anm_def_frame_to_key_rot,
    .frame_to_key_scl = anm_def_frame_to_key_scl,
  },
};
// Clip registry indexed by enum anm_clip_id.
static const struct anm_clip *anm_clips[ANM_CLIP_CNT] = {
  [ANM_CLIP_DEFAULT] = &anm_def_clip,
};
extern void
anm_init(struct anm_sys *ans) {
  ans->free_idx_cnt = MAX_ANIM;
  for (int i = 0; i < MAX_ANIM; ++i) {
    ans->free_idx[i] = MAX_ANIM - i - 1;
  }
  for (int i = 0; i < MAX_ANM_CLIPS; ++i) {
    ans->caches[i].cached_anim = -1;
  }
}
extern int
anm_add(struct anm_sys *ans, enum anm_clip_id clip_id) {
  const struct anm_clip *clip = anm_clips[clip_id];
  int anm = ans->free_idx[--ans->free_idx_cnt];
  ans->anim[anm] = clip_id;
  ans->frame[anm] = 0;
  ans->frame_nxt[anm] = 1;
  ans->exact_frame[anm] = 0.0f;
  ans->frame_cnt[anm] = clip->frame_cnt;
  ans->t_0[anm] = 0.0f;
  ans->t_1[anm] = 0.0f;
  return anm;
}
extern void
anm_del(struct anm_sys *ans, int anm) {
  ans->free_idx[ans->free_idx_cnt++] = anm;
  ans->anim[anm] = 0;
  ans->frame[anm] = 0;
  ans->frame_nxt[anm] = 0;
  ans->exact_frame[anm] = 0.0f;
  ans->frame_cnt[anm] = 0;
  ans->t_0[anm] = 0.0f;
  ans->t_1[anm] = 0.0f;
}
static inline void
qdec_avx(__m256 *qout_x, __m256 *qout_y, __m256 *qout_z, __m256 *qout_w, const unsigned *qin) {
  __m256 half = _mm256_set1_ps(0.707106781f);
  __m256 inv_msk = _mm256_set1_ps(1.0f / 511.0f);
  __m256 one = _mm256_set1_ps(1.0f);
  __m256 zero = _mm256_setzero_ps();

  __m256i q = _mm256_load_si256((__m256i*)qin);
  __m256i top = _mm256_srli_epi32(q, 30);
  __m256i mask = _mm256_set1_epi32(511);

  // Component 0 (highest non-top)
  __m256i mag0 = _mm256_and_si256(q, mask);
  __m256i negbit0 = _mm256_and_si256(_mm256_srli_epi32(q, 9), _mm256_set1_epi32(1));
  q = _mm256_srli_epi32(q, 10);
  __m256 pos_val0 = _mm256_mul_ps(_mm256_cvtepi32_ps(mag0), _mm256_mul_ps(inv_msk, half));
  __m256 sqr = _mm256_fmadd_ps(pos_val0, pos_val0, zero);
  __m256 neg_mask0 = _mm256_cvtepi32_ps(negbit0);
  __m256 val0 = _mm256_blendv_ps(pos_val0, _mm256_sub_ps(zero, pos_val0), neg_mask0);

  // Component 1 (mid non-top)
  __m256i mag1 = _mm256_and_si256(q, mask);
  __m256i negbit1 = _mm256_and_si256(_mm256_srli_epi32(q, 9), _mm256_set1_epi32(1));
  q = _mm256_srli_epi32(q, 10);
  __m256 pos_val1 = _mm256_mul_ps(_mm256_cvtepi32_ps(mag1), _mm256_mul_ps(inv_msk, half));
  sqr = _mm256_fmadd_ps(pos_val1, pos_val1, sqr);
  __m256 neg_mask1 = _mm256_cvtepi32_ps(negbit1);
  __m256 val1 = _mm256_blendv_ps(pos_val1, _mm256_sub_ps(zero, pos_val1), neg_mask1);

  // Component 2 (lowest non-top)
  __m256i mag2 = _mm256_and_si256(q, mask);
  __m256i negbit2 = _mm256_and_si256(_mm256_srli_epi32(q, 9), _mm256_set1_epi32(1));
  q = _mm256_srli_epi32(q, 10);
  __m256 pos_val2 = _mm256_mul_ps(_mm256_cvtepi32_ps(mag2), _mm256_mul_ps(inv_msk, half));
  sqr = _mm256_fmadd_ps(pos_val2, pos_val2, sqr);
  __m256 neg_mask2 = _mm256_cvtepi32_ps(negbit2);
  __m256 val2 = _mm256_blendv_ps(pos_val2, _mm256_sub_ps(zero, pos_val2), neg_mask2);

  // Missing component
  __m256 diff = _mm256_max_ps(_mm256_sub_ps(one, sqr), zero);
  __m256 root = _mm256_sqrt_ps(diff);

  __m256 mask_eq0 = _mm256_castsi256_ps(_mm256_cmpeq_epi32(top, _mm256_set1_epi32(0)));
  __m256 mask_eq1 = _mm256_castsi256_ps(_mm256_cmpeq_epi32(top, _mm256_set1_epi32(1)));
  __m256 mask_eq2 = _mm256_castsi256_ps(_mm256_cmpeq_epi32(top, _mm256_set1_epi32(2)));
  __m256 mask_eq3 = _mm256_castsi256_ps(_mm256_cmpeq_epi32(top, _mm256_set1_epi32(3)));

  // Assign based on top (val0: highest non-top, val1: mid, val2: lowest non-top)
  *qout_x = _mm256_blendv_ps(val2, root, mask_eq0);
  __m256 temp_y = _mm256_blendv_ps(val1, val2, mask_eq0);
  *qout_y = _mm256_blendv_ps(temp_y, root, mask_eq1);
  __m256 temp_z_nonroot = _mm256_blendv_ps(val0, val1, _mm256_or_ps(mask_eq0, mask_eq1));
  *qout_z = _mm256_blendv_ps(temp_z_nonroot, root, mask_eq2);
  *qout_w = _mm256_blendv_ps(val0, root, mask_eq3);
}
extern void
anim_update(float* out, struct anm_sys *ans, int anm_start, int anm_end) {
  {
    // Precompute linear coefficients and update frames for all models
    __m256 zero = _mm256_setzero_ps();
    __m256 one = _mm256_set1_ps(1.0f);
    __m256i one_i = _mm256_set1_epi32(1);
    __m256i zero_i = _mm256_setzero_si256();  // For lower clamp

    for (int i = anm_start; i < anm_end; i += 8) {
      __m256i frame_cnt = _mm256_load_si256((__m256i*)&ans->frame_cnt[i]);
      __m256 exact = _mm256_load_ps(&ans->exact_frame[i]);
      __m256i frame_i = _mm256_castps_si256(exact);  // trunc toward zero
      __m256i frame_cnt_m1 = _mm256_sub_epi32(frame_cnt, one_i);
      __m256 frame_ps = _mm256_cvtepi32_ps(frame_i);
      __m256 frame_cnt_m1_ps = _mm256_cvtepi32_ps(frame_cnt_m1);
      __m256 gt_mask = _mm256_cmp_ps(frame_ps, frame_cnt_m1_ps, _CMP_GT_OQ);
      __m256i clamp_mask_i = _mm256_castps_si256(gt_mask);
      frame_i = _mm256_blendv_epi8(frame_i, frame_cnt_m1, clamp_mask_i);
      __m256 lt_mask = _mm256_cmp_ps(frame_ps, zero, _CMP_LT_OQ);  // < 0?
      __m256i lt_mask_i = _mm256_castps_si256(lt_mask);
      frame_i = _mm256_blendv_epi8(frame_i, zero_i, lt_mask_i);
      __m256 t = _mm256_sub_ps(exact, _mm256_cvtepi32_ps(frame_i));

      __m256i frame_1_tst = _mm256_add_epi32(frame_i, one_i);
      __m256 tst_ps = _mm256_cvtepi32_ps(frame_1_tst);
      __m256 cnt_ps = _mm256_cvtepi32_ps(frame_cnt);
      __m256 ge_mask = _mm256_cmp_ps(tst_ps, cnt_ps, _CMP_GE_OQ);
      __m256i sel_mask_i = _mm256_castps_si256(ge_mask);
      __m256i frame_1 = _mm256_blendv_epi8(frame_1_tst, frame_i, sel_mask_i);

      __m256 t_clamped = _mm256_min_ps(_mm256_max_ps(t, zero), one);

      // Compute linear basis functions
      __m256 h00 = _mm256_sub_ps(one, t_clamped);
      __m256 h01 = t_clamped;

      // Store
      _mm256_store_ps(&ans->t_0[i], h00);
      _mm256_store_ps(&ans->t_1[i], h01);
      _mm256_store_si256((__m256i*)&ans->frame_nxt[i], frame_1);
      _mm256_store_si256((__m256i*)&ans->frame[i], frame_i);
    }
  }
  for (int m = anm_start; m < anm_end; ++m) {
    int anm_id = ans->anim[m];
    const struct anm_clip *clp = anm_clips[anm_id];
    int frame = ans->frame[m];
    int frame_next = ans->frame_nxt[m];

    int key_idx_pos[2] = {clp->blks.frame_to_key_pos[frame], clp->blks.frame_to_key_pos[frame_next]};
    int key_idx_rot[2] = {clp->blks.frame_to_key_rot[frame], clp->blks.frame_to_key_rot[frame_next]};
    int key_idx_scl[2] = {clp->blks.frame_to_key_scl[frame], clp->blks.frame_to_key_scl[frame_next]};

    const __m256 pos_off = _mm256_set1_ps(clp->off[ANM_TRK_POS]);
    const __m256 rot_off = _mm256_set1_ps(clp->off[ANM_TRK_ROT]);
    const __m256 scl_off = _mm256_set1_ps(clp->off[ANM_TRK_SCL]);

    const __m256 pos_scl = _mm256_set1_ps(clp->scl[ANM_TRK_POS]);
    const __m256 rot_scl = _mm256_set1_ps(clp->scl[ANM_TRK_ROT]);
    const __m256 scl_scl = _mm256_set1_ps(clp->scl[ANM_TRK_SCL]);

    __m256 h00 = _mm256_set1_ps(ans->t_0[m]);
    __m256 h01 = _mm256_set1_ps(ans->t_1[m]);

    const int stride = clp->joint_cnt;
    float *out_ptr = out + m * ANIM_TRK_ELM_CNT * stride;

    int anm_mod = ans->caches[m].cached_anim == anm_id;
    int full_hit = (ans->caches[m].cached_frame == frame && ans->caches[m].cached_next == frame_next);
    int advance = !full_hit && (ans->caches[m].cached_frame != -1) && (frame == ans->caches[m].cached_next);

    if (anm_mod || !full_hit) {
       if (!anm_mod && advance) {
        // Double-buffer toggle: Swap slots (new prev = old next; new next = old prev)
        ans->caches[m].prev_slot = 1 - ans->caches[m].prev_slot;
        ans->caches[m].cached_frame = frame;
        ans->caches[m].cached_next = frame_next;

        // Dequant only new frame_next to new next_slot
        int next_slot = 1 - ans->caches[m].prev_slot;  // Now the free slot (old prev)
        int base_pos1 = key_idx_pos[1] * stride;
        int base_scl1 = key_idx_scl[1] * stride;
        int base_rot1 = key_idx_rot[1] * stride;

        for (int jj = 0; jj < clp->joint_cnt; jj += 8) {
          int j1 = jj;  // Batch offset
          // Position frame 1 (new next)
          __m128i shorts1_x = _mm_load_si128((__m128i*)(clp->keys.pos_x + base_pos1 + j1));
          __m256i mag1_x = _mm256_cvtepu16_epi32(shorts1_x);
          __m256 pos1_x = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag1_x), pos_scl), pos_off);
          _mm256_store_ps(&ans->caches[m].pos_x[next_slot * stride + jj], pos1_x);

          __m128i shorts1_y = _mm_load_si128((__m128i*)(clp->keys.pos_y + base_pos1 + j1));
          __m256i mag1_y = _mm256_cvtepu16_epi32(shorts1_y);
          __m256 pos1_y = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag1_y), pos_scl), pos_off);
          _mm256_store_ps(&ans->caches[m].pos_y[next_slot * stride + jj], pos1_y);

          __m128i shorts1_z = _mm_load_si128((__m128i*)(clp->keys.pos_z + base_pos1 + j1));
          __m256i mag1_z = _mm256_cvtepu16_epi32(shorts1_z);
          __m256 pos1_z = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag1_z), pos_scl), pos_off);
          _mm256_store_ps(&ans->caches[m].pos_z[next_slot * stride + jj], pos1_z);

          // Scale frame 1 (new next)
          __m128i shorts_s1 = _mm_load_si128((__m128i*)(clp->keys.scl_s + base_scl1 + j1));
          __m256i mag_s1 = _mm256_cvtepu16_epi32(shorts_s1);
          __m256 scl1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag_s1), scl_scl), scl_off);
          _mm256_store_ps(&ans->caches[m].scl_s[next_slot * stride + jj], scl1);

          // Rotation frame 1 (new next)
          align32 unsigned rot1_data[8];
          __m256i rot1_packed = _mm256_load_si256((__m256i*)(clp->keys.rot_pos + base_rot1 + j1));
          _mm256_store_si256((__m256i*)rot1_data, rot1_packed);
          __m256 rot1_x_raw, rot1_y_raw, rot1_z_raw, rot1_w_raw;
          qdec_avx(&rot1_x_raw, &rot1_y_raw, &rot1_z_raw, &rot1_w_raw, rot1_data);

          __m256 rot1_x = _mm256_add_ps(_mm256_mul_ps(rot1_x_raw, rot_scl), rot_off);
          _mm256_store_ps(&ans->caches[m].rot_x[next_slot * stride + jj], rot1_x);
          __m256 rot1_y = _mm256_add_ps(_mm256_mul_ps(rot1_y_raw, rot_scl), rot_off);
          _mm256_store_ps(&ans->caches[m].rot_y[next_slot * stride + jj], rot1_y);
          __m256 rot1_z = _mm256_add_ps(_mm256_mul_ps(rot1_z_raw, rot_scl), rot_off);
          _mm256_store_ps(&ans->caches[m].rot_z[next_slot * stride + jj], rot1_z);
          __m256 rot1_w = _mm256_add_ps(_mm256_mul_ps(rot1_w_raw, rot_scl), rot_off);
          _mm256_store_ps(&ans->caches[m].rot_w[next_slot * stride + jj], rot1_w);
        }
      } else {
        // Full miss/seek: Dequant both to fixed slots 0 (frame) / 1 (frame_next)
        ans->caches[m].prev_slot = 0;
        ans->caches[m].cached_frame = frame;
        ans->caches[m].cached_next = frame_next;

        // Dequant full pair to cache (slot 0: frame, slot 1: frame_next)
        for (int jj = 0; jj < clp->joint_cnt; jj += 8) {
          int base_pos0 = key_idx_pos[0] * stride + jj;
          int base_pos1 = key_idx_pos[1] * stride + jj;
          int base_scl0 = key_idx_scl[0] * stride + jj;
          int base_scl1 = key_idx_scl[1] * stride + jj;
          int base_rot0 = key_idx_rot[0] * stride + jj;
          int base_rot1 = key_idx_rot[1] * stride + jj;

          // Position frame 0 (slot 0)
          __m128i shorts0_x = _mm_load_si128((__m128i*)(clp->keys.pos_x + base_pos0));
          __m256i mag0_x = _mm256_cvtepu16_epi32(shorts0_x);
          __m256 pos0_x = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag0_x), pos_scl), pos_off);
          _mm256_store_ps(&ans->caches[m].pos_x[0 * stride + jj], pos0_x);

          __m128i shorts0_y = _mm_load_si128((__m128i*)(clp->keys.pos_y + base_pos0));
          __m256i mag0_y = _mm256_cvtepu16_epi32(shorts0_y);
          __m256 pos0_y = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag0_y), pos_scl), pos_off);
          _mm256_store_ps(&ans->caches[m].pos_y[0 * stride + jj], pos0_y);

          __m128i shorts0_z = _mm_load_si128((__m128i*)(clp->keys.pos_z + base_pos0));
          __m256i mag0_z = _mm256_cvtepu16_epi32(shorts0_z);
          __m256 pos0_z = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag0_z), pos_scl), pos_off);
          _mm256_store_ps(&ans->caches[m].pos_z[0 * stride + jj], pos0_z);

          // Position frame 1 (slot 1)
          __m128i shorts1_x = _mm_load_si128((__m128i*)(clp->keys.pos_x + base_pos1));
          __m256i mag1_x = _mm256_cvtepu16_epi32(shorts1_x);
          __m256 pos1_x = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag1_x), pos_scl), pos_off);
          _mm256_store_ps(&ans->caches[m].pos_x[1 * stride + jj], pos1_x);

          __m128i shorts1_y = _mm_load_si128((__m128i*)(clp->keys.pos_y + base_pos1));
          __m256i mag1_y = _mm256_cvtepu16_epi32(shorts1_y);
          __m256 pos1_y = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag1_y), pos_scl), pos_off);
          _mm256_store_ps(&ans->caches[m].pos_y[1 * stride + jj], pos1_y);

          __m128i shorts1_z = _mm_load_si128((__m128i*)(clp->keys.pos_z + base_pos1));
          __m256i mag1_z = _mm256_cvtepu16_epi32(shorts1_z);
          __m256 pos1_z = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag1_z), pos_scl), pos_off);
          _mm256_store_ps(&ans->caches[m].pos_z[1 * stride + jj], pos1_z);

          // Scale frame 0 (slot 0)
          __m128i shorts_s0 = _mm_load_si128((__m128i*)(clp->keys.scl_s + base_scl0));
          __m256i mag_s0 = _mm256_cvtepu16_epi32(shorts_s0);
          __m256 scl0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag_s0), scl_scl), scl_off);
          _mm256_store_ps(&ans->caches[m].scl_s[0 * stride + jj], scl0);

          // Scale frame 1 (slot 1)
          __m128i shorts_s1 = _mm_load_si128((__m128i*)(clp->keys.scl_s + base_scl1));
          __m256i mag_s1 = _mm256_cvtepu16_epi32(shorts_s1);
          __m256 scl1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag_s1), scl_scl), scl_off);
          _mm256_store_ps(&ans->caches[m].scl_s[1 * stride + jj], scl1);

          // Rotation frame 0 (slot 0)
          align32 unsigned rot0_data[8];
          __m256i rot0_packed = _mm256_load_si256((__m256i*)(clp->keys.rot_pos + base_rot0));
          _mm256_store_si256((__m256i*)rot0_data, rot0_packed);
          __m256 rot0_x_raw, rot0_y_raw, rot0_z_raw, rot0_w_raw;
          qdec_avx(&rot0_x_raw, &rot0_y_raw, &rot0_z_raw, &rot0_w_raw, rot0_data);

          __m256 rot0_x = _mm256_add_ps(_mm256_mul_ps(rot0_x_raw, rot_scl), rot_off);
          __m256 rot0_y = _mm256_add_ps(_mm256_mul_ps(rot0_y_raw, rot_scl), rot_off);
          __m256 rot0_z = _mm256_add_ps(_mm256_mul_ps(rot0_z_raw, rot_scl), rot_off);
          __m256 rot0_w = _mm256_add_ps(_mm256_mul_ps(rot0_w_raw, rot_scl), rot_off);

          _mm256_store_ps(&ans->caches[m].rot_x[0 * stride + jj], rot0_x);
          _mm256_store_ps(&ans->caches[m].rot_y[0 * stride + jj], rot0_y);
          _mm256_store_ps(&ans->caches[m].rot_z[0 * stride + jj], rot0_z);
          _mm256_store_ps(&ans->caches[m].rot_w[0 * stride + jj], rot0_w);

          // Rotation frame 1 (slot 1)
          align32 unsigned rot1_data[8];
          __m256i rot1_packed = _mm256_load_si256((__m256i*)(clp->keys.rot_pos + base_rot1));
          _mm256_store_si256((__m256i*)rot1_data, rot1_packed);

          __m256 rot1_x_raw, rot1_y_raw, rot1_z_raw, rot1_w_raw;
          qdec_avx(&rot1_x_raw, &rot1_y_raw, &rot1_z_raw, &rot1_w_raw, rot1_data);
          __m256 rot1_x = _mm256_add_ps(_mm256_mul_ps(rot1_x_raw, rot_scl), rot_off);
          __m256 rot1_y = _mm256_add_ps(_mm256_mul_ps(rot1_y_raw, rot_scl), rot_off);
          __m256 rot1_z = _mm256_add_ps(_mm256_mul_ps(rot1_z_raw, rot_scl), rot_off);
          __m256 rot1_w = _mm256_add_ps(_mm256_mul_ps(rot1_w_raw, rot_scl), rot_off);

          _mm256_store_ps(&ans->caches[m].rot_x[1 * stride + jj], rot1_x);
          _mm256_store_ps(&ans->caches[m].rot_y[1 * stride + jj], rot1_y);
          _mm256_store_ps(&ans->caches[m].rot_z[1 * stride + jj], rot1_z);
          _mm256_store_ps(&ans->caches[m].rot_w[1 * stride + jj], rot1_w);
        }
      }
    }
    int next_slot = 1 - ans->caches[m].prev_slot;
    int prev_idx_base = ans->caches[m].prev_slot * stride;
    int next_idx_base = next_slot * stride;
    float t = ans->exact_frame[m] - ans->frame[m];

    // Joint loop: Use dynamic indices for loads
    for (int j = 0; j < clp->joint_cnt; j += 8) {
      int prev_idx = prev_idx_base + j;
      int next_idx = next_idx_base + j;

      // Position
      __m256 pos0_x = _mm256_load_ps(&ans->caches[m].pos_x[prev_idx]);
      __m256 pos1_x = _mm256_load_ps(&ans->caches[m].pos_x[next_idx]);

      __m256 pos0_y = _mm256_load_ps(&ans->caches[m].pos_y[prev_idx]);
      __m256 pos1_y = _mm256_load_ps(&ans->caches[m].pos_y[next_idx]);

      __m256 pos0_z = _mm256_load_ps(&ans->caches[m].pos_z[prev_idx]);
      __m256 pos1_z = _mm256_load_ps(&ans->caches[m].pos_z[next_idx]);

      // Scale
      __m256 scl0 = _mm256_load_ps(&ans->caches[m].scl_s[prev_idx]);
      __m256 scl1 = _mm256_load_ps(&ans->caches[m].scl_s[next_idx]);

      // Rotation
      __m256 rot0_x = _mm256_load_ps(&ans->caches[m].rot_x[prev_idx]);
      __m256 rot1_x = _mm256_load_ps(&ans->caches[m].rot_x[next_idx]);

      __m256 rot0_y = _mm256_load_ps(&ans->caches[m].rot_y[prev_idx]);
      __m256 rot1_y = _mm256_load_ps(&ans->caches[m].rot_y[next_idx]);

      __m256 rot0_z = _mm256_load_ps(&ans->caches[m].rot_z[prev_idx]);
      __m256 rot1_z = _mm256_load_ps(&ans->caches[m].rot_z[next_idx]);

      __m256 rot0_w = _mm256_load_ps(&ans->caches[m].rot_w[prev_idx]);
      __m256 rot1_w = _mm256_load_ps(&ans->caches[m].rot_w[next_idx]);

      // Linear interpolation for position and scale
      __m256 res_x = _mm256_fmadd_ps(h01, pos1_x, _mm256_mul_ps(h00, pos0_x));
      __m256 res_y = _mm256_fmadd_ps(h01, pos1_y, _mm256_mul_ps(h00, pos0_y));
      __m256 res_z = _mm256_fmadd_ps(h01, pos1_z, _mm256_mul_ps(h00, pos0_z));
      __m256 res_scl = _mm256_fmadd_ps(h01, scl1, _mm256_mul_ps(h00, scl0));

      // ONLERP for rotation
      __m256 zero = _mm256_setzero_ps();
      __m256 one = _mm256_set1_ps(1.0f);
      __m256 tt = _mm256_set1_ps(t);
      __m256 ca = _mm256_add_ps(
          _mm256_add_ps(_mm256_mul_ps(rot0_x, rot1_x), _mm256_mul_ps(rot0_y, rot1_y)),
          _mm256_add_ps(_mm256_mul_ps(rot0_z, rot1_z), _mm256_mul_ps(rot0_w, rot1_w)));

      __m256 mask_neg = _mm256_cmp_ps(ca, zero, _CMP_LT_OS);
      __m256 fabs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
      __m256 d = _mm256_and_ps(ca, fabs_mask);

      __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
      rot1_x = _mm256_xor_ps(rot1_x, _mm256_and_ps(sign_mask, _mm256_castps_si256(mask_neg)));
      rot1_y = _mm256_xor_ps(rot1_y, _mm256_and_ps(sign_mask, _mm256_castps_si256(mask_neg)));
      rot1_z = _mm256_xor_ps(rot1_z, _mm256_and_ps(sign_mask, _mm256_castps_si256(mask_neg)));
      rot1_w = _mm256_xor_ps(rot1_w, _mm256_and_ps(sign_mask, _mm256_castps_si256(mask_neg)));

      // Compute A = 1.0904 + d * (-3.2452 + d * (3.55645 - d * 1.43519))
      __m256 tmp_a1 = _mm256_fmadd_ps(d, _mm256_set1_ps(-1.43519f), _mm256_set1_ps(3.55645f));
      __m256 tmp_a2 = _mm256_fmadd_ps(d, tmp_a1, _mm256_set1_ps(-3.2452f));
      __m256 A = _mm256_fmadd_ps(d, tmp_a2, _mm256_set1_ps(1.0904f));

      // Compute B = 0.848013 + d * (-1.06021 + d * 0.215638)
      __m256 tmp_b1 = _mm256_fmadd_ps(d, _mm256_set1_ps(0.215638f), _mm256_set1_ps(-1.06021f));
      __m256 B = _mm256_fmadd_ps(d, tmp_b1, _mm256_set1_ps(0.848013f));

      __m256 dt05 = _mm256_sub_ps(tt, _mm256_set1_ps(0.5f));
      __m256 dt05_sq = _mm256_mul_ps(dt05, dt05);
      __m256 k = _mm256_fmadd_ps(A, dt05_sq, B);

      __m256 t_m05 = _mm256_sub_ps(tt, _mm256_set1_ps(0.5f));
      __m256 t_m1 = _mm256_sub_ps(tt, one);
      __m256 term = _mm256_mul_ps(tt, _mm256_mul_ps(t_m05, t_m1));
      __m256 ot = _mm256_fmadd_ps(k, term, tt);

      __m256 lt = _mm256_sub_ps(one, ot);
      __m256 res_rot_x = _mm256_fmadd_ps(ot, rot1_x, _mm256_mul_ps(lt, rot0_x));
      __m256 res_rot_y = _mm256_fmadd_ps(ot, rot1_y, _mm256_mul_ps(lt, rot0_y));
      __m256 res_rot_z = _mm256_fmadd_ps(ot, rot1_z, _mm256_mul_ps(lt, rot0_z));
      __m256 res_rot_w = _mm256_fmadd_ps(ot, rot1_w, _mm256_mul_ps(lt, rot0_w));

      // Normalize
      __m256 un = _mm256_fmadd_ps(res_rot_w, res_rot_w,
                _mm256_fmadd_ps(res_rot_z, res_rot_z,
                _mm256_fmadd_ps(res_rot_y, res_rot_y,
                _mm256_mul_ps(res_rot_x, res_rot_x))));
      __m256 us0 = _mm256_rsqrt_ps(un);
      __m256 us1 = _mm256_mul_ps(_mm256_mul_ps(_mm256_set1_ps(0.5f), us0),
                _mm256_sub_ps(_mm256_set1_ps(3.0f),
                _mm256_mul_ps(_mm256_mul_ps(us0, us0), un)));
      res_rot_x = _mm256_mul_ps(res_rot_x, us1);
      res_rot_y = _mm256_mul_ps(res_rot_y, us1);
      res_rot_z = _mm256_mul_ps(res_rot_z, us1);
      res_rot_w = _mm256_mul_ps(res_rot_w, us1);  // FIX: Scale w component too

      // Stream
      _mm256_stream_ps(out_ptr + 0 * stride + j, res_x);
      _mm256_stream_ps(out_ptr + 1 * stride + j, res_y);
      _mm256_stream_ps(out_ptr + 2 * stride + j, res_z);
      _mm256_stream_ps(out_ptr + 3 * stride + j, res_scl);
      _mm256_stream_ps(out_ptr + 4 * stride + j, res_rot_x);
      _mm256_stream_ps(out_ptr + 5 * stride + j, res_rot_y);
      _mm256_stream_ps(out_ptr + 6 * stride + j, res_rot_z);
    }
  }
  _mm_sfence(); // Ensure all streaming stores complete
}

## gltf2c_anim.c
#include "cgltf.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <assert.h>
#include <float.h>
#include <stdbool.h>
#include <limits.h>
#include <stdint.h>

// Capacity limits for the static working buffers declared below.
#define MAX_JOINTS 256           // joints per sampled frame (sizes the g_pos_*/g_rot_*/g_keys_* buffers)
#define MAX_ALL_JOINTS 1024      // sizes g_parents and g_globals
#define MAX_FRAMES (16*1024)     // frames per animation
#define DIM_MAX 4                // NOTE(review): not referenced in the visible part of this file
#define MAT4_SIZE 16             // floats in one 4x4 matrix

// Fixed-size bitset with one bit per frame.
typedef struct {
  uint64_t bits[MAX_FRAMES/64];  // MAX_FRAMES / 64 = 256 words of 64 bits = 16384 bits
} bitset_t;

// Pair of integers describing a range; used as the element type of g_stack.
// NOTE(review): whether `end` is inclusive cannot be told from the visible code.
typedef struct {
  int start;
  int end;
} interval_t;

// Translation / rotation / scale triple; presumably the rotation is a
// quaternion in (x, y, z, w) order like the rest of this file - confirm.
typedef struct {
  float tx, ty, tz;
  float rx, ry, rz, rw;
  float sx, sy, sz;
} transform_t;

// 4x4 float matrix stored as 16 contiguous floats. The helpers below keep the
// translation in elements 12..14, i.e. column-major storage.
typedef float mat4[16];
// Static working buffers. The purposes noted here are inferred from names and
// sizes only (NOTE(review): confirm against the code that fills them).
// Per-joint, per-frame sampled tracks, one array per component.
static float g_pos_x[MAX_JOINTS * MAX_FRAMES];
static float g_pos_y[MAX_JOINTS * MAX_FRAMES];
static float g_pos_z[MAX_JOINTS * MAX_FRAMES];
static float g_rot_x[MAX_JOINTS * MAX_FRAMES];
static float g_rot_y[MAX_JOINTS * MAX_FRAMES];
static float g_rot_z[MAX_JOINTS * MAX_FRAMES];
static float g_rot_w[MAX_JOINTS * MAX_FRAMES];
static float g_scl_s[MAX_JOINTS * MAX_FRAMES];

// One bit per frame and track kind.
static bitset_t g_used_pos_bits;
static bitset_t g_used_rot_bits;
static bitset_t g_used_scl_bits;

// Per-track lists with room for one entry per frame.
static int g_key_pos_list[MAX_FRAMES];
static int g_key_rot_list[MAX_FRAMES];
static int g_key_scl_list[MAX_FRAMES];

// Per-track frame -> key index tables (16-bit entries).
static unsigned short g_frame_to_key_pos[MAX_FRAMES];
static unsigned short g_frame_to_key_rot[MAX_FRAMES];
static unsigned short g_frame_to_key_scl[MAX_FRAMES];

// Scratch arrays sized per frame.
static int g_kept[MAX_FRAMES];
static float g_tmp_pos[3 * MAX_FRAMES];
static float g_tmp_rot[4 * MAX_FRAMES];
static float g_tmp_scl[MAX_FRAMES];

// Quantized key streams: 16-bit positions and scale, 32-bit rotations.
static unsigned short g_keys_pos_x[MAX_FRAMES * MAX_JOINTS];
static unsigned short g_keys_pos_y[MAX_FRAMES * MAX_JOINTS];
static unsigned short g_keys_pos_z[MAX_FRAMES * MAX_JOINTS];
static unsigned short g_keys_scl[MAX_FRAMES * MAX_JOINTS];
static unsigned g_keys_rot[MAX_FRAMES * MAX_JOINTS];

// Skeleton bookkeeping: one 4x4 matrix (MAT4_SIZE floats) per entry of g_globals.
static int g_bone_indices[MAX_JOINTS];
static int g_parents[MAX_ALL_JOINTS];
static float g_globals[MAX_ALL_JOINTS * MAT4_SIZE];
static interval_t g_stack[MAX_FRAMES];

/*
 * 32-bit FNV-1a hash of a NUL-terminated string.
 *
 * str: string to hash; must not be NULL. The terminator is not hashed.
 * Returns the hash; the empty string yields the offset basis 0x811c9dc5.
 */
static uint32_t
fnv1a_32(const char *str) {
  const uint32_t offset_basis = 0x811c9dc5u;
  const uint32_t prime = 0x01000193u;

  uint32_t h = offset_basis;
  for (const unsigned char *p = (const unsigned char *)str; *p != '\0'; ++p) {
    /* xor the byte in first, then multiply: the "1a" ordering */
    h = (h ^ *p) * prime;
  }
  return h;
}
/*
 * Writes the 4x4 identity matrix to `m`.
 *
 * m: destination of 16 floats (a mat4).
 *
 * The parameter is spelled as a restrict-qualified pointer: applying
 * `restrict` to the array typedef would qualify the float elements, which C
 * does not allow.
 */
static void
mat4_set_identity(float *restrict m) {
  for (int i = 0; i < 16; ++i) {
    /* diagonal elements sit at indices 0, 5, 10 and 15 */
    m[i] = (i % 5 == 0) ? 1.0f : 0.0f;
  }
}
/*
 * Multiplies two 4x4 matrices: out[i*4+j] = sum over k of a[i*4+k] * b[k*4+j].
 *
 * With the column-major storage used by the other mat4 helpers this is the
 * matrix product B * A (a is applied first, then b).
 *
 * a, b: inputs of 16 floats each.
 * out:  result of 16 floats; must not overlap `a` or `b`.
 *
 * Parameters are restrict-qualified pointers; `restrict` on the array typedef
 * would qualify the float elements, which C does not allow.
 */
static void
mat4_mul(const float *restrict a, const float *restrict b, float *restrict out) {
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
      float acc = 0.0f;
      for (int k = 0; k < 4; ++k) {
        acc += a[i * 4 + k] * b[k * 4 + j];
      }
      out[i * 4 + j] = acc;
    }
  }
}
/*
 * Inverts an affine 4x4 matrix (column-major, translation in m[12..14]).
 *
 * Only the upper-left 3x3 block and the translation are read; the result's
 * bottom row is always (0, 0, 0, 1). When the 3x3 block is singular
 * (|det| < 1e-8) the identity matrix is written instead.
 *
 * m:   input of 16 floats.
 * out: result of 16 floats; must not overlap `m`.
 *
 * Parameters are restrict-qualified pointers; `restrict` on the array typedef
 * would qualify the float elements, which C does not allow.
 */
static void
mat4_invert(const float *restrict m, float *restrict out) {
  const float det = m[0] * (m[5] * m[10] - m[9] * m[6]) -
                    m[4] * (m[1] * m[10] - m[9] * m[2]) +
                    m[8] * (m[1] * m[6] - m[5] * m[2]);

  if (fabsf(det) < 1e-8f) {
    /* singular: fall back to the identity */
    for (int i = 0; i < 16; ++i) {
      out[i] = (i % 5 == 0) ? 1.0f : 0.0f;
    }
    return;
  }
  const float idet = 1.0f / det;

  /* inverse of the 3x3 block: adjugate scaled by 1 / det */
  out[0] = (m[5] * m[10] - m[6] * m[9]) * idet;
  out[1] = (m[9] * m[2] - m[1] * m[10]) * idet;
  out[2] = (m[1] * m[6] - m[5] * m[2]) * idet;
  out[4] = (m[6] * m[8] - m[4] * m[10]) * idet;
  out[5] = (m[0] * m[10] - m[8] * m[2]) * idet;
  out[6] = (m[4] * m[2] - m[0] * m[6]) * idet;
  out[8] = (m[4] * m[9] - m[5] * m[8]) * idet;
  out[9] = (m[8] * m[1] - m[0] * m[9]) * idet;
  out[10] = (m[0] * m[5] - m[4] * m[1]) * idet;

  out[3] = out[7] = out[11] = 0.0f;
  out[15] = 1.0f;

  /* inverse translation: -(R^-1 * t) */
  out[12] = -(out[0] * m[12] + out[4] * m[13] + out[8] * m[14]);
  out[13] = -(out[1] * m[12] + out[5] * m[13] + out[9] * m[14]);
  out[14] = -(out[2] * m[12] + out[6] * m[13] + out[10] * m[14]);
}
/*
 * Builds a column-major 4x4 matrix from translation, rotation and scale
 * (scale applied first, then rotation, then translation).
 *
 * pos: translation (x, y, z).
 * rot: rotation quaternion (x, y, z, w); expected to be unit length.
 * scl: per-axis scale (x, y, z).
 * out: result of 16 floats; must not overlap the inputs.
 *
 * Parameters are restrict-qualified pointers; `restrict` written before the
 * declarator would qualify the float elements, which C does not allow.
 */
static void
mat4_from_trs(const float *restrict pos, const float *restrict rot,
              const float *restrict scl, float *restrict out) {
  const float x = rot[0];
  const float y = rot[1];
  const float z = rot[2];
  const float w = rot[3];

  const float xx = x * x, xy = x * y, xz = x * z, xw = x * w;
  const float yy = y * y, yz = y * z, yw = y * w;
  const float zz = z * z, zw = z * w;

  /* first column: rotated x axis scaled by scl[0] */
  out[0] = (1.0f - 2.0f * (yy + zz)) * scl[0];
  out[1] = (2.0f * (xy + zw)) * scl[0];
  out[2] = (2.0f * (xz - yw)) * scl[0];
  out[3] = 0.0f;

  /* second column: rotated y axis scaled by scl[1] */
  out[4] = (2.0f * (xy - zw)) * scl[1];
  out[5] = (1.0f - 2.0f * (xx + zz)) * scl[1];
  out[6] = (2.0f * (yz + xw)) * scl[1];
  out[7] = 0.0f;

  /* third column: rotated z axis scaled by scl[2] */
  out[8] = (2.0f * (xz + yw)) * scl[2];
  out[9] = (2.0f * (yz - xw)) * scl[2];
  out[10] = (1.0f - 2.0f * (xx + yy)) * scl[2];
  out[11] = 0.0f;

  /* fourth column: translation */
  out[12] = pos[0];
  out[13] = pos[1];
  out[14] = pos[2];
  out[15] = 1.0f;
}
static void
mat4_decompose(const mat4 restrict m, float restrict pos[3],
              float restrict rot[4], float restrict scl[3]) {

  pos[0] = m[12];
  pos[1] = m[13];
  pos[2] = m[14];

  scl[0] = sqrtf(m[0] * m[0] + m[1] * m[1] + m[2] * m[2]);
  scl[1] = sqrtf(m[4] * m[4] + m[5] * m[5] + m[6] * m[6]);
  scl[2] = sqrtf(m[8] * m[8] + m[9] * m[9] + m[10] * m[10]);

  if (fabsf(scl[0]) < 1e-8f) {
    scl[0] = 1.0f;
  }
  if (fabsf(scl[1]) < 1e-8f) {
    scl[1] = 1.0f;
  }
  if (fabsf(scl[2]) < 1e-8f) {
    scl[2] = 1.0f;
  }
  float r[3][3];
  r[0][0] = m[0] / scl[0]; r[0][1] = m[4] / scl[1]; r[0][2] = m[8] / scl[2];
  r[1][0] = m[1] / scl[0]; r[1][1] = m[5] / scl[1]; r[1][2] = m[9] / scl[2];
  r[2][0] = m[2] / scl[0]; r[2][1] = m[6] / scl[1]; r[2][2] = m[10] / scl[2];

  float trace = r[0][0] + r[1][1] + r[2][2];
  if (trace > 0.0f) {
    float s = 0.5f / sqrtf(trace + 1.0f);
    rot[3] = 0.5f / s;
    rot[0] = (r[2][1] - r[1][2]) * s;
    rot[1] = (r[0][2] - r[2][0]) * s;
    rot[2] = (r[1][0] - r[0][1]) * s;
  } else if (r[0][0] > r[1][1] && r[0][0] > r[2][2]) {
    float s = 0.5f / sqrtf(1.0f + r[0][0] - r[1][1] - r[2][2]);
    rot[0] = 0.5f / s;
    rot[1] = (r[1][0] + r[0][1]) * s;
    rot[2] = (r[0][2] + r[2][0]) * s;
    rot[3] = (r[2][1] - r[1][2]) * s;
  } else if (r[1][1] > r[2][2]) {
    float s = 0.5f / sqrtf(1.0f + r[1][1] - r[0][0] - r[2][2]);
    rot[0] = (r[1][0] + r[0][1]) * s;
    rot[1] = 0.5f / s;
    rot[2] = (r[1][2] + r[2][1]) * s;
    rot[3] = (r[0][2] - r[2][0]) * s;
  } else {
    float s = 0.5f / sqrtf(1.0f + r[2][2] - r[0][0] - r[1][1]);
    rot[0] = (r[0][2] + r[2][0]) * s;
    rot[1] = (r[1][2] + r[2][1]) * s;
    rot[2] = 0.5f / s;
    rot[3] = (r[1][0] - r[0][1]) * s;
  }
  float len = sqrtf(rot[0] * rot[0] + rot[1] * rot[1] + rot[2] * rot[2] + rot[3] * rot[3]);
  if (len > 1e-8f) {
    float inv_len = 1.0f / len;
    rot[0] *= inv_len;
    rot[1] *= inv_len;
    rot[2] *= inv_len;
    rot[3] *= inv_len;
  }
}
static void
slerp(const float restrict *q0, const float restrict *q1,
      float t, float restrict *qout) {

  float q1_flip[4] = { q1[0], q1[1], q1[2], q1[3] };
  float dot = q0[0] * q1[0] + q0[1] * q1[1] + q0[2] * q1[2] + q0[3] * q1[3];
  if (dot < 0.0f) {
    q1_flip[0] = -q1[0];
    q1_flip[1] = -q1[1];
    q1_flip[2] = -q1[2];
    q1_flip[3] = -q1[3];
    dot = -dot;
  }
  if (dot > 0.9995f) {
    qout[0] = q0[0] + t * (q1_flip[0] - q0[0]);
    qout[1] = q0[1] + t * (q1_flip[1] - q0[1]);
    qout[2] = q0[2] + t * (q1_flip[2] - q0[2]);
    qout[3] = q0[3] + t * (q1_flip[3] - q0[3]);

  } else {
    float omega = acosf(dot);
    float so = sinf(omega);
    float a = sinf(omega * (1.0f - t)) / so;
    float b = sinf(omega * t) / so;

    qout[0] = a * q0[0] + b * q1_flip[0];
    qout[1] = a * q0[1] + b * q1_flip[1];
    qout[2] = a * q0[2] + b * q1_flip[2];
    qout[3] = a * q0[3] + b * q1_flip[3];
  }
  float norm = sqrtf(qout[0] * qout[0] + qout[1] * qout[1] + qout[2] * qout[2] + qout[3] * qout[3]);
  if (norm > 0.0f) {
    float inv_norm = 1.0f / norm;
    qout[0] *= inv_norm;
    qout[1] *= inv_norm;
    qout[2] *= inv_norm;
    qout[3] *= inv_norm;
  }
}
/*
 * Sample the three-component channel of `anim` that targets `node` with the
 * given `path` at `time`, writing the linearly interpolated value to *x/*y/*z.
 *
 * - If several channels match, a warning is printed and the last one wins.
 * - If no channel matches, or its time accessor is empty, the neutral value
 *   is returned: 1 on every axis for CGLTF_TARGET_PATH_SCALE, 0 otherwise.
 * - `time` is clamped to the first/last key time before the lookup.
 *
 * x, y and z must point to distinct floats.
 */
static void
eval_vec3(const cgltf_animation *anim, const cgltf_node *node,
          double time, cgltf_target_path_type path,
          float *restrict x, float *restrict y, float *restrict z) {

  /* Value reported when there is nothing to sample. */
  const float neutral = (path == CGLTF_TARGET_PATH_SCALE) ? 1.0f : 0.0f;

  cgltf_channel *chan = NULL;
  for (size_t ci = 0; ci < anim->channels_count; ++ci) {
    cgltf_channel *c = &anim->channels[ci];
    if (c->target_node == node && c->target_path == path) {
      if (chan) {
        fprintf(stderr, "Multiple channels for %s on node %s\n",
                (path == CGLTF_TARGET_PATH_TRANSLATION ? "translation" : "scale"),
                node->name ? node->name : "unnamed");
      }
      chan = c;
    }
  }
  if (!chan) {
    *x = neutral;
    *y = neutral;
    *z = neutral;
    return;
  }
  cgltf_sampler *samp = &anim->samplers[chan->sampler_index];
  cgltf_accessor *times_acc = samp->input;
  cgltf_accessor *data_acc = samp->output;
  if (times_acc->count == 0) {
    *x = neutral;
    *y = neutral;
    *z = neutral;
    return;
  }
  float tmin_f = 0.0f, tmax_f = 0.0f;
  cgltf_accessor_read_float(times_acc, 0, 0, &tmin_f);
  cgltf_accessor_read_float(times_acc, times_acc->count - 1, 0, &tmax_f);

  double tmin = (double)tmin_f;
  double tmax = (double)tmax_f;

  time = fmax(time, tmin);
  time = fmin(time, tmax);

  /* Find i such that key[i] <= time, searching for the first key > time. */
  size_t count = times_acc->count;
  size_t i = 0;
  if (count > 1) {
    size_t low = 0, high = count - 1;
    while (low < high) {
      size_t mid = low + (high - low) / 2;

      float tmid_f = 0.0f;
      cgltf_accessor_read_float(times_acc, mid, 0, &tmid_f);
      if ((double)tmid_f <= time) {
        low = mid + 1;
      } else {
        high = mid;
      }
    }
    /* low never exceeds count - 1, so only the low == 0 case needs a guard
     * against unsigned wraparound. */
    i = (low > 0) ? low - 1 : 0;
  }
  float t0_f = 0.0f;
  cgltf_accessor_read_float(times_acc, i, 0, &t0_f);

  double t0 = (double)t0_f;
  bool at_end = (i == count - 1);
  float factor = 0.0f;
  if (!at_end) {
    float t1_f = t0_f;
    cgltf_accessor_read_float(times_acc, i + 1, 0, &t1_f);
    double span = (double)t1_f - t0;
    /* Coincident key times would divide by zero; keep the earlier key. */
    if (span > 0.0) {
      factor = (float)((time - t0) / span);
    }
  }

  float vx0 = neutral, vy0 = neutral, vz0 = neutral;
  cgltf_accessor_read_float(data_acc, i, 0, &vx0);
  cgltf_accessor_read_float(data_acc, i, 1, &vy0);
  cgltf_accessor_read_float(data_acc, i, 2, &vz0);

  if (!at_end) {
    float vx1 = vx0, vy1 = vy0, vz1 = vz0;
    cgltf_accessor_read_float(data_acc, i + 1, 0, &vx1);
    cgltf_accessor_read_float(data_acc, i + 1, 1, &vy1);
    cgltf_accessor_read_float(data_acc, i + 1, 2, &vz1);

    vx0 += factor * (vx1 - vx0);
    vy0 += factor * (vy1 - vy0);
    vz0 += factor * (vz1 - vz0);
  }
  *x = vx0;
  *y = vy0;
  *z = vz0;
}
/*
 * Sample the rotation channel of `anim` that targets `node` at `time` and
 * write the resulting quaternion to *x, *y, *z, *w.
 *
 * - If several rotation channels match, a warning is printed and the last one
 *   wins.
 * - If no channel matches, or its time accessor is empty, the identity
 *   quaternion (0, 0, 0, 1) is returned.
 * - `time` is clamped to the first/last key time; between two keys the result
 *   is produced by slerp().
 *
 * x, y, z and w must point to distinct floats.
 */
static void
eval_quat(const cgltf_animation *anim, const cgltf_node *node,
          double time, float *restrict x, float *restrict y,
          float *restrict z, float *restrict w) {

  cgltf_channel *chan = NULL;
  for (size_t ci = 0; ci < anim->channels_count; ++ci) {
    cgltf_channel *c = &anim->channels[ci];
    if (c->target_node == node && c->target_path == CGLTF_TARGET_PATH_ROTATION) {
      if (chan) {
        fprintf(stderr, "Multiple rotation channels on node %s\n", node->name ? node->name : "unnamed");
      }
      chan = c;
    }
  }
  if (!chan) {
    *x = 0.0f;
    *y = 0.0f;
    *z = 0.0f;
    *w = 1.0f;
    return;
  }
  cgltf_sampler *samp = &anim->samplers[chan->sampler_index];
  cgltf_accessor *times_acc = samp->input;
  cgltf_accessor *data_acc = samp->output;
  if (times_acc->count == 0) {
    *x = 0.0f;
    *y = 0.0f;
    *z = 0.0f;
    *w = 1.0f;
    return;
  }
  float tmin_f = 0.0f, tmax_f = 0.0f;
  cgltf_accessor_read_float(times_acc, 0, 0, &tmin_f);
  cgltf_accessor_read_float(times_acc, times_acc->count - 1, 0, &tmax_f);

  double tmin = (double)tmin_f;
  double tmax = (double)tmax_f;

  time = fmax(time, tmin);
  time = fmin(time, tmax);

  /* Find i such that key[i] <= time, searching for the first key > time. */
  size_t count = times_acc->count;
  size_t i = 0;
  if (count > 1) {
    size_t low = 0, high = count - 1;
    while (low < high) {
      size_t mid = low + (high - low) / 2;

      float tmid_f = 0.0f;
      cgltf_accessor_read_float(times_acc, mid, 0, &tmid_f);
      if ((double)tmid_f <= time) {
        low = mid + 1;
      } else {
        high = mid;
      }
    }
    /* low never exceeds count - 1, so only the low == 0 case needs a guard
     * against unsigned wraparound. */
    i = (low > 0) ? low - 1 : 0;
  }
  float t0_f = 0.0f;
  cgltf_accessor_read_float(times_acc, i, 0, &t0_f);

  double t0 = (double)t0_f;
  bool at_end = (i == count - 1);
  float factor = 0.0f;
  if (!at_end) {
    float t1_f = t0_f;
    cgltf_accessor_read_float(times_acc, i + 1, 0, &t1_f);
    double span = (double)t1_f - t0;
    /* Coincident key times would divide by zero; keep the earlier key. */
    if (span > 0.0) {
      factor = (float)((time - t0) / span);
    }
  }

  float q0[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
  cgltf_accessor_read_float(data_acc, i, 0, &q0[0]);
  cgltf_accessor_read_float(data_acc, i, 1, &q0[1]);
  cgltf_accessor_read_float(data_acc, i, 2, &q0[2]);
  cgltf_accessor_read_float(data_acc, i, 3, &q0[3]);

  if (!at_end) {
    float q1[4] = { q0[0], q0[1], q0[2], q0[3] };
    cgltf_accessor_read_float(data_acc, i + 1, 0, &q1[0]);
    cgltf_accessor_read_float(data_acc, i + 1, 1, &q1[1]);
    cgltf_accessor_read_float(data_acc, i + 1, 2, &q1[2]);
    cgltf_accessor_read_float(data_acc, i + 1, 3, &q1[3]);

    float qout[4];
    slerp(q0, q1, factor, qout);

    q0[0] = qout[0];
    q0[1] = qout[1];
    q0[2] = qout[2];
    q0[3] = qout[3];
  }
  *x = q0[0];
  *y = q0[1];
  *z = q0[2];
  *w = q0[3];
}
/*
 * Evaluate translation, scale and rotation of `node` under `anim` at `time`
 * and store them in *xf. The fields are first reset to an identity transform
 * and then overwritten by the three channel evaluators.
 */
static void
eval_transform(const cgltf_animation *anim, const cgltf_node *node,
               double time, transform_t *xf) {

  /* Identity defaults: zero translation, unit quaternion, unit scale. */
  xf->tx = xf->ty = xf->tz = 0.0f;
  xf->rx = xf->ry = xf->rz = 0.0f;
  xf->rw = 1.0f;
  xf->sx = xf->sy = xf->sz = 1.0f;

  /* Translation, then scale, then rotation. */
  eval_vec3(anim, node, time, CGLTF_TARGET_PATH_TRANSLATION,
            &xf->tx, &xf->ty, &xf->tz);
  eval_vec3(anim, node, time, CGLTF_TARGET_PATH_SCALE,
            &xf->sx, &xf->sy, &xf->sz);
  eval_quat(anim, node, time,
            &xf->rx, &xf->ry, &xf->rz, &xf->rw);
}
/* Set bit number `idx` in `bs`; bits are packed 64 per word. */
static inline void
set_bit(bitset_t *bs, size_t idx) {
  const size_t word = idx / 64;
  const size_t shift = idx % 64;
  bs->bits[word] |= (1ULL << shift);
}
/* Return true when bit number `idx` is set in `bs` (64 bits per word). */
static inline bool
test_bit(const bitset_t *bs, size_t idx) {
  const size_t word = idx / 64;
  const size_t shift = idx % 64;
  return (bs->bits[word] & (1ULL << shift)) != 0;
}
/*
 * qsort comparator for ints in ascending order.
 * Returns -1, 0 or 1. The two comparisons are used instead of a subtraction
 * because `a - b` overflows (undefined behaviour) for operands far apart.
 */
static int
cmp_int(const void *a, const void *b) {
  const int lhs = *(const int *)a;
  const int rhs = *(const int *)b;
  return (lhs > rhs) - (lhs < rhs);
}
/*
 * Ramer-Douglas-Peucker simplification of a sampled curve.
 *
 * data      : num * dim floats, sample i occupying data[i*dim .. i*dim+dim-1].
 * num       : number of samples.
 * dim       : components per sample (the scratch array below holds DIM_MAX).
 * eps       : maximum allowed distance between a dropped sample and the
 *             segment joining the kept samples around it.
 * kept      : receives the indices of the kept samples, ascending and unique.
 * count_out : receives the number of indices written to `kept`.
 *
 * Recursion is replaced by an explicit stack of intervals in g_stack.
 * kept and count_out must not overlap.
 */
static void
rdp_simplify(const float *data, int num, int dim, float eps,
             int *restrict kept, int *restrict count_out) {

  if (num <= 0) {
    *count_out = 0;
    return;
  }
  if (num == 1) {
    kept[0] = 0;
    *count_out = 1;
    return;
  }
  int stack_top = 0;
  g_stack[stack_top++] = (interval_t){0, num - 1};
  int kept_size = 0;

  while (stack_top > 0) {
    interval_t iv = g_stack[--stack_top];
    int start = iv.start;
    int end = iv.end;

    if (end - start < 1) {
      if (kept_size == 0 || kept[kept_size - 1] != start) {
        kept[kept_size++] = start;
      }
      continue;
    }
    /* Chord from the first to the last sample of the interval. */
    float dx[DIM_MAX];
    float len2 = 0.0f;
    for (int d = 0; d < dim; ++d) {
      dx[d] = data[end * dim + d] - data[start * dim + d];
      len2 += dx[d] * dx[d];
    }
    /*
     * A zero-length chord (first and last sample identical, e.g. a looping
     * clip) must not discard the samples in between unseen. With
     * inv_len2 = 0 the projection parameter t stays 0, so every distance is
     * measured to the start sample.
     */
    float inv_len2 = (len2 > 0.0f) ? 1.0f / len2 : 0.0f;

    float max_dist2 = 0.0f;
    int max_i = start;
    for (int i = start + 1; i < end; ++i) {
      float t_num = 0.0f;
      for (int d = 0; d < dim; ++d) {
        t_num += (data[i * dim + d] - data[start * dim + d]) * dx[d];
      }
      /* Clamp so the distance is to the segment, not the infinite line. */
      float t = fmaxf(0.0f, fminf(1.0f, t_num * inv_len2));
      float dist2 = 0.0f;
      for (int d = 0; d < dim; ++d) {
        float proj = data[start * dim + d] + t * dx[d];
        float dif = data[i * dim + d] - proj;
        dist2 += dif * dif;
      }
      if (dist2 > max_dist2) {
        max_dist2 = dist2;
        max_i = i;
      }
    }
    float max_dist = sqrtf(max_dist2);
    if (max_dist <= eps) {
      if (kept_size == 0 || kept[kept_size - 1] != start) {
        kept[kept_size++] = start;
      }
      if (kept_size == 0 || kept[kept_size - 1] != end) {
        kept[kept_size++] = end;
      }
    } else {
      // Push right first, then left to simulate recursion order
      g_stack[stack_top++] = (interval_t){max_i, end};
      g_stack[stack_top++] = (interval_t){start, max_i};
    }
  }
  // sort and unique
  qsort(kept, kept_size, sizeof(int), cmp_int);
  int unique_size = 0;
  for (int i = 0; i < kept_size; ++i) {
    if (i == 0 || kept[i] != kept[i - 1]) {
      kept[unique_size++] = kept[i];
    }
  }
  *count_out = unique_size;
}
/*
 * Pack a quaternion (4 floats) into 32 bits.
 *
 * Layout, most significant first: 2 bits holding the index of the component
 * with the largest magnitude, then one 10-bit field for each of the other
 * three components in index order. Each field is a sign bit (bit 9) followed
 * by a 9-bit magnitude equal to |component| * 1.414213562 scaled to 0..511.
 * The sign bit is stored relative to the sign of the largest component.
 */
static unsigned
qenc(const float *qin) {
  float mag_f[4];
  for (int c = 0; c < 4; ++c) {
    mag_f[c] = fabsf(qin[c]);
  }
  /* First component with the strictly largest magnitude. */
  int largest = 0;
  for (int c = 1; c < 4; ++c) {
    if (mag_f[c] > mag_f[largest]) {
      largest = c;
    }
  }
  const unsigned mag_max = 511u;
  const unsigned flip = (qin[largest] < 0.0f);
  unsigned packed = (unsigned)largest;
  for (int c = 0; c < 4; ++c) {
    if (c != largest) {
      unsigned sign = ((qin[c] < 0.0f) ^ flip);
      unsigned mag = (unsigned)(mag_max * (mag_f[c] * 1.414213562f) + 0.5f);
      if (mag > mag_max) {
        mag = mag_max;
      }
      packed = (packed << 10u) | (sign << 9u) | mag;
    }
  }
  return packed;
}
/*
 * Command-line entry point: bake one animation clip of a glTF file into a C
 * source file.
 *
 * Usage: <gltf_file> <output_c_file> [clip_name] [joint_name ...]
 *
 * Stages:
 *   1. Load the file and take the first skin; build a parent index table.
 *   2. Select joints (all skin joints, or those whose name hash matches one
 *      of the given joint names).
 *   3. Sample the clip at 60 frames per second, build global matrices for the
 *      whole skin and back-compute the local transform of each selected joint.
 *   4. Simplify every joint track with rdp_simplify() and take the union of
 *      kept frames per track type as that track's key frames.
 *   5. Quantise positions and scale to 16 bits, rotations via qenc().
 *   6. Write the key arrays, the frame-to-key tables and a `struct anm_clip`
 *      initialiser to the output file.
 *
 * Returns 0 on success and 1 on any failure (message on stderr).
 *
 * NOTE(review): when no clip name is given the literal name "def" is searched
 * for; the "use the first animation" fallback below is only reached for an
 * empty clip_name argument. Confirm this is intended.
 */
extern int
main(int argc, char **argv) {
  if (argc < 3) {
    fprintf(stderr, "Usage: %s <gltf_file> <output_c_file> [clip_name] [joint_name1 [joint_name2 ...]]\n", argv[0]);
    fprintf(stderr, "If clip_name is provided, joint names follow it. Omitting joint names uses all skin joints.\n");
    return 1;
  }
  char *gltf_path = argv[1];
  char *out_path = argv[2];
  char *clip_name = (argc > 3) ? argv[3] : "def";
  int first_joint_arg = (argc > 3) ? 4 : 3;

  int num_joint_args = argc - first_joint_arg;
  if (num_joint_args > MAX_JOINTS) {
    fprintf(stderr, "Too many joint names provided: %d (max %d)\n", num_joint_args, MAX_JOINTS);
    return 1;
  }
  cgltf_data *scene = 0;
  cgltf_options options = { 0 };
  cgltf_result res = cgltf_parse_file(&options, gltf_path, &scene);
  if (res != cgltf_result_success) {
    fprintf(stderr, "Failed to parse GLTF: %d\n", res);
    return 1;
  }
  res = cgltf_load_buffers(&options, scene, gltf_path);
  if (res != cgltf_result_success) {
    fprintf(stderr, "Failed to load GLTF buffers: %d\n", res);
    cgltf_free(scene);
    return 1;
  }
  cgltf_skin *skin = (scene->skins_count > 0) ? scene->skins : NULL;
  if (!skin || skin->joints_count == 0) {
    fprintf(stderr, "No skin or joints found\n");
    cgltf_free(scene);
    return 1;
  }
  int g_all_cnt = (int)skin->joints_count;
  if (g_all_cnt > MAX_ALL_JOINTS) {
    fprintf(stderr, "Too many all joints: %d (max %d)\n", g_all_cnt, MAX_ALL_JOINTS);
    cgltf_free(scene);
    return 1;
  }
  // Build parents: g_parents[child] = index of the joint listing it as a
  // child, or -1 when no skin joint does.
  memset(g_parents, -1, sizeof(g_parents));
  for (int k = 0; k < g_all_cnt; ++k) {
    cgltf_node *node_k = skin->joints[k];
    for (size_t c = 0; c < node_k->children_count; ++c) {
      cgltf_node *ch = node_k->children[c];
      int cidx = -1;
      for (int jj = 0; jj < g_all_cnt; ++jj) {
        if (skin->joints[jj] == ch) {
          cidx = jj;
          break;
        }
      }
      if (cidx != -1) {
        g_parents[cidx] = k;
      }
    }
  }
  // Selected hashes
  uint32_t selected_hashes[MAX_JOINTS] = {0};
  for (int s = 0; s < num_joint_args; ++s) {
    selected_hashes[s] = fnv1a_32(argv[first_joint_arg + s]);
  }
  // Map to selected
  int sel_cnt = 0;
  unsigned long long found[MAX_JOINTS/64] = {0};
  if (num_joint_args == 0) {
    for (int aj = 0; aj < g_all_cnt; ++aj) {
      g_bone_indices[aj] = aj;
      sel_cnt++;
    }
  } else {
    for (int aj = 0; aj < g_all_cnt; ++aj) {
      const char *n = skin->joints[aj]->name ? skin->joints[aj]->name : "";
      uint32_t h = fnv1a_32(n);
      for (int s = 0; s < num_joint_args; ++s) {
        if (selected_hashes[s] == h && !(found[s/64] & (1llu << (s & 63)))) {
          /* OR the bit in: plain assignment would forget every joint name
           * already matched in the same 64-bit word. */
          found[s/64] |= (1llu << (s & 63));
          g_bone_indices[sel_cnt] = aj;
          sel_cnt++;
          break;
        }
      }
    }
    int missing = 0;
    for (int s = 0; s < num_joint_args; ++s) {
      if (!(found[s/64] & (1llu << (s & 63)))) {
        missing++;
      }
    }
    if (missing > 0) {
      fprintf(stderr, "Warning: %d skeleton joint names not found in skin\n", missing);
    }
  }
  int joint_cnt = sel_cnt;
  if (joint_cnt == 0 || joint_cnt > MAX_JOINTS) {
    fprintf(stderr, "Invalid selected joint count: %d (must be 1-%d)\n", joint_cnt, MAX_JOINTS);
    cgltf_free(scene);
    return 1;
  }
  if (num_joint_args > 0) {
    printf("Using %d selected joints (out of %d provided; %d total in skin)\n", joint_cnt, num_joint_args, g_all_cnt);
  }

  // Pick the animation by exact name, or the first one for an empty name.
  cgltf_animation *anim = 0;
  size_t clip_len = strlen(clip_name);
  if (clip_len > 0) {
    for (size_t s = 0; s < scene->animations_count; ++s) {
      cgltf_animation *candidate = &scene->animations[s];
      if (candidate->name.length == clip_len &&
          strncmp(candidate->name.data, clip_name, clip_len) == 0) {
        anim = candidate;
        break;
      }
    }
  } else if (scene->animations_count > 0) {
    anim = &scene->animations[0];
  }
  if (!anim) {
    fprintf(stderr, "No animation available\n");
    cgltf_free(scene);
    return 1;
  }

  // Clip duration = latest final key time over all samplers.
  double duration_sec = 0.0;
  int has_anim_data = 0;
  for (size_t s = 0; s < anim->samplers_count; ++s) {
    cgltf_accessor *times_acc = anim->samplers[s].input;
    if (times_acc && times_acc->count > 0) {
      has_anim_data = 1;
      float last_time_f;
      cgltf_accessor_read_float(times_acc, times_acc->count - 1, 0, &last_time_f);
      double last = (double)last_time_f;
      if (last > duration_sec) {
        duration_sec = last;
      }
    }
  }
  if (!has_anim_data || duration_sec <= 0.0) {
    duration_sec = 0.0;
  }
  double fps = 60.0;
  int frame_cnt = 1;
  if (duration_sec > 0.0) {
    /* Clamp while still in floating point: converting an out-of-range float
     * to int is undefined behaviour. */
    float frames_f = roundf(duration_sec * fps) + 1.0f;
    frame_cnt = (frames_f > (float)MAX_FRAMES) ? MAX_FRAMES : (int)frames_f;
  }
  double dt_sec = (frame_cnt > 1) ? duration_sec / (frame_cnt - 1.0) : 0.0;
  for (int i = 0; i < frame_cnt; ++i) {
    // The last frame is pinned to the exact duration.
    double time_sec = (i == frame_cnt - 1) ? duration_sec : i * dt_sec;

    // Compute all local TRS
    float all_local_pos[MAX_ALL_JOINTS * 3];
    float all_local_rot[MAX_ALL_JOINTS * 4];
    float all_local_scl[MAX_ALL_JOINTS * 3];

    for (int aj = 0; aj < g_all_cnt; ++aj) {
      cgltf_node *node = skin->joints[aj];

      transform_t xf;
      eval_transform(anim, node, time_sec, &xf);

      int off = aj * 3;
      all_local_pos[off + 0] = xf.tx;
      all_local_pos[off + 1] = xf.ty;
      all_local_pos[off + 2] = xf.tz;

      off = aj * 4;
      all_local_rot[off + 0] = xf.rx;
      all_local_rot[off + 1] = xf.ry;
      all_local_rot[off + 2] = xf.rz;
      all_local_rot[off + 3] = xf.rw;

      off = aj * 3;
      all_local_scl[off + 0] = xf.sx;
      all_local_scl[off + 1] = xf.sy;
      all_local_scl[off + 2] = xf.sz;
    }
    // Compute globals
    {
      mat4 id;
      mat4_set_identity(id);

      // Set root globals
      int all_cnt = (int)skin->joints_count;
      for (int r = 0; r < all_cnt; ++r) {
        if (g_parents[r] != -1) continue;
        float lpos[3] = {all_local_pos[r * 3 + 0], all_local_pos[r * 3 + 1], all_local_pos[r * 3 + 2]};
        float lrot[4] = {all_local_rot[r * 4 + 0], all_local_rot[r * 4 + 1], all_local_rot[r * 4 + 2], all_local_rot[r * 4 + 3]};
        float lscl[3] = {all_local_scl[r * 3 + 0], all_local_scl[r * 3 + 1], all_local_scl[r * 3 + 2]};

        mat4 local_m;
        mat4_from_trs(lpos, lrot, lscl, local_m);

        mat4 root_g;
        mat4_mul(id, local_m, root_g);
        memcpy(g_globals + r * MAT4_SIZE, root_g, sizeof(mat4));
      }

      // Iterative DFS traversal using stack to simulate recursion
      int stack[MAX_ALL_JOINTS];
      int stack_top = 0;
      for (int r = 0; r < all_cnt; ++r) {
        if (g_parents[r] == -1) {
          stack[stack_top++] = r;
        }
      }
      while (stack_top > 0) {
        int idx = stack[--stack_top];
        // Process children in reverse order to match recursion depth-first order
        cgltf_node *node = skin->joints[idx];
        for (int cc = (int)node->children_count - 1; cc >= 0; --cc) {
          cgltf_node *chn = node->children[cc];
          int cidx = -1;
          for (int jj = 0; jj < all_cnt; ++jj) {
            if (skin->joints[jj] == chn) {
              cidx = jj;
              break;
            }
          }
          if (cidx == -1) {
            continue;
          }
          float lpos[3] = {all_local_pos[cidx * 3 + 0], all_local_pos[cidx * 3 + 1], all_local_pos[cidx * 3 + 2]};
          float lrot[4] = {all_local_rot[cidx * 4 + 0], all_local_rot[cidx * 4 + 1], all_local_rot[cidx * 4 + 2], all_local_rot[cidx * 4 + 3]};
          float lscl[3] = {all_local_scl[cidx * 3 + 0], all_local_scl[cidx * 3 + 1], all_local_scl[cidx * 3 + 2]};

          mat4 local_m;
          mat4_from_trs(lpos, lrot, lscl, local_m);

          mat4 parent_g;
          memcpy(parent_g, g_globals + idx * MAT4_SIZE, sizeof(mat4));

          mat4 g;
          mat4_mul(parent_g, local_m, g);
          memcpy(g_globals + cidx * MAT4_SIZE, g, sizeof(mat4));
          // Push child to stack (safe since MAX_ALL_JOINTS limit)
          stack[stack_top++] = cidx;
        }
      }
    }
    // Back-compute locals for selected
    for (int j = 0; j < joint_cnt; ++j) {
      int idx = g_bone_indices[j];
      int off = j * frame_cnt + i;

      mat4 g;
      memcpy(g, g_globals + idx * MAT4_SIZE, sizeof(mat4));

      float lpos[3], lrot[4], lscl[3];
      if (g_parents[idx] == -1) {
        mat4_decompose(g, lpos, lrot, lscl);
      } else {
        int pidx = g_parents[idx];

        mat4 pg;
        memcpy(pg, g_globals + pidx * MAT4_SIZE, sizeof(mat4));

        mat4 parent_inv;
        mat4_invert(pg, parent_inv);

        mat4 local_m;
        mat4_mul(parent_inv, g, local_m);
        mat4_decompose(local_m, lpos, lrot, lscl);
      }
      g_pos_x[off] = lpos[0];
      g_pos_y[off] = lpos[1];
      g_pos_z[off] = lpos[2];

      g_rot_x[off] = lrot[0];
      g_rot_y[off] = lrot[1];
      g_rot_z[off] = lrot[2];
      g_rot_w[off] = lrot[3];

      // Only a uniform scale is stored: the mean of the three axes.
      g_scl_s[off] = (lscl[0] + lscl[1] + lscl[2]) / 3.0f;
    }
  }
  // Simplify
  float eps_pos = 0.01f;
  float eps_rot = 0.02f;
  float eps_scl = 0.001f;

  memset(g_used_pos_bits.bits, 0, sizeof(g_used_pos_bits.bits));
  memset(g_used_rot_bits.bits, 0, sizeof(g_used_rot_bits.bits));
  memset(g_used_scl_bits.bits, 0, sizeof(g_used_scl_bits.bits));

  // A frame becomes a key for a track type as soon as any joint keeps it.
  for (int j = 0; j < joint_cnt; ++j) {
    // Position
    for (int f = 0; f < frame_cnt; ++f) {
      int off = j * frame_cnt + f;
      g_tmp_pos[f * 3 + 0] = g_pos_x[off];
      g_tmp_pos[f * 3 + 1] = g_pos_y[off];
      g_tmp_pos[f * 3 + 2] = g_pos_z[off];
    }
    int kept_count;
    rdp_simplify(g_tmp_pos, frame_cnt, 3, eps_pos, g_kept, &kept_count);
    for (int kk = 0; kk < kept_count; ++kk) {
      set_bit(&g_used_pos_bits, g_kept[kk]);
    }
    // Rotation
    for (int f = 0; f < frame_cnt; ++f) {
      int off = j * frame_cnt + f;
      g_tmp_rot[f * 4 + 0] = g_rot_x[off];
      g_tmp_rot[f * 4 + 1] = g_rot_y[off];
      g_tmp_rot[f * 4 + 2] = g_rot_z[off];
      g_tmp_rot[f * 4 + 3] = g_rot_w[off];
    }
    rdp_simplify(g_tmp_rot, frame_cnt, 4, eps_rot, g_kept, &kept_count);
    for (int kk = 0; kk < kept_count; ++kk) {
      set_bit(&g_used_rot_bits, g_kept[kk]);
    }
    // Scale
    for (int f = 0; f < frame_cnt; ++f) {
      int off = j * frame_cnt + f;
      g_tmp_scl[f] = g_scl_s[off];
    }
    rdp_simplify(g_tmp_scl, frame_cnt, 1, eps_scl, g_kept, &kept_count);
    for (int kk = 0; kk < kept_count; ++kk) {
      set_bit(&g_used_scl_bits, g_kept[kk]);
    }
  }
  // Build key lists
  int num_pos_keys = 0;
  for (int f = 0; f < frame_cnt; ++f) {
    if (test_bit(&g_used_pos_bits, f)) {
      g_key_pos_list[num_pos_keys++] = f;
    }
  }
  int num_rot_keys = 0;
  for (int f = 0; f < frame_cnt; ++f) {
    if (test_bit(&g_used_rot_bits, f)) {
      g_key_rot_list[num_rot_keys++] = f;
    }
  }
  int num_scl_keys = 0;
  for (int f = 0; f < frame_cnt; ++f) {
    if (test_bit(&g_used_scl_bits, f)) {
      g_key_scl_list[num_scl_keys++] = f;
    }
  }
  // Compute frame_to_key: index of the last key at or before each frame
  for (int i = 0; i < frame_cnt; ++i) {
    int k = num_pos_keys - 1;
    while (k >= 0 && g_key_pos_list[k] > i) {
      --k;
    }
    g_frame_to_key_pos[i] = (unsigned short)(k < 0 ? 0 : k);

    k = num_rot_keys - 1;
    while (k >= 0 && g_key_rot_list[k] > i) {
      --k;
    }
    g_frame_to_key_rot[i] = (unsigned short)(k < 0 ? 0 : k);

    k = num_scl_keys - 1;
    while (k >= 0 && g_key_scl_list[k] > i) {
      --k;
    }
    g_frame_to_key_scl[i] = (unsigned short)(k < 0 ? 0 : k);
  }
  // Compute quantization params: value = off + scl * stored_u16
  float pos_scl_val = 1.0f;
  float scl_scl_val = 1.0f;
  float rot_scl_val = 1.0f;

  float pos_off = 0.0f;
  float scl_off = 0.0f;
  float rot_off = 0.0f;

  if (num_pos_keys > 0) {
    // One shared range across x, y and z of every joint and key.
    float pmin = INFINITY;
    float pmax = -INFINITY;

    for (int kk = 0; kk < num_pos_keys; ++kk) {
      int f = g_key_pos_list[kk];
      for (int j = 0; j < joint_cnt; ++j) {

        int off = j * frame_cnt + f;
        pmin = fminf(pmin, g_pos_x[off]);
        pmin = fminf(pmin, g_pos_y[off]);
        pmin = fminf(pmin, g_pos_z[off]);

        pmax = fmaxf(pmax, g_pos_x[off]);
        pmax = fmaxf(pmax, g_pos_y[off]);
        pmax = fmaxf(pmax, g_pos_z[off]);
      }
    }
    float range = pmax - pmin;
    pos_scl_val = (range > 0.0f) ? range / 65535.0f : 1.0f;
    pos_off = pmin;
  }
  if (num_scl_keys > 0) {
    float smin = INFINITY;
    float smax = -INFINITY;
    for (int kk = 0; kk < num_scl_keys; ++kk) {
      int f = g_key_scl_list[kk];
      for (int j = 0; j < joint_cnt; ++j) {
        int off = j * frame_cnt + f;
        smin = fminf(smin, g_scl_s[off]);
        smax = fmaxf(smax, g_scl_s[off]);
      }
    }
    float range = smax - smin;
    scl_scl_val = (range > 0.0f) ? range / 65535.0f : 1.0f;
    scl_off = smin;
  }
  // Generate keys (key-major layout: idx = key * joint_cnt + joint)
  for (int kk = 0; kk < num_pos_keys; ++kk) {
    int f = g_key_pos_list[kk];
    for (int j = 0; j < joint_cnt; ++j) {
      int idx = kk * joint_cnt + j;
      int off = j * frame_cnt + f;

      float valx = g_pos_x[off];
      float qx = (valx - pos_off) / pos_scl_val;
      g_keys_pos_x[idx] = (unsigned short)roundf(fmaxf(0.0f, fminf(65535.0f, qx)));

      float valy = g_pos_y[off];
      float qy = (valy - pos_off) / pos_scl_val;
      g_keys_pos_y[idx] = (unsigned short)roundf(fmaxf(0.0f, fminf(65535.0f, qy)));

      float valz = g_pos_z[off];
      float qz = (valz - pos_off) / pos_scl_val;
      g_keys_pos_z[idx] = (unsigned short)roundf(fmaxf(0.0f, fminf(65535.0f, qz)));
    }
  }
  for (int kk = 0; kk < num_scl_keys; ++kk) {
    int f = g_key_scl_list[kk];
    for (int j = 0; j < joint_cnt; ++j) {
      int idx = kk * joint_cnt + j;
      int off = j * frame_cnt + f;
      float vals = g_scl_s[off];
      float qs = (vals - scl_off) / scl_scl_val;
      g_keys_scl[idx] = (unsigned short)roundf(fmaxf(0.0f, fminf(65535.0f, qs)));
    }
  }
  for (int kk = 0; kk < num_rot_keys; ++kk) {
    int f = g_key_rot_list[kk];
    for (int j = 0; j < joint_cnt; ++j) {
      int idx = kk * joint_cnt + j;
      int off = j * frame_cnt + f;
      float qin[4] = {g_rot_x[off], g_rot_y[off], g_rot_z[off], g_rot_w[off]};
      g_keys_rot[idx] = qenc(qin);
    }
  }
  // Output to C file
  FILE *out = fopen(out_path, "w");
  if (!out) {
    perror("Failed to open output file");
    cgltf_free(scene);
    return 1;
  }
  size_t total_pos = (size_t)num_pos_keys * joint_cnt;
  fprintf(out, "static const unsigned short anm_%s_pos_x[] = {\n", clip_name);
  for (size_t ii = 0; ii < total_pos; ii += 8) {
    bool line_start = true;
    for (int jj = 0; jj < 8 && (ii + jj) < total_pos; ++jj) {
      if (!line_start) {
        fputc(' ', out);
      }
      fprintf(out, "0x%04x,", g_keys_pos_x[ii + jj]);
      line_start = false;
    }
    fputc('\n', out);
  }
  fprintf(out, "};\n\n");

  fprintf(out, "static const unsigned short anm_%s_pos_y[] = {\n", clip_name);
  for (size_t ii = 0; ii < total_pos; ii += 8) {
    bool line_start = true;
    for (int jj = 0; jj < 8 && (ii + jj) < total_pos; ++jj) {
      if (!line_start) {
        fputc(' ', out);
      }
      fprintf(out, "0x%04x,", g_keys_pos_y[ii + jj]);
      line_start = false;
    }
    fputc('\n', out);
  }
  fprintf(out, "};\n\n");

  fprintf(out, "static const unsigned short anm_%s_pos_z[] = {\n", clip_name);
  for (size_t ii = 0; ii < total_pos; ii += 8) {
    bool line_start = true;
    for (int jj = 0; jj < 8 && (ii + jj) < total_pos; ++jj) {
      if (!line_start) {
        fputc(' ', out);
      }
      fprintf(out, "0x%04x,", g_keys_pos_z[ii + jj]);
      line_start = false;
    }
    fputc('\n', out);
  }
  fprintf(out, "};\n\n");

  size_t total_rot = (size_t)num_rot_keys * joint_cnt;
  fprintf(out, "static const unsigned anm_%s_rot[] = {\n", clip_name);
  for (size_t ii = 0; ii < total_rot; ii += 8) {
    bool line_start = true;
    for (int jj = 0; jj < 8 && (ii + jj) < total_rot; ++jj) {
      if (!line_start) {
        fputc(' ', out);
      }
      fprintf(out, "0x%08x,", g_keys_rot[ii + jj]);
      line_start = false;
    }
    fputc('\n', out);
  }
  fprintf(out, "};\n\n");

  size_t total_scl = (size_t)num_scl_keys * joint_cnt;
  fprintf(out, "static const unsigned short anm_%s_scl[] = {\n", clip_name);
  for (size_t ii = 0; ii < total_scl; ii += 8) {
    bool line_start = true;
    for (int jj = 0; jj < 8 && (ii + jj) < total_scl; ++jj) {
      if (!line_start) {
        fputc(' ', out);
      }
      fprintf(out, "0x%04x,", g_keys_scl[ii + jj]);
      line_start = false;
    }
    fputc('\n', out);
  }
  fprintf(out, "};\n\n");

  fprintf(out, "static const unsigned short anm_%s_frame_to_key_pos[] = {\n", clip_name);
  for (int ii = 0; ii < frame_cnt; ii += 16) {
    bool line_start = true;
    for (int jj = 0; jj < 16 && (ii + jj) < frame_cnt; ++jj) {
      if (!line_start) {
        fputc(' ', out);
      }
      fprintf(out, "%hu,", g_frame_to_key_pos[ii + jj]);
      line_start = false;
    }
    fputc('\n', out);
  }
  fprintf(out, "};\n\n");

  fprintf(out, "static const unsigned short anm_%s_frame_to_key_rot[] = {\n", clip_name);
  for (int ii = 0; ii < frame_cnt; ii += 16) {
    bool line_start = true;
    for (int jj = 0; jj < 16 && (ii + jj) < frame_cnt; ++jj) {
      if (!line_start) {
        fputc(' ', out);
      }
      fprintf(out, "%hu,", g_frame_to_key_rot[ii + jj]);
      line_start = false;
    }
    fputc('\n', out);
  }
  fprintf(out, "};\n\n");

  fprintf(out, "static const unsigned short anm_%s_frame_to_key_scl[] = {\n", clip_name);
  for (int ii = 0; ii < frame_cnt; ii += 16) {
    bool line_start = true;
    for (int jj = 0; jj < 16 && (ii + jj) < frame_cnt; ++jj) {
      if (!line_start) {
        fputc(' ', out);
      }
      fprintf(out, "%hu,", g_frame_to_key_scl[ii + jj]);
      line_start = false;
    }
    fputc('\n', out);
  }
  fprintf(out, "};\n\n");
  fprintf(out, "static const struct anm_clip anm_%s_clip = {\n", clip_name);
  fprintf(out, "  .joint_cnt = %d,\n", joint_cnt);
  fprintf(out, "  .frame_cnt = %d,\n", frame_cnt);
  fprintf(out, "  .off = {%.6ff, %.6ff, %.6ff},\n", pos_off, rot_off, scl_off);
  fprintf(out, "  .scl = {%.6ff, %.6ff, %.6ff},\n", pos_scl_val, rot_scl_val, scl_scl_val);
  fprintf(out, "  .keys = {\n");
  fprintf(out, "    .pos_x = anm_%s_pos_x,\n", clip_name);
  fprintf(out, "    .pos_y = anm_%s_pos_y,\n", clip_name);
  fprintf(out, "    .pos_z = anm_%s_pos_z,\n", clip_name);
  fprintf(out, "    .rot_pos = anm_%s_rot,\n", clip_name);
  fprintf(out, "    .scl_s = anm_%s_scl,\n", clip_name);
  fprintf(out, "  },\n");
  fprintf(out, "  .blks = {\n");
  fprintf(out, "    .frame_to_key_pos = anm_%s_frame_to_key_pos,\n", clip_name);
  fprintf(out, "    .frame_to_key_rot = anm_%s_frame_to_key_rot,\n", clip_name);
  fprintf(out, "    .frame_to_key_scl = anm_%s_frame_to_key_scl,\n", clip_name);
  fprintf(out, "  },\n");
  fprintf(out, "};\n");

  /* fclose flushes the buffered output; a failure here means the generated
   * file is incomplete and must not be reported as a success. */
  if (fclose(out) != 0) {
    perror("Failed to write output file");
    cgltf_free(scene);
    return 1;
  }
  cgltf_free(scene);
  cgltf_options_destroy(&options);

  printf("Generated animation clip '%s' with %d frames, pos_keys=%d, rot_keys=%d, scl_keys=%d\n",
         clip_name, frame_cnt, num_pos_keys, num_rot_keys, num_scl_keys);
  return 0;
}

## gltf2c_skel.c
#include "cgltf.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <assert.h>
#include <float.h>
#include <stdbool.h>
#include <stdint.h>
#include <ctype.h>

#define MAX_JOINTS 256
typedef float mat4[16];

/*
 * 32-bit FNV-1a hash of a NUL-terminated string: starting from the offset
 * basis 0x811c9dc5, each byte is XORed in and the result multiplied by the
 * prime 0x01000193. The terminator is not hashed.
 */
static uint32_t
fnv1a_32(const char *str) {
  const uint32_t fnv_prime = 0x01000193u;
  uint32_t h = 0x811c9dc5u;
  for (const unsigned char *p = (const unsigned char *)str; *p != '\0'; ++p) {
    h = (h ^ *p) * fnv_prime;
  }
  return h;
}
/* Overwrite all 16 elements of `m` with the 4x4 identity matrix. */
static void
mat4_id(mat4 m) {
  for (int i = 0; i < 16; ++i) {
    /* Diagonal entries sit at indices 0, 5, 10 and 15. */
    m[i] = (i % 5 == 0) ? 1.0f : 0.0f;
  }
}
/*
 * 4x4 matrix product over flat arrays of 16 floats:
 *   out[i*4 + j] = sum over k of a[i*4 + k] * b[k*4 + j]
 *
 * Read as row-major storage this is a * b; read as column-major storage the
 * same index pattern yields b * a.
 *
 * `out` must not overlap `a` or `b`: elements of `out` are written while the
 * inputs are still being read.
 */
static void
mat4_mul(const float *restrict a, const float *restrict b, float *restrict out) {
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
      float sum = 0.0f;
      for (int k = 0; k < 4; ++k) {
        sum += a[i * 4 + k] * b[k * 4 + j];
      }
      out[i * 4 + j] = sum;
    }
  }
}
/*
 * Invert an affine 4x4 matrix stored as 16 floats.
 *
 * Only the 3x3 block m[0..2], m[4..6], m[8..10] and the translation
 * m[12..14] are read; out[3], out[7], out[11] are written as 0 and out[15]
 * as 1 regardless of the corresponding input elements.
 *
 * Returns 1 on success. When the determinant of the 3x3 block is smaller in
 * magnitude than 1e-8 the matrix is treated as singular: `out` is set to the
 * identity and 0 is returned.
 *
 * `m` and `out` must not overlap.
 */
static int
mat4_try_invert(const float *restrict m, float *restrict out) {
  float det = m[0] * (m[5] * m[10] - m[9] * m[6]) -
              m[4] * (m[1] * m[10] - m[9] * m[2]) +
              m[8] * (m[1] * m[6] - m[5] * m[2]);

  if (fabsf(det) < 1e-8f) {
    mat4_id(out); // fallback
    return 0;
  }
  /* Inverse of the 3x3 block: adjugate divided by the determinant. */
  float idet = 1.0f / det;
  out[0] = (m[5] * m[10] - m[6] * m[9]) * idet;
  out[1] = (m[9] * m[2] - m[1] * m[10]) * idet;
  out[2] = (m[1] * m[6] - m[5] * m[2]) * idet;
  out[4] = (m[6] * m[8] - m[4] * m[10]) * idet;
  out[5] = (m[0] * m[10] - m[8] * m[2]) * idet;
  out[6] = (m[4] * m[2] - m[0] * m[6]) * idet;
  out[8] = (m[4] * m[9] - m[5] * m[8]) * idet;
  out[9] = (m[8] * m[1] - m[0] * m[9]) * idet;
  out[10] = (m[0] * m[5] - m[4] * m[1]) * idet;
  out[3] = out[7] = out[11] = 0.0f;
  out[15] = 1.0f;
  /* Inverse translation: the negated product of the inverted 3x3 block and
   * the original translation. */
  out[12] = -(out[0] * m[12] + out[4] * m[13] + out[8] * m[14]);
  out[13] = -(out[1] * m[12] + out[5] * m[13] + out[9] * m[14]);
  out[14] = -(out[2] * m[12] + out[6] * m[13] + out[10] * m[14]);
  return 1;
}
/* Builds a 4x4 transform in `out` from a rotation, a position and a scale.
 *   rot: quaternion as four floats, read as (x, y, z, w)
 *   pos: three floats copied to elements 12..14
 *   scl: three floats; scl[0], scl[1] and scl[2] multiply the first, second
 *        and third group of three rotation entries respectively
 * Elements 3, 7 and 11 are set to 0 and element 15 to 1. The quaternion is
 * used as given; it is not normalized here. */
static void
mat4_compose(const float *restrict rot, const float *restrict pos,
             const float *restrict scl, mat4 out) {
  const float qx = rot[0];
  const float qy = rot[1];
  const float qz = rot[2];
  const float qw = rot[3];

  /* pairwise products of the quaternion components */
  const float xx = qx * qx, xy = qx * qy, xz = qx * qz, xw = qx * qw;
  const float yy = qy * qy, yz = qy * qz, yw = qy * qw;
  const float zz = qz * qz, zw = qz * qw;

  const float sx = scl[0];
  const float sy = scl[1];
  const float sz = scl[2];

  /* rotation entries (0,0), (1,0), (2,0) scaled by sx */
  out[0] = (1.0f - 2.0f * (yy + zz)) * sx;
  out[1] = (2.0f * (xy + zw)) * sx;
  out[2] = (2.0f * (xz - yw)) * sx;
  out[3] = 0.0f;

  /* rotation entries (0,1), (1,1), (2,1) scaled by sy */
  out[4] = (2.0f * (xy - zw)) * sy;
  out[5] = (1.0f - 2.0f * (xx + zz)) * sy;
  out[6] = (2.0f * (yz + xw)) * sy;
  out[7] = 0.0f;

  /* rotation entries (0,2), (1,2), (2,2) scaled by sz */
  out[8] = (2.0f * (xz + yw)) * sz;
  out[9] = (2.0f * (yz - xw)) * sz;
  out[10] = (1.0f - 2.0f * (xx + yy)) * sz;
  out[11] = 0.0f;

  /* translation */
  out[12] = pos[0];
  out[13] = pos[1];
  out[14] = pos[2];
  out[15] = 1.0f;
}
/* Returns 1 when `node` appears in the joint list of `skn`, otherwise 0. */
static int
skin_has_joint(const cgltf_skin *skn, const cgltf_node *node) {
  for (size_t i = 0; i < skn->joints_count; ++i) {
    if (skn->joints[i] == node) {
      return 1;
    }
  }
  return 0;
}
/* Prints elements 0-2, 4-6, 8-10 and 12-14 of `m` as four brace-enclosed
 * float triples; the last triple is written without a trailing comma. */
static void
write_mat4_rows(FILE *out, const float *m) {
  fprintf(out, "      { %.6ff, %.6ff, %.6ff },", m[0], m[1], m[2]);
  fprintf(out, "      { %.6ff, %.6ff, %.6ff },", m[4], m[5], m[6]);
  fprintf(out, "      { %.6ff, %.6ff, %.6ff },", m[8], m[9], m[10]);
  fprintf(out, "      { %.6ff, %.6ff, %.6ff }", m[12], m[13], m[14]);
}
/* Converts the skeleton of one glTF skin into a C table of bones.
 *
 * Usage: <gltf_file> <output_c_file> <skeleton_name> [skin_name]
 *
 * The skin called `skin_name` is used when it is given and found, otherwise
 * the first skin of the file. Starting at the skin's root node the hierarchy
 * is walked depth first (children in declaration order). Every visited node
 * that is a joint of the skin becomes one bone holding its parent bone index
 * (-1 for a root bone), the FNV-1a hash of its name, its local pose matrix
 * and the inverse of its accumulated bone-to-skin matrix. Root bones get the
 * identity as local pose. Nodes that are not joints are skipped together
 * with their subtree, except for the root itself, which is passed through.
 *
 * The table is written to `output_c_file` as
 * `static const struct bone skl_<skeleton_name>[]`.
 *
 * Returns 0 on success and 1 on any error (a message goes to stderr). */
extern int
main(int argc, char **argv) {
  if (argc < 4) {
    fprintf(stderr, "Usage: %s <gltf_file> <output_c_file> <skeleton_name> [skin_name]\n", argv[0]);
    return 1;
  }
  const char *gltf_path = argv[1];
  const char *out_path = argv[2];
  const char *skeleton_name = argv[3];
  const char *skin_name = (argc > 4) ? argv[4] : 0;

  /* resources released at `done`; declared up front so every error path
   * can jump there safely */
  int rc = 1;
  FILE *out = 0;
  cgltf_data *data = 0;

  /* load and parse the gltf file */
  cgltf_options opt = {0};
  cgltf_result res = cgltf_parse_file(&opt, gltf_path, &data);
  if (res != cgltf_result_success) {
    fprintf(stderr, "[GLTF]: Failed to parse GLTF: %d\n", (int)res);
    return 1;
  }
  res = cgltf_load_buffers(&opt, data, gltf_path);
  if (res != cgltf_result_success) {
    fprintf(stderr, "[GLTF]: Failed to load GLTF buffers: %d\n", (int)res);
    goto done;
  }
  /* try to find the skin */
  const cgltf_skin *skn = 0;
  if (skin_name && skin_name[0] != '\0') {
    for (size_t s = 0u; s < data->skins_count; ++s) {
      const cgltf_skin *cur = &data->skins[s];
      if (cur->name && !strcmp(cur->name, skin_name)) {
        skn = cur;
        break;
      }
    }
    if (!skn) {
      fprintf(stderr, "[GLTF]: Specified skin '%s' not found; falling back to first skin.\n", skin_name);
    }
  }
  if (!skn && data->skins_count > 0) {
    skn = &data->skins[0];
  }
  if (!skn || skn->joints_count == 0) {
    fprintf(stderr, "[GLTF]: No skin or joints found\n");
    goto done;
  }
  /* compare as size_t before narrowing to int */
  if (skn->joints_count > MAX_JOINTS) {
    fprintf(stderr, "[GLTF]: Too many joints: %zu (max %d)\n", (size_t)skn->joints_count, MAX_JOINTS);
    goto done;
  }
  const int skin_jnt_cnt = (int)skn->joints_count;
  /* printable skin name; `skin_name` may be NULL and must not reach %s */
  const char *skin_label = skn->name ? skn->name : "default";

  /* try to find skin root node: first a joint without parent, otherwise the
   * parent-less parent of a joint */
  const cgltf_node *root = 0;
  for (int i = 0; i < skin_jnt_cnt && !root; ++i) {
    const cgltf_node *node = skn->joints[i];
    if (!node->parent) {
      root = node;
    }
  }
  for (int i = 0; i < skin_jnt_cnt && !root; ++i) {
    const cgltf_node *node = skn->joints[i];
    if (node->parent && !node->parent->parent) {
      root = node->parent;
    }
  }
  if (!root) {
    fprintf(stderr, "[GLTF]: Couldn't find the root node in skin: '%s'\n", skin_label);
    goto done;
  }
  /* iterate over hierarchy */
  int bone_cnt = 0;
  struct skel_bone {
    int parent;
    unsigned hash;
    const char *name;
    mat4 local;
    mat4 skin_to_world;
  }
  skeleton[MAX_JOINTS];
  {
    int stk_top = 0;
    struct stk_elm {
      int parent;
      const cgltf_node *node;
      mat4 skin_to_world;
    } stk[MAX_JOINTS];
    {
      stk[stk_top].parent = -1;
      stk[stk_top].node = root;
      mat4_id(stk[stk_top].skin_to_world);
      stk_top++;
    }
    mat4 node_to_world[MAX_JOINTS];
    while (stk_top > 0) {
      const struct stk_elm elm = stk[--stk_top];
      const cgltf_node *cur = elm.node;

      int child_parent = -1;
      mat4 child_space; /* skin_to_world handed down to the children */
      if (!skin_has_joint(skn, cur)) {
        if (cur != root) {
          continue; /* skip non-skin joints together with their subtree */
        }
        /* The root found by the second search is the joint-less parent of a
         * joint; descend through it so its joint children become root bones. */
        memcpy(child_space, elm.skin_to_world, sizeof(mat4));
      } else {
        if (bone_cnt >= MAX_JOINTS) {
          fprintf(stderr, "[GLTF]: Too many joints: %d (max %d)\n", bone_cnt + 1, MAX_JOINTS);
          goto done;
        }
        const int bone_idx = bone_cnt++;
        struct skel_bone *bone = &skeleton[bone_idx];
        bone->parent = elm.parent;
        /* glTF node names are optional; never hand NULL to the hash or %s.
         * NOTE(review): names are written unescaped into a C string literal. */
        bone->name = cur->name ? cur->name : "";
        bone->hash = fnv1a_32(bone->name);

        mat4 jnt_to_world;
        if (elm.parent == -1) {
          memcpy(bone->local, elm.skin_to_world, sizeof(mat4));
          mat4_id(jnt_to_world);
        } else {
          float pos[3] = {0,0,0};
          if (cur->has_translation) {
            pos[0] = cur->translation[0];
            pos[1] = cur->translation[1];
            pos[2] = cur->translation[2];
          }
          float rot[4] = {0,0,0,1.0f};
          if (cur->has_rotation) {
            rot[0] = cur->rotation[0];
            rot[1] = cur->rotation[1];
            rot[2] = cur->rotation[2];
            rot[3] = cur->rotation[3];
          }
          float scl[3] = {1.0f,1.0f,1.0f};
          if (cur->has_scale) {
            scl[0] = cur->scale[0];
            scl[1] = cur->scale[1];
            scl[2] = cur->scale[2];
          }
          mat4_compose(rot, pos, scl, bone->local);
          mat4_mul(bone->local, node_to_world[elm.parent], jnt_to_world);
        }
        memcpy(node_to_world[bone_idx], jnt_to_world, sizeof(mat4));

        // we want to have the skin to bone matrix so first create the world to skin matrix
        if (!mat4_try_invert(elm.skin_to_world, child_space)) {
          fprintf(stderr, "[GLTF]: failed to invert skin to world matrix in skin '%s' at joint '%s'\n", skin_label, bone->name);
          goto done;
        }
        // concatenate the bone to world and the world to skin matrices to one bone to skin matrix
        mat4 bone_to_skin;
        mat4_mul(jnt_to_world, child_space, bone_to_skin);
        if (!mat4_try_invert(bone_to_skin, bone->skin_to_world)) {
          fprintf(stderr, "[GLTF]: failed to invert the bone to skin matrix in skin '%s' at joint '%s'\n", skin_label, bone->name);
          goto done;
        }
        child_parent = bone_idx;
      }
      /* the traversal stack is fixed size: refuse to overflow it */
      if (cur->children_count > (size_t)(MAX_JOINTS - stk_top)) {
        fprintf(stderr, "[GLTF]: Node hierarchy too large for traversal stack (max %d)\n", MAX_JOINTS);
        goto done;
      }
      // process all children (reverse to keep correct sequence through stack)
      for (size_t i = cur->children_count; i > 0; --i) {
        struct stk_elm *child = &stk[stk_top++];
        child->node = cur->children[i - 1];
        child->parent = child_parent;
        memcpy(child->skin_to_world, child_space, sizeof(mat4));
      }
    }
  }
  if (bone_cnt == 0) {
    fprintf(stderr, "[GLTF]: No joints reachable from the root node in skin: '%s'\n", skin_label);
    goto done;
  }
  if (bone_cnt != skin_jnt_cnt) {
    fprintf(stderr, "[GLTF]: Warning: only %d of %d joints are reachable from the root node\n", bone_cnt, skin_jnt_cnt);
  }
  // Output
  out = fopen(out_path, "w");
  if (!out) {
    perror("Failed to open output file");
    goto done;
  }
  fprintf(out, "static const struct bone skl_%s[] = {\n", skeleton_name);
  for (int i = 0; i < bone_cnt; ++i) {
    const struct skel_bone *bone = &skeleton[i];
    fprintf(out, "  { .parent = %d, .hash = 0x%08x, .name = \"%s\", .skin_to_world = {", bone->parent, bone->hash, bone->name);
    write_mat4_rows(out, bone->skin_to_world);
    fprintf(out, "    },\n");
    /* `.pose` needs its own outer brace, and every array element needs a
     * trailing comma, for the generated initializer to be valid C */
    fprintf(out, "    .pose = {");
    write_mat4_rows(out, bone->local);
    fprintf(out, "    } },\n");
  }
  fprintf(out, "};\n");

  {
    int write_failed = ferror(out);
    if (fclose(out) != 0) {
      write_failed = 1;
    }
    out = 0;
    if (write_failed) {
      perror("Failed to write output file");
      goto done;
    }
  }
  /* report before cgltf_free(): the skin name points into `data` */
  printf("Generated skeleton '%s' with %d bones from skin '%s'\n", skeleton_name, bone_cnt, skin_label);
  rc = 0;

done:
  if (out) {
    fclose(out);
  }
  cgltf_free(data);
  return rc;
}
	#include "cgltf.h"
	#include <stdio.h>
	#include <stdlib.h>
	#include <math.h>
	#include <string.h>
	#include <assert.h>
	#include <float.h>
	#include <stdbool.h>
	#include <stdint.h>
	#include <ctype.h>

	#define MAX_JOINTS 256
	typedef float mat4[16];

	/* 32-bit FNV-1a hash of the NUL-terminated string `str`.
	 * Starts from the offset basis 0x811c9dc5; every byte is XORed into the
	 * accumulator, which is then multiplied by the prime 0x01000193 (mod 2^32).
	 * The empty string therefore hashes to the offset basis itself.
	 * `str` is dereferenced unconditionally and must not be NULL. */
	static uint32_t
	fnv1a_32(const char *str) {
	  /* walk the bytes as unsigned so high-bit characters are not sign-extended */
	  const unsigned char *cursor = (const unsigned char *)str;
	  uint32_t acc = 0x811c9dc5u;
	  for (; *cursor != '\0'; ++cursor) {
	    acc = (acc ^ *cursor) * 0x01000193u;
	  }
	  return acc;
	}
	/* Overwrites all 16 floats of `m` with the 4x4 identity matrix:
	 * elements 0, 5, 10 and 15 become 1, every other element becomes 0. */
	static void
	mat4_id(mat4 m) {
	  for (int i = 0; i < 16; ++i) {
	    /* the diagonal sits at every fifth element of the flat array */
	    m[i] = (i % 5 == 0) ? 1.0f : 0.0f;
	  }
	}
	/* Multiplies two 4x4 matrices stored as 16 consecutive floats:
	 *   out[i*4 + j] = sum over k of a[i*4 + k] * b[k*4 + j]
	 * For the column-major matrices built in this file (translation stored in
	 * elements 12..14) the result is the transform that applies `a` first and
	 * `b` second.
	 *
	 * The parameters are plain `float *restrict` pointers: `restrict` may only
	 * qualify pointer types, so it cannot be attached to the array typedef
	 * `mat4`. Callers can still pass `mat4` objects unchanged. `out` must not
	 * overlap `a` or `b`. */
	static void
	mat4_mul(const float *restrict a, const float *restrict b, float *restrict out) {
	  for (int i = 0; i < 4; ++i) {
	    for (int j = 0; j < 4; ++j) {
	      float sum = 0.0f;
	      for (int k = 0; k < 4; ++k) {
	        sum += a[i * 4 + k] * b[k * 4 + j];
	      }
	      out[i * 4 + j] = sum;
	    }
	  }
	}
	/* Inverts the affine transform `m` into `out`.
	 *
	 * Only the 3x3 block (elements 0-2, 4-6, 8-10) and the translation
	 * (elements 12-14) of `m` are read; elements 3, 7, 11 and 15 are ignored
	 * and written to `out` as 0, 0, 0, 1.
	 *
	 * Returns 1 on success. When the absolute determinant of the 3x3 block is
	 * below 1e-8 the matrix is treated as singular: `out` is set to the
	 * identity matrix and 0 is returned.
	 *
	 * The parameters are plain `float *restrict` pointers because `restrict`
	 * cannot qualify the array typedef `mat4`; callers can still pass `mat4`
	 * objects. `out` must not overlap `m`. */
	static int
	mat4_try_invert(const float *restrict m, float *restrict out) {
	  /* cofactors of the first row, shared by the determinant and the result */
	  const float c0 = m[5] * m[10] - m[6] * m[9];
	  const float c1 = m[9] * m[2] - m[1] * m[10];
	  const float c2 = m[1] * m[6] - m[5] * m[2];
	  const float det = m[0] * c0 + m[4] * c1 + m[8] * c2;

	  if (fabsf(det) < 1e-8f) {
	    /* fallback: identity (diagonal at every fifth element) */
	    for (int i = 0; i < 16; ++i) {
	      out[i] = (i % 5 == 0) ? 1.0f : 0.0f;
	    }
	    return 0;
	  }
	  const float idet = 1.0f / det;
	  out[0] = c0 * idet;
	  out[1] = c1 * idet;
	  out[2] = c2 * idet;
	  out[4] = (m[6] * m[8] - m[4] * m[10]) * idet;
	  out[5] = (m[0] * m[10] - m[8] * m[2]) * idet;
	  out[6] = (m[4] * m[2] - m[0] * m[6]) * idet;
	  out[8] = (m[4] * m[9] - m[5] * m[8]) * idet;
	  out[9] = (m[8] * m[1] - m[0] * m[9]) * idet;
	  out[10] = (m[0] * m[5] - m[4] * m[1]) * idet;
	  out[3] = out[7] = out[11] = 0.0f;
	  out[15] = 1.0f;
	  /* inverse translation: -(inverse 3x3 block) * translation */
	  out[12] = -(out[0] * m[12] + out[4] * m[13] + out[8] * m[14]);
	  out[13] = -(out[1] * m[12] + out[5] * m[13] + out[9] * m[14]);
	  out[14] = -(out[2] * m[12] + out[6] * m[13] + out[10] * m[14]);
	  return 1;
	}
	/* Builds a 4x4 transform in `out` from a rotation, a position and a scale.
	 *   rot: quaternion as four floats, read as (x, y, z, w)
	 *   pos: three floats copied to elements 12..14
	 *   scl: three floats; scl[0], scl[1] and scl[2] multiply the first, second
	 *        and third group of three rotation entries respectively
	 * Elements 3, 7 and 11 are set to 0 and element 15 to 1. The quaternion is
	 * used as given; it is not normalized here.
	 *
	 * `rot` and `pos` are pointers (they are subscripted below); declaring them
	 * as plain `float` does not compile. */
	static void
	mat4_compose(const float *restrict rot, const float *restrict pos,
	             const float *restrict scl, mat4 out) {
	  const float qx = rot[0];
	  const float qy = rot[1];
	  const float qz = rot[2];
	  const float qw = rot[3];

	  /* pairwise products of the quaternion components */
	  const float xx = qx * qx, xy = qx * qy, xz = qx * qz, xw = qx * qw;
	  const float yy = qy * qy, yz = qy * qz, yw = qy * qw;
	  const float zz = qz * qz, zw = qz * qw;

	  const float sx = scl[0];
	  const float sy = scl[1];
	  const float sz = scl[2];

	  /* rotation entries (0,0), (1,0), (2,0) scaled by sx */
	  out[0] = (1.0f - 2.0f * (yy + zz)) * sx;
	  out[1] = (2.0f * (xy + zw)) * sx;
	  out[2] = (2.0f * (xz - yw)) * sx;
	  out[3] = 0.0f;

	  /* rotation entries (0,1), (1,1), (2,1) scaled by sy */
	  out[4] = (2.0f * (xy - zw)) * sy;
	  out[5] = (1.0f - 2.0f * (xx + zz)) * sy;
	  out[6] = (2.0f * (yz + xw)) * sy;
	  out[7] = 0.0f;

	  /* rotation entries (0,2), (1,2), (2,2) scaled by sz */
	  out[8] = (2.0f * (xz + yw)) * sz;
	  out[9] = (2.0f * (yz - xw)) * sz;
	  out[10] = (1.0f - 2.0f * (xx + yy)) * sz;
	  out[11] = 0.0f;

	  /* translation */
	  out[12] = pos[0];
	  out[13] = pos[1];
	  out[14] = pos[2];
	  out[15] = 1.0f;
	}
	/* Returns 1 when `node` appears in the joint list of `skn`, otherwise 0. */
	static int
	skin_has_joint(const cgltf_skin *skn, const cgltf_node *node) {
	  for (size_t i = 0; i < skn->joints_count; ++i) {
	    if (skn->joints[i] == node) {
	      return 1;
	    }
	  }
	  return 0;
	}
	/* Prints elements 0-2, 4-6, 8-10 and 12-14 of `m` as four brace-enclosed
	 * float triples; the last triple is written without a trailing comma. */
	static void
	write_mat4_rows(FILE *out, const float *m) {
	  fprintf(out, "      { %.6ff, %.6ff, %.6ff },", m[0], m[1], m[2]);
	  fprintf(out, "      { %.6ff, %.6ff, %.6ff },", m[4], m[5], m[6]);
	  fprintf(out, "      { %.6ff, %.6ff, %.6ff },", m[8], m[9], m[10]);
	  fprintf(out, "      { %.6ff, %.6ff, %.6ff }", m[12], m[13], m[14]);
	}
	/* Converts the skeleton of one glTF skin into a C table of bones.
	 *
	 * Usage: <gltf_file> <output_c_file> <skeleton_name> [skin_name]
	 *
	 * The skin called `skin_name` is used when it is given and found, otherwise
	 * the first skin of the file. Starting at the skin's root node the hierarchy
	 * is walked depth first (children in declaration order). Every visited node
	 * that is a joint of the skin becomes one bone holding its parent bone index
	 * (-1 for a root bone), the FNV-1a hash of its name, its local pose matrix
	 * and the inverse of its accumulated bone-to-skin matrix. Root bones get the
	 * identity as local pose. Nodes that are not joints are skipped together
	 * with their subtree, except for the root itself, which is passed through.
	 *
	 * The table is written to `output_c_file` as
	 * `static const struct bone skl_<skeleton_name>[]`.
	 *
	 * Returns 0 on success and 1 on any error (a message goes to stderr). */
	extern int
	main(int argc, char **argv) {
	  if (argc < 4) {
	    fprintf(stderr, "Usage: %s <gltf_file> <output_c_file> <skeleton_name> [skin_name]\n", argv[0]);
	    return 1;
	  }
	  const char *gltf_path = argv[1];
	  const char *out_path = argv[2];
	  const char *skeleton_name = argv[3];
	  const char *skin_name = (argc > 4) ? argv[4] : 0;

	  /* resources released at `done`; declared up front so every error path
	   * can jump there safely */
	  int rc = 1;
	  FILE *out = 0;
	  cgltf_data *data = 0;

	  /* load and parse the gltf file */
	  cgltf_options opt = {0};
	  cgltf_result res = cgltf_parse_file(&opt, gltf_path, &data);
	  if (res != cgltf_result_success) {
	    fprintf(stderr, "[GLTF]: Failed to parse GLTF: %d\n", (int)res);
	    return 1;
	  }
	  res = cgltf_load_buffers(&opt, data, gltf_path);
	  if (res != cgltf_result_success) {
	    fprintf(stderr, "[GLTF]: Failed to load GLTF buffers: %d\n", (int)res);
	    goto done;
	  }
	  /* try to find the skin */
	  const cgltf_skin *skn = 0;
	  if (skin_name && skin_name[0] != '\0') {
	    for (size_t s = 0u; s < data->skins_count; ++s) {
	      const cgltf_skin *cur = &data->skins[s];
	      if (cur->name && !strcmp(cur->name, skin_name)) {
	        skn = cur;
	        break;
	      }
	    }
	    if (!skn) {
	      fprintf(stderr, "[GLTF]: Specified skin '%s' not found; falling back to first skin.\n", skin_name);
	    }
	  }
	  if (!skn && data->skins_count > 0) {
	    skn = &data->skins[0];
	  }
	  if (!skn || skn->joints_count == 0) {
	    fprintf(stderr, "[GLTF]: No skin or joints found\n");
	    goto done;
	  }
	  /* compare as size_t before narrowing to int */
	  if (skn->joints_count > MAX_JOINTS) {
	    fprintf(stderr, "[GLTF]: Too many joints: %zu (max %d)\n", (size_t)skn->joints_count, MAX_JOINTS);
	    goto done;
	  }
	  const int skin_jnt_cnt = (int)skn->joints_count;
	  /* printable skin name; `skin_name` may be NULL and must not reach %s */
	  const char *skin_label = skn->name ? skn->name : "default";

	  /* try to find skin root node: first a joint without parent, otherwise the
	   * parent-less parent of a joint */
	  const cgltf_node *root = 0;
	  for (int i = 0; i < skin_jnt_cnt && !root; ++i) {
	    const cgltf_node *node = skn->joints[i];
	    if (!node->parent) {
	      root = node;
	    }
	  }
	  for (int i = 0; i < skin_jnt_cnt && !root; ++i) {
	    const cgltf_node *node = skn->joints[i];
	    if (node->parent && !node->parent->parent) {
	      root = node->parent;
	    }
	  }
	  if (!root) {
	    fprintf(stderr, "[GLTF]: Couldn't find the root node in skin: '%s'\n", skin_label);
	    goto done;
	  }
	  /* iterate over hierarchy */
	  int bone_cnt = 0;
	  struct skel_bone {
	    int parent;
	    unsigned hash;
	    const char *name;
	    mat4 local;
	    mat4 skin_to_world;
	  }
	  skeleton[MAX_JOINTS];
	  {
	    int stk_top = 0;
	    struct stk_elm {
	      int parent;
	      const cgltf_node *node;
	      mat4 skin_to_world;
	    } stk[MAX_JOINTS];
	    {
	      stk[stk_top].parent = -1;
	      stk[stk_top].node = root;
	      mat4_id(stk[stk_top].skin_to_world);
	      stk_top++;
	    }
	    mat4 node_to_world[MAX_JOINTS];
	    while (stk_top > 0) {
	      const struct stk_elm elm = stk[--stk_top];
	      const cgltf_node *cur = elm.node;

	      int child_parent = -1;
	      mat4 child_space; /* skin_to_world handed down to the children */
	      if (!skin_has_joint(skn, cur)) {
	        if (cur != root) {
	          continue; /* skip non-skin joints together with their subtree */
	        }
	        /* The root found by the second search is the joint-less parent of a
	         * joint; descend through it so its joint children become root bones. */
	        memcpy(child_space, elm.skin_to_world, sizeof(mat4));
	      } else {
	        if (bone_cnt >= MAX_JOINTS) {
	          fprintf(stderr, "[GLTF]: Too many joints: %d (max %d)\n", bone_cnt + 1, MAX_JOINTS);
	          goto done;
	        }
	        const int bone_idx = bone_cnt++;
	        struct skel_bone *bone = &skeleton[bone_idx];
	        bone->parent = elm.parent;
	        /* glTF node names are optional; never hand NULL to the hash or %s.
	         * NOTE(review): names are written unescaped into a C string literal. */
	        bone->name = cur->name ? cur->name : "";
	        bone->hash = fnv1a_32(bone->name);

	        mat4 jnt_to_world;
	        if (elm.parent == -1) {
	          memcpy(bone->local, elm.skin_to_world, sizeof(mat4));
	          mat4_id(jnt_to_world);
	        } else {
	          float pos[3] = {0,0,0};
	          if (cur->has_translation) {
	            pos[0] = cur->translation[0];
	            pos[1] = cur->translation[1];
	            pos[2] = cur->translation[2];
	          }
	          float rot[4] = {0,0,0,1.0f};
	          if (cur->has_rotation) {
	            rot[0] = cur->rotation[0];
	            rot[1] = cur->rotation[1];
	            rot[2] = cur->rotation[2];
	            rot[3] = cur->rotation[3];
	          }
	          float scl[3] = {1.0f,1.0f,1.0f};
	          if (cur->has_scale) {
	            scl[0] = cur->scale[0];
	            scl[1] = cur->scale[1];
	            scl[2] = cur->scale[2];
	          }
	          mat4_compose(rot, pos, scl, bone->local);
	          mat4_mul(bone->local, node_to_world[elm.parent], jnt_to_world);
	        }
	        memcpy(node_to_world[bone_idx], jnt_to_world, sizeof(mat4));

	        // we want to have the skin to bone matrix so first create the world to skin matrix
	        if (!mat4_try_invert(elm.skin_to_world, child_space)) {
	          fprintf(stderr, "[GLTF]: failed to invert skin to world matrix in skin '%s' at joint '%s'\n", skin_label, bone->name);
	          goto done;
	        }
	        // concatenate the bone to world and the world to skin matrices to one bone to skin matrix
	        mat4 bone_to_skin;
	        mat4_mul(jnt_to_world, child_space, bone_to_skin);
	        if (!mat4_try_invert(bone_to_skin, bone->skin_to_world)) {
	          fprintf(stderr, "[GLTF]: failed to invert the bone to skin matrix in skin '%s' at joint '%s'\n", skin_label, bone->name);
	          goto done;
	        }
	        child_parent = bone_idx;
	      }
	      /* the traversal stack is fixed size: refuse to overflow it */
	      if (cur->children_count > (size_t)(MAX_JOINTS - stk_top)) {
	        fprintf(stderr, "[GLTF]: Node hierarchy too large for traversal stack (max %d)\n", MAX_JOINTS);
	        goto done;
	      }
	      // process all children (reverse to keep correct sequence through stack)
	      for (size_t i = cur->children_count; i > 0; --i) {
	        struct stk_elm *child = &stk[stk_top++];
	        child->node = cur->children[i - 1];
	        child->parent = child_parent;
	        memcpy(child->skin_to_world, child_space, sizeof(mat4));
	      }
	    }
	  }
	  if (bone_cnt == 0) {
	    fprintf(stderr, "[GLTF]: No joints reachable from the root node in skin: '%s'\n", skin_label);
	    goto done;
	  }
	  if (bone_cnt != skin_jnt_cnt) {
	    fprintf(stderr, "[GLTF]: Warning: only %d of %d joints are reachable from the root node\n", bone_cnt, skin_jnt_cnt);
	  }
	  // Output
	  out = fopen(out_path, "w");
	  if (!out) {
	    perror("Failed to open output file");
	    goto done;
	  }
	  fprintf(out, "static const struct bone skl_%s[] = {\n", skeleton_name);
	  for (int i = 0; i < bone_cnt; ++i) {
	    const struct skel_bone *bone = &skeleton[i];
	    fprintf(out, "  { .parent = %d, .hash = 0x%08x, .name = \"%s\", .skin_to_world = {", bone->parent, bone->hash, bone->name);
	    write_mat4_rows(out, bone->skin_to_world);
	    fprintf(out, "    },\n");
	    /* `.pose` needs its own outer brace, and every array element needs a
	     * trailing comma, for the generated initializer to be valid C */
	    fprintf(out, "    .pose = {");
	    write_mat4_rows(out, bone->local);
	    fprintf(out, "    } },\n");
	  }
	  fprintf(out, "};\n");

	  {
	    int write_failed = ferror(out);
	    if (fclose(out) != 0) {
	      write_failed = 1;
	    }
	    out = 0;
	    if (write_failed) {
	      perror("Failed to write output file");
	      goto done;
	    }
	  }
	  /* report before cgltf_free(): the skin name points into `data` */
	  printf("Generated skeleton '%s' with %d bones from skin '%s'\n", skeleton_name, bone_cnt, skin_label);
	  rc = 0;

	done:
	  if (out) {
	    fclose(out);
	  }
	  cgltf_free(data);
	  return rc;
	}
No results found