Skip to content

Instantly share code, notes, and snippets.

@vurtun
Last active November 5, 2025 22:47
Show Gist options
  • Select an option

  • Save vurtun/6a4f284f75e3133586b04b6692dac0fd to your computer and use it in GitHub Desktop.

Select an option

Save vurtun/6a4f284f75e3133586b04b6692dac0fd to your computer and use it in GitHub Desktop.
animation sampling
#include <immintrin.h> // AVX, includes SSE2-SSE4.2
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>
// Capacity limits and layout constants for the runtime animation sampler.
#define MAX_ANIM 1024 // number of animation instance slots held by struct anm_sys
#define MAX_ANM_CLIPS 256 // in the visible code only used as a loop bound in anm_init()
#define BATCH_SIZE 4 // NOTE(review): not referenced in the visible code
#define MAX_JOINTS 256 // per-slot joint capacity of one struct anm_cache array (two slots each)
#define ANIM_TRK_ELM_CNT 7 // output rows per animation: x, y, z, scl, rot_x, rot_y, rot_z
#define align32 __attribute__((aligned(32))) // 32-byte alignment, as needed by aligned AVX loads/stores
// Animation track types. The enumerators index the per-track quantization
// tables anm_clip::off[] and anm_clip::scl[]; ANM_TRK_CNT is their length.
enum anm_trks {
ANM_TRK_POS, // 3 components (x, y, z)
ANM_TRK_ROT, // 1 quaternion (32-bit encoded)
ANM_TRK_SCL, // 1 component (uniform scale)
ANM_TRK_CNT // number of track types (array size, not a track)
};
// One quantized animation clip.
//
// Key data is stored per component in flat arrays that anim_update() indexes
// as key_index * joint_cnt + joint. The frame_to_key_* tables translate a
// frame number into the key index to use for that track.
//
// The pointers themselves need no special alignment (the former align32 on
// each pointer member only padded the struct); it is the *pointed-to* data
// that anim_update() reads with aligned SIMD loads: 16-byte aligned groups of
// 8 unsigned shorts and 32-byte aligned groups of 8 packed rotations.
struct anm_clip {
    int joint_cnt;          // joints per key; the sampler steps through joints 8 at a time
    int frame_cnt;          // total number of frames
    float off[ANM_TRK_CNT]; // dequantization offsets, indexed by enum anm_trks
    float scl[ANM_TRK_CNT]; // dequantization scales, indexed by enum anm_trks
    struct {
        unsigned short const *pos_x; // quantized position x per key/joint
        unsigned short const *pos_y; // quantized position y per key/joint
        unsigned short const *pos_z; // quantized position z per key/joint
        unsigned const *rot_pos;     // packed 32-bit rotations per key/joint (see qdec_avx)
        unsigned short const *scl_s; // quantized uniform scale per key/joint
    } keys;
    struct {
        unsigned short const *frame_to_key_pos; // frame -> position key index
        unsigned short const *frame_to_key_rot; // frame -> rotation key index
        unsigned short const *frame_to_key_scl; // frame -> scale key index
    } blks;
};
// Per-animation cache of dequantized key data for the two frames currently
// being blended. Every array holds two slots; anim_update() addresses slot s
// of joint j as [s * joint_cnt + j], so joint_cnt must not exceed MAX_JOINTS.
struct anm_cache {
align32 float pos_x[2 * MAX_JOINTS];
align32 float pos_y[2 * MAX_JOINTS];
align32 float pos_z[2 * MAX_JOINTS];
align32 float scl_s[2 * MAX_JOINTS];
align32 float rot_x[2 * MAX_JOINTS];
align32 float rot_y[2 * MAX_JOINTS];
align32 float rot_z[2 * MAX_JOINTS];
align32 float rot_w[2 * MAX_JOINTS];
int cached_anim; // clip id compared against the instance's clip; anm_init() writes -1
int cached_frame; // frame whose data sits in slot prev_slot; anim_update() tests it against -1
int cached_next; // frame whose data sits in the other slot (1 - prev_slot)
int prev_slot; // 0 or 1: which slot currently holds cached_frame
};
// State of all animation instances, stored as parallel arrays indexed by the
// handle returned from anm_add(). The 32-byte alignment allows anim_update()
// to process eight instances at a time with aligned AVX loads/stores.
struct anm_sys {
short free_idx_cnt; // number of valid entries in free_idx
short free_idx[MAX_ANIM]; // stack of unused instance handles
align32 int anim[MAX_ANIM]; // clip id per instance (index into anm_clips)
align32 float exact_frame[MAX_ANIM]; // fractional playback position, in frames
align32 int frame[MAX_ANIM]; // written by anim_update(): frame at/before exact_frame
align32 int frame_nxt[MAX_ANIM]; // written by anim_update(): frame blended towards
align32 int frame_cnt[MAX_ANIM]; // frame count copied from the clip in anm_add()
align32 float t_0[MAX_ANIM]; // written by anim_update(): blend weight of `frame` (1 - t)
align32 float t_1[MAX_ANIM]; // written by anim_update(): blend weight of `frame_nxt` (t)
struct anm_cache caches[MAX_ANIM]; // dequantized key cache per instance
};
// File-local instance. The functions visible here take the system as a
// pointer parameter rather than referencing this object directly.
static struct anm_sys anm_sys;
// Identifiers of the built-in clips; each value indexes the anm_clips table.
enum anm_clip_id {
ANM_CLIP_DEFAULT, // the single-frame default clip (anm_def_clip)
ANM_CLIP_CNT, // number of clips (size of anm_clips)
};
// Key data of the default clip (anm_def_clip: 8 joints, 1 frame).
// Positions quantize to 0 and scales to 1; with the clip's offset 0 / scale 1
// factors they dequantize to position 0 and uniform scale 1.
static const align32 unsigned short anm_def_pos_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const align32 unsigned short anm_def_pos_y[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const align32 unsigned short anm_def_pos_z[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// 0xc0000000: top two bits = 3 and all stored components zero, which
// qdec_avx() decodes to x = y = z = 0 with w reconstructed as 1.
static const align32 unsigned anm_def_rot[] = {
0xc0000000, 0xc0000000, 0xc0000000, 0xc0000000,
0xc0000000, 0xc0000000, 0xc0000000, 0xc0000000
};
static const align32 unsigned short anm_def_scl[] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
// Frame -> key index tables: every frame maps to key 0.
static const align32 unsigned short anm_def_frame_to_key_pos[] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
static const align32 unsigned short anm_def_frame_to_key_rot[] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
static const align32 unsigned short anm_def_frame_to_key_scl[] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
// Default clip: 8 joints, a single frame, and pass-through quantization
// (offset 0, scale 1 for every track), wired to the anm_def_* arrays.
static const struct anm_clip anm_def_clip = {
.joint_cnt = 8,
.frame_cnt = 1,
.off = {0.0f, 0.0f, 0.0f},
.scl = {1.0f, 1.0f, 1.0f},
.keys = {
.pos_x = anm_def_pos_x,
.pos_y = anm_def_pos_y,
.pos_z = anm_def_pos_z,
.rot_pos = anm_def_rot,
.scl_s = anm_def_scl,
},
.blks = {
.frame_to_key_pos = anm_def_frame_to_key_pos,
.frame_to_key_rot = anm_def_frame_to_key_rot,
.frame_to_key_scl = anm_def_frame_to_key_scl,
},
};
// Clip table indexed by enum anm_clip_id; anm_add() and anim_update() look
// clips up here.
static const struct anm_clip *anm_clips[ANM_CLIP_CNT] = {
[ANM_CLIP_DEFAULT] = &anm_def_clip,
};
extern void
anm_init(struct anm_sys *ans) {
ans->free_idx_cnt = MAX_ANIM;
for (int i = 0; i < MAX_ANIM; ++i) {
ans->free_idx[i] = MAX_ANIM - i - 1;
}
for (int i = 0; i < MAX_ANM_CLIPS; ++i) {
ans->caches[i].cached_anim = -1;
}
}
extern int
anm_add(struct anm_sys *ans, enum anm_clip_id clip_id) {
const struct anm_clip *clip = anm_clips[clip_id];
int anm = ans->free_idx[--ans->free_idx_cnt];
ans->anim[anm] = clip_id;
ans->frame[anm] = 0;
ans->frame_nxt[anm] = 1;
ans->exact_frame[anm] = 0.0f;
ans->frame_cnt[anm] = clip->frame_cnt;
ans->t_0[anm] = 0.0f;
ans->t_1[anm] = 0.0f;
return anm;
}
extern void
anm_del(struct anm_sys *ans, int anm) {
ans->free_idx[ans->free_idx_cnt++] = anm;
ans->anim[anm] = 0;
ans->frame[anm] = 0;
ans->frame_nxt[anm] = 0;
ans->exact_frame[anm] = 0.0f;
ans->frame_cnt[anm] = 0;
ans->t_0[anm] = 0.0f;
ans->t_1[anm] = 0.0f;
}
// Decodes eight "smallest three" packed quaternions at once.
//
// Layout of each 32-bit word, as read by this function:
//   bits 30..31  index of the omitted component (0 = x, 1 = y, 2 = z, 3 = w)
//   bits 20..29  first stored component  (lowest remaining index)
//   bits 10..19  second stored component
//   bits  0..9   third stored component  (highest remaining index)
// Each 10-bit field is a 9-bit magnitude (bits 0..8) plus a sign bit (bit 9);
// the magnitude maps 0..511 onto 0..0.7071. The omitted component is rebuilt
// as sqrt(max(0, 1 - sum of squares)) and is therefore always non-negative.
//
// qin must point to 8 words, 32-byte aligned. Results are written to
// *qout_x..*qout_w, one lane per input quaternion.
//
// The sign is applied by moving the field's sign bit into the float sign bit
// and XOR-ing it in. (The previous code converted the 0/1 sign flag to
// 0.0f/1.0f and used it as a blendv mask; blendv selects on the most
// significant bit, which is clear for both values, so negative components
// were never produced.)
//
// The target attribute lets this helper compile even when the translation
// unit is not built with -mavx2 -mfma; it must still only run on CPUs that
// support both.
static inline __attribute__((target("avx2,fma"))) void
qdec_avx(__m256 *qout_x, __m256 *qout_y, __m256 *qout_z, __m256 *qout_w, const unsigned *qin) {
    const __m256 half = _mm256_set1_ps(0.707106781f);
    const __m256 inv_msk = _mm256_set1_ps(1.0f / 511.0f);
    const __m256 unit = _mm256_mul_ps(inv_msk, half); // value of one magnitude step
    const __m256 one = _mm256_set1_ps(1.0f);
    const __m256 zero = _mm256_setzero_ps();
    const __m256i mag_mask = _mm256_set1_epi32(511);

    const __m256i q = _mm256_load_si256((const __m256i*)qin);
    const __m256i top = _mm256_srli_epi32(q, 30);

    // The three 10-bit fields, each moved down to bits 0..9.
    const __m256i fld0 = q;
    const __m256i fld1 = _mm256_srli_epi32(q, 10);
    const __m256i fld2 = _mm256_srli_epi32(q, 20);

    // Magnitudes.
    const __m256 abs0 = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(fld0, mag_mask)), unit);
    const __m256 abs1 = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(fld1, mag_mask)), unit);
    const __m256 abs2 = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(fld2, mag_mask)), unit);

    // Sign bit (bit 9 of the field) moved to bit 31, everything else cleared.
    const __m256 sgn0 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_srli_epi32(fld0, 9), 31));
    const __m256 sgn1 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_srli_epi32(fld1, 9), 31));
    const __m256 sgn2 = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_srli_epi32(fld2, 9), 31));

    const __m256 val0 = _mm256_xor_ps(abs0, sgn0);
    const __m256 val1 = _mm256_xor_ps(abs1, sgn1);
    const __m256 val2 = _mm256_xor_ps(abs2, sgn2);

    // Omitted component from the unit-length constraint.
    __m256 sqr = _mm256_fmadd_ps(abs0, abs0, zero);
    sqr = _mm256_fmadd_ps(abs1, abs1, sqr);
    sqr = _mm256_fmadd_ps(abs2, abs2, sqr);
    const __m256 root = _mm256_sqrt_ps(_mm256_max_ps(_mm256_sub_ps(one, sqr), zero));

    const __m256 is_top0 = _mm256_castsi256_ps(_mm256_cmpeq_epi32(top, _mm256_set1_epi32(0)));
    const __m256 is_top1 = _mm256_castsi256_ps(_mm256_cmpeq_epi32(top, _mm256_set1_epi32(1)));
    const __m256 is_top2 = _mm256_castsi256_ps(_mm256_cmpeq_epi32(top, _mm256_set1_epi32(2)));
    const __m256 is_top3 = _mm256_castsi256_ps(_mm256_cmpeq_epi32(top, _mm256_set1_epi32(3)));

    // Route the stored values to the components that were not omitted:
    //   top 0: (root, val2, val1, val0)    top 1: (val2, root, val1, val0)
    //   top 2: (val2, val1, root, val0)    top 3: (val2, val1, val0, root)
    *qout_x = _mm256_blendv_ps(val2, root, is_top0);
    const __m256 y_stored = _mm256_blendv_ps(val1, val2, is_top0);
    *qout_y = _mm256_blendv_ps(y_stored, root, is_top1);
    const __m256 z_stored = _mm256_blendv_ps(val0, val1, _mm256_or_ps(is_top0, is_top1));
    *qout_z = _mm256_blendv_ps(z_stored, root, is_top2);
    *qout_w = _mm256_blendv_ps(val0, root, is_top3);
}
extern void
anim_update(float* out, struct anm_sys *ans, int anm_start, int anm_end) {
{
// Precompute linear coefficients and update frames for all models
__m256 zero = _mm256_setzero_ps();
__m256 one = _mm256_set1_ps(1.0f);
__m256i one_i = _mm256_set1_epi32(1);
__m256i zero_i = _mm256_setzero_si256(); // For lower clamp
for (int i = anm_start; i < anm_end; i += 8) {
__m256i frame_cnt = _mm256_load_si256((__m256i*)&ans->frame_cnt[i]);
__m256 exact = _mm256_load_ps(&ans->exact_frame[i]);
__m256i frame_i = _mm256_castps_si256(exact); // trunc toward zero
__m256i frame_cnt_m1 = _mm256_sub_epi32(frame_cnt, one_i);
__m256 frame_ps = _mm256_cvtepi32_ps(frame_i);
__m256 frame_cnt_m1_ps = _mm256_cvtepi32_ps(frame_cnt_m1);
__m256 gt_mask = _mm256_cmp_ps(frame_ps, frame_cnt_m1_ps, _CMP_GT_OQ);
__m256i clamp_mask_i = _mm256_castps_si256(gt_mask);
frame_i = _mm256_blendv_epi8(frame_i, frame_cnt_m1, clamp_mask_i);
__m256 lt_mask = _mm256_cmp_ps(frame_ps, zero, _CMP_LT_OQ); // < 0?
__m256i lt_mask_i = _mm256_castps_si256(lt_mask);
frame_i = _mm256_blendv_epi8(frame_i, zero_i, lt_mask_i);
__m256 t = _mm256_sub_ps(exact, _mm256_cvtepi32_ps(frame_i));
__m256i frame_1_tst = _mm256_add_epi32(frame_i, one_i);
__m256 tst_ps = _mm256_cvtepi32_ps(frame_1_tst);
__m256 cnt_ps = _mm256_cvtepi32_ps(frame_cnt);
__m256 ge_mask = _mm256_cmp_ps(tst_ps, cnt_ps, _CMP_GE_OQ);
__m256i sel_mask_i = _mm256_castps_si256(ge_mask);
__m256i frame_1 = _mm256_blendv_epi8(frame_1_tst, frame_i, sel_mask_i);
__m256 t_clamped = _mm256_min_ps(_mm256_max_ps(t, zero), one);
// Compute linear basis functions
__m256 h00 = _mm256_sub_ps(one, t_clamped);
__m256 h01 = t_clamped;
// Store
_mm256_store_ps(&ans->t_0[i], h00);
_mm256_store_ps(&ans->t_1[i], h01);
_mm256_store_si256((__m256i*)&ans->frame_nxt[i], frame_1);
_mm256_store_si256((__m256i*)&ans->frame[i], frame_i);
}
}
for (int m = anm_start; m < anm_end; ++m) {
int anm_id = ans->anim[m];
const struct anm_clip *clp = anm_clips[anm_id];
int frame = ans->frame[m];
int frame_next = ans->frame_nxt[m];
int key_idx_pos[2] = {clp->blks.frame_to_key_pos[frame], clp->blks.frame_to_key_pos[frame_next]};
int key_idx_rot[2] = {clp->blks.frame_to_key_rot[frame], clp->blks.frame_to_key_rot[frame_next]};
int key_idx_scl[2] = {clp->blks.frame_to_key_scl[frame], clp->blks.frame_to_key_scl[frame_next]};
const __m256 pos_off = _mm256_set1_ps(clp->off[ANM_TRK_POS]);
const __m256 rot_off = _mm256_set1_ps(clp->off[ANM_TRK_ROT]);
const __m256 scl_off = _mm256_set1_ps(clp->off[ANM_TRK_SCL]);
const __m256 pos_scl = _mm256_set1_ps(clp->scl[ANM_TRK_POS]);
const __m256 rot_scl = _mm256_set1_ps(clp->scl[ANM_TRK_ROT]);
const __m256 scl_scl = _mm256_set1_ps(clp->scl[ANM_TRK_SCL]);
__m256 h00 = _mm256_set1_ps(ans->t_0[m]);
__m256 h01 = _mm256_set1_ps(ans->t_1[m]);
const int stride = clp->joint_cnt;
float *out_ptr = out + m * ANIM_TRK_ELM_CNT * stride;
int anm_mod = ans->caches[m].cached_anim == anm_id;
int full_hit = (ans->caches[m].cached_frame == frame && ans->caches[m].cached_next == frame_next);
int advance = !full_hit && (ans->caches[m].cached_frame != -1) && (frame == ans->caches[m].cached_next);
if (anm_mod || !full_hit) {
if (!anm_mod && advance) {
// Double-buffer toggle: Swap slots (new prev = old next; new next = old prev)
ans->caches[m].prev_slot = 1 - ans->caches[m].prev_slot;
ans->caches[m].cached_frame = frame;
ans->caches[m].cached_next = frame_next;
// Dequant only new frame_next to new next_slot
int next_slot = 1 - ans->caches[m].prev_slot; // Now the free slot (old prev)
int base_pos1 = key_idx_pos[1] * stride;
int base_scl1 = key_idx_scl[1] * stride;
int base_rot1 = key_idx_rot[1] * stride;
for (int jj = 0; jj < clp->joint_cnt; jj += 8) {
int j1 = jj; // Batch offset
// Position frame 1 (new next)
__m128i shorts1_x = _mm_load_si128((__m128i*)(clp->keys.pos_x + base_pos1 + j1));
__m256i mag1_x = _mm256_cvtepu16_epi32(shorts1_x);
__m256 pos1_x = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag1_x), pos_scl), pos_off);
_mm256_store_ps(&ans->caches[m].pos_x[next_slot * stride + jj], pos1_x);
__m128i shorts1_y = _mm_load_si128((__m128i*)(clp->keys.pos_y + base_pos1 + j1));
__m256i mag1_y = _mm256_cvtepu16_epi32(shorts1_y);
__m256 pos1_y = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag1_y), pos_scl), pos_off);
_mm256_store_ps(&ans->caches[m].pos_y[next_slot * stride + jj], pos1_y);
__m128i shorts1_z = _mm_load_si128((__m128i*)(clp->keys.pos_z + base_pos1 + j1));
__m256i mag1_z = _mm256_cvtepu16_epi32(shorts1_z);
__m256 pos1_z = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag1_z), pos_scl), pos_off);
_mm256_store_ps(&ans->caches[m].pos_z[next_slot * stride + jj], pos1_z);
// Scale frame 1 (new next)
__m128i shorts_s1 = _mm_load_si128((__m128i*)(clp->keys.scl_s + base_scl1 + j1));
__m256i mag_s1 = _mm256_cvtepu16_epi32(shorts_s1);
__m256 scl1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag_s1), scl_scl), scl_off);
_mm256_store_ps(&ans->caches[m].scl_s[next_slot * stride + jj], scl1);
// Rotation frame 1 (new next)
align32 unsigned rot1_data[8];
__m256i rot1_packed = _mm256_load_si256((__m256i*)(clp->keys.rot_pos + base_rot1 + j1));
_mm256_store_si256((__m256i*)rot1_data, rot1_packed);
__m256 rot1_x_raw, rot1_y_raw, rot1_z_raw, rot1_w_raw;
qdec_avx(&rot1_x_raw, &rot1_y_raw, &rot1_z_raw, &rot1_w_raw, rot1_data);
__m256 rot1_x = _mm256_add_ps(_mm256_mul_ps(rot1_x_raw, rot_scl), rot_off);
_mm256_store_ps(&ans->caches[m].rot_x[next_slot * stride + jj], rot1_x);
__m256 rot1_y = _mm256_add_ps(_mm256_mul_ps(rot1_y_raw, rot_scl), rot_off);
_mm256_store_ps(&ans->caches[m].rot_y[next_slot * stride + jj], rot1_y);
__m256 rot1_z = _mm256_add_ps(_mm256_mul_ps(rot1_z_raw, rot_scl), rot_off);
_mm256_store_ps(&ans->caches[m].rot_z[next_slot * stride + jj], rot1_z);
__m256 rot1_w = _mm256_add_ps(_mm256_mul_ps(rot1_w_raw, rot_scl), rot_off);
_mm256_store_ps(&ans->caches[m].rot_w[next_slot * stride + jj], rot1_w);
}
} else {
// Full miss/seek: Dequant both to fixed slots 0 (frame) / 1 (frame_next)
ans->caches[m].prev_slot = 0;
ans->caches[m].cached_frame = frame;
ans->caches[m].cached_next = frame_next;
// Dequant full pair to cache (slot 0: frame, slot 1: frame_next)
for (int jj = 0; jj < clp->joint_cnt; jj += 8) {
int base_pos0 = key_idx_pos[0] * stride + jj;
int base_pos1 = key_idx_pos[1] * stride + jj;
int base_scl0 = key_idx_scl[0] * stride + jj;
int base_scl1 = key_idx_scl[1] * stride + jj;
int base_rot0 = key_idx_rot[0] * stride + jj;
int base_rot1 = key_idx_rot[1] * stride + jj;
// Position frame 0 (slot 0)
__m128i shorts0_x = _mm_load_si128((__m128i*)(clp->keys.pos_x + base_pos0));
__m256i mag0_x = _mm256_cvtepu16_epi32(shorts0_x);
__m256 pos0_x = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag0_x), pos_scl), pos_off);
_mm256_store_ps(&ans->caches[m].pos_x[0 * stride + jj], pos0_x);
__m128i shorts0_y = _mm_load_si128((__m128i*)(clp->keys.pos_y + base_pos0));
__m256i mag0_y = _mm256_cvtepu16_epi32(shorts0_y);
__m256 pos0_y = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag0_y), pos_scl), pos_off);
_mm256_store_ps(&ans->caches[m].pos_y[0 * stride + jj], pos0_y);
__m128i shorts0_z = _mm_load_si128((__m128i*)(clp->keys.pos_z + base_pos0));
__m256i mag0_z = _mm256_cvtepu16_epi32(shorts0_z);
__m256 pos0_z = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag0_z), pos_scl), pos_off);
_mm256_store_ps(&ans->caches[m].pos_z[0 * stride + jj], pos0_z);
// Position frame 1 (slot 1)
__m128i shorts1_x = _mm_load_si128((__m128i*)(clp->keys.pos_x + base_pos1));
__m256i mag1_x = _mm256_cvtepu16_epi32(shorts1_x);
__m256 pos1_x = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag1_x), pos_scl), pos_off);
_mm256_store_ps(&ans->caches[m].pos_x[1 * stride + jj], pos1_x);
__m128i shorts1_y = _mm_load_si128((__m128i*)(clp->keys.pos_y + base_pos1));
__m256i mag1_y = _mm256_cvtepu16_epi32(shorts1_y);
__m256 pos1_y = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag1_y), pos_scl), pos_off);
_mm256_store_ps(&ans->caches[m].pos_y[1 * stride + jj], pos1_y);
__m128i shorts1_z = _mm_load_si128((__m128i*)(clp->keys.pos_z + base_pos1));
__m256i mag1_z = _mm256_cvtepu16_epi32(shorts1_z);
__m256 pos1_z = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag1_z), pos_scl), pos_off);
_mm256_store_ps(&ans->caches[m].pos_z[1 * stride + jj], pos1_z);
// Scale frame 0 (slot 0)
__m128i shorts_s0 = _mm_load_si128((__m128i*)(clp->keys.scl_s + base_scl0));
__m256i mag_s0 = _mm256_cvtepu16_epi32(shorts_s0);
__m256 scl0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag_s0), scl_scl), scl_off);
_mm256_store_ps(&ans->caches[m].scl_s[0 * stride + jj], scl0);
// Scale frame 1 (slot 1)
__m128i shorts_s1 = _mm_load_si128((__m128i*)(clp->keys.scl_s + base_scl1));
__m256i mag_s1 = _mm256_cvtepu16_epi32(shorts_s1);
__m256 scl1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(mag_s1), scl_scl), scl_off);
_mm256_store_ps(&ans->caches[m].scl_s[1 * stride + jj], scl1);
// Rotation frame 0 (slot 0)
align32 unsigned rot0_data[8];
__m256i rot0_packed = _mm256_load_si256((__m256i*)(clp->keys.rot_pos + base_rot0));
_mm256_store_si256((__m256i*)rot0_data, rot0_packed);
__m256 rot0_x_raw, rot0_y_raw, rot0_z_raw, rot0_w_raw;
qdec_avx(&rot0_x_raw, &rot0_y_raw, &rot0_z_raw, &rot0_w_raw, rot0_data);
__m256 rot0_x = _mm256_add_ps(_mm256_mul_ps(rot0_x_raw, rot_scl), rot_off);
__m256 rot0_y = _mm256_add_ps(_mm256_mul_ps(rot0_y_raw, rot_scl), rot_off);
__m256 rot0_z = _mm256_add_ps(_mm256_mul_ps(rot0_z_raw, rot_scl), rot_off);
__m256 rot0_w = _mm256_add_ps(_mm256_mul_ps(rot0_w_raw, rot_scl), rot_off);
_mm256_store_ps(&ans->caches[m].rot_x[0 * stride + jj], rot0_x);
_mm256_store_ps(&ans->caches[m].rot_y[0 * stride + jj], rot0_y);
_mm256_store_ps(&ans->caches[m].rot_z[0 * stride + jj], rot0_z);
_mm256_store_ps(&ans->caches[m].rot_w[0 * stride + jj], rot0_w);
// Rotation frame 1 (slot 1)
align32 unsigned rot1_data[8];
__m256i rot1_packed = _mm256_load_si256((__m256i*)(clp->keys.rot_pos + base_rot1));
_mm256_store_si256((__m256i*)rot1_data, rot1_packed);
__m256 rot1_x_raw, rot1_y_raw, rot1_z_raw, rot1_w_raw;
qdec_avx(&rot1_x_raw, &rot1_y_raw, &rot1_z_raw, &rot1_w_raw, rot1_data);
__m256 rot1_x = _mm256_add_ps(_mm256_mul_ps(rot1_x_raw, rot_scl), rot_off);
__m256 rot1_y = _mm256_add_ps(_mm256_mul_ps(rot1_y_raw, rot_scl), rot_off);
__m256 rot1_z = _mm256_add_ps(_mm256_mul_ps(rot1_z_raw, rot_scl), rot_off);
__m256 rot1_w = _mm256_add_ps(_mm256_mul_ps(rot1_w_raw, rot_scl), rot_off);
_mm256_store_ps(&ans->caches[m].rot_x[1 * stride + jj], rot1_x);
_mm256_store_ps(&ans->caches[m].rot_y[1 * stride + jj], rot1_y);
_mm256_store_ps(&ans->caches[m].rot_z[1 * stride + jj], rot1_z);
_mm256_store_ps(&ans->caches[m].rot_w[1 * stride + jj], rot1_w);
}
}
}
int next_slot = 1 - ans->caches[m].prev_slot;
int prev_idx_base = ans->caches[m].prev_slot * stride;
int next_idx_base = next_slot * stride;
float t = ans->exact_frame[m] - ans->frame[m];
// Joint loop: Use dynamic indices for loads
for (int j = 0; j < clp->joint_cnt; j += 8) {
int prev_idx = prev_idx_base + j;
int next_idx = next_idx_base + j;
// Position
__m256 pos0_x = _mm256_load_ps(&ans->caches[m].pos_x[prev_idx]);
__m256 pos1_x = _mm256_load_ps(&ans->caches[m].pos_x[next_idx]);
__m256 pos0_y = _mm256_load_ps(&ans->caches[m].pos_y[prev_idx]);
__m256 pos1_y = _mm256_load_ps(&ans->caches[m].pos_y[next_idx]);
__m256 pos0_z = _mm256_load_ps(&ans->caches[m].pos_z[prev_idx]);
__m256 pos1_z = _mm256_load_ps(&ans->caches[m].pos_z[next_idx]);
// Scale
__m256 scl0 = _mm256_load_ps(&ans->caches[m].scl_s[prev_idx]);
__m256 scl1 = _mm256_load_ps(&ans->caches[m].scl_s[next_idx]);
// Rotation
__m256 rot0_x = _mm256_load_ps(&ans->caches[m].rot_x[prev_idx]);
__m256 rot1_x = _mm256_load_ps(&ans->caches[m].rot_x[next_idx]);
__m256 rot0_y = _mm256_load_ps(&ans->caches[m].rot_y[prev_idx]);
__m256 rot1_y = _mm256_load_ps(&ans->caches[m].rot_y[next_idx]);
__m256 rot0_z = _mm256_load_ps(&ans->caches[m].rot_z[prev_idx]);
__m256 rot1_z = _mm256_load_ps(&ans->caches[m].rot_z[next_idx]);
__m256 rot0_w = _mm256_load_ps(&ans->caches[m].rot_w[prev_idx]);
__m256 rot1_w = _mm256_load_ps(&ans->caches[m].rot_w[next_idx]);
// Linear interpolation for position and scale
__m256 res_x = _mm256_fmadd_ps(h01, pos1_x, _mm256_mul_ps(h00, pos0_x));
__m256 res_y = _mm256_fmadd_ps(h01, pos1_y, _mm256_mul_ps(h00, pos0_y));
__m256 res_z = _mm256_fmadd_ps(h01, pos1_z, _mm256_mul_ps(h00, pos0_z));
__m256 res_scl = _mm256_fmadd_ps(h01, scl1, _mm256_mul_ps(h00, scl0));
// ONLERP for rotation
__m256 zero = _mm256_setzero_ps();
__m256 one = _mm256_set1_ps(1.0f);
__m256 tt = _mm256_set1_ps(t);
__m256 ca = _mm256_add_ps(
_mm256_add_ps(_mm256_mul_ps(rot0_x, rot1_x), _mm256_mul_ps(rot0_y, rot1_y)),
_mm256_add_ps(_mm256_mul_ps(rot0_z, rot1_z), _mm256_mul_ps(rot0_w, rot1_w)));
__m256 mask_neg = _mm256_cmp_ps(ca, zero, _CMP_LT_OS);
__m256 fabs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
__m256 d = _mm256_and_ps(ca, fabs_mask);
__m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
rot1_x = _mm256_xor_ps(rot1_x, _mm256_and_ps(sign_mask, _mm256_castps_si256(mask_neg)));
rot1_y = _mm256_xor_ps(rot1_y, _mm256_and_ps(sign_mask, _mm256_castps_si256(mask_neg)));
rot1_z = _mm256_xor_ps(rot1_z, _mm256_and_ps(sign_mask, _mm256_castps_si256(mask_neg)));
rot1_w = _mm256_xor_ps(rot1_w, _mm256_and_ps(sign_mask, _mm256_castps_si256(mask_neg)));
// Compute A = 1.0904 + d * (-3.2452 + d * (3.55645 - d * 1.43519))
__m256 tmp_a1 = _mm256_fmadd_ps(d, _mm256_set1_ps(-1.43519f), _mm256_set1_ps(3.55645f));
__m256 tmp_a2 = _mm256_fmadd_ps(d, tmp_a1, _mm256_set1_ps(-3.2452f));
__m256 A = _mm256_fmadd_ps(d, tmp_a2, _mm256_set1_ps(1.0904f));
// Compute B = 0.848013 + d * (-1.06021 + d * 0.215638)
__m256 tmp_b1 = _mm256_fmadd_ps(d, _mm256_set1_ps(0.215638f), _mm256_set1_ps(-1.06021f));
__m256 B = _mm256_fmadd_ps(d, tmp_b1, _mm256_set1_ps(0.848013f));
__m256 dt05 = _mm256_sub_ps(tt, _mm256_set1_ps(0.5f));
__m256 dt05_sq = _mm256_mul_ps(dt05, dt05);
__m256 k = _mm256_fmadd_ps(A, dt05_sq, B);
__m256 t_m05 = _mm256_sub_ps(tt, _mm256_set1_ps(0.5f));
__m256 t_m1 = _mm256_sub_ps(tt, one);
__m256 term = _mm256_mul_ps(tt, _mm256_mul_ps(t_m05, t_m1));
__m256 ot = _mm256_fmadd_ps(k, term, tt);
__m256 lt = _mm256_sub_ps(one, ot);
__m256 res_rot_x = _mm256_fmadd_ps(ot, rot1_x, _mm256_mul_ps(lt, rot0_x));
__m256 res_rot_y = _mm256_fmadd_ps(ot, rot1_y, _mm256_mul_ps(lt, rot0_y));
__m256 res_rot_z = _mm256_fmadd_ps(ot, rot1_z, _mm256_mul_ps(lt, rot0_z));
__m256 res_rot_w = _mm256_fmadd_ps(ot, rot1_w, _mm256_mul_ps(lt, rot0_w));
// Normalize
__m256 un = _mm256_fmadd_ps(res_rot_w, res_rot_w,
_mm256_fmadd_ps(res_rot_z, res_rot_z,
_mm256_fmadd_ps(res_rot_y, res_rot_y,
_mm256_mul_ps(res_rot_x, res_rot_x))));
__m256 us0 = _mm256_rsqrt_ps(un);
__m256 us1 = _mm256_mul_ps(_mm256_mul_ps(_mm256_set1_ps(0.5f), us0),
_mm256_sub_ps(_mm256_set1_ps(3.0f),
_mm256_mul_ps(_mm256_mul_ps(us0, us0), un)));
res_rot_x = _mm256_mul_ps(res_rot_x, us1);
res_rot_y = _mm256_mul_ps(res_rot_y, us1);
res_rot_z = _mm256_mul_ps(res_rot_z, us1);
res_rot_w = _mm256_mul_ps(res_rot_w, us1); // FIX: Scale w component too
// Stream
_mm256_stream_ps(out_ptr + 0 * stride + j, res_x);
_mm256_stream_ps(out_ptr + 1 * stride + j, res_y);
_mm256_stream_ps(out_ptr + 2 * stride + j, res_z);
_mm256_stream_ps(out_ptr + 3 * stride + j, res_scl);
_mm256_stream_ps(out_ptr + 4 * stride + j, res_rot_x);
_mm256_stream_ps(out_ptr + 5 * stride + j, res_rot_y);
_mm256_stream_ps(out_ptr + 6 * stride + j, res_rot_z);
}
}
_mm_sfence(); // Ensure all streaming stores complete
}
#include "cgltf.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <assert.h>
#include <float.h>
#include <stdbool.h>
#include <limits.h>
#include <stdint.h>
// Capacity limits for the converter's statically sized work buffers.
#define MAX_JOINTS 256 // joints per clip; sizes the per-joint/per-frame sample buffers
#define MAX_ALL_JOINTS 1024 // sizes g_parents and g_globals
#define MAX_FRAMES (16*1024) // frames per clip; sizes every per-frame table
#define DIM_MAX 4 // NOTE(review): not referenced in the visible code
#define MAT4_SIZE 16 // floats in one 4x4 matrix
// Fixed-size bitset with one bit per frame.
typedef struct {
uint64_t bits[MAX_FRAMES/64]; // 256 words * 64 bits = 16384 bits = MAX_FRAMES
} bitset_t;
// A pair of integer bounds. NOTE(review): presumably a frame range (used as
// the element type of g_stack); whether `end` is inclusive is not visible here.
typedef struct {
int start;
int end;
} interval_t;
// Translation / rotation / scale triple. NOTE(review): by the field names,
// (rx, ry, rz, rw) is a quaternion; not referenced in the visible code.
typedef struct {
float tx, ty, tz;
float rx, ry, rz, rw;
float sx, sy, sz;
} transform_t;
// 4x4 float matrix as a flat array of 16 floats. mat4_from_trs() and
// mat4_decompose() place the translation in elements 12..14 and each basis
// vector in elements 0..2, 4..6 and 8..10.
typedef float mat4[16];
// Statically allocated work buffers of the converter. Their users lie
// outside the visible part of the file, so the role notes below are inferred
// from names and sizes only - TODO confirm against the code that fills them.
// Each MAX_JOINTS * MAX_FRAMES float array is 256 * 16384 * 4 bytes = 16 MiB.

// Per joint and frame sample channels (presumably the resampled tracks).
static float g_pos_x[MAX_JOINTS * MAX_FRAMES];
static float g_pos_y[MAX_JOINTS * MAX_FRAMES];
static float g_pos_z[MAX_JOINTS * MAX_FRAMES];
static float g_rot_x[MAX_JOINTS * MAX_FRAMES];
static float g_rot_y[MAX_JOINTS * MAX_FRAMES];
static float g_rot_z[MAX_JOINTS * MAX_FRAMES];
static float g_rot_w[MAX_JOINTS * MAX_FRAMES];
static float g_scl_s[MAX_JOINTS * MAX_FRAMES];
// One bit per frame and track (presumably "frame is kept as a key").
static bitset_t g_used_pos_bits;
static bitset_t g_used_rot_bits;
static bitset_t g_used_scl_bits;
// Per-track frame lists and frame -> key index tables; the latter have the
// same element type as the anm_clip frame_to_key_* tables.
static int g_key_pos_list[MAX_FRAMES];
static int g_key_rot_list[MAX_FRAMES];
static int g_key_scl_list[MAX_FRAMES];
static unsigned short g_frame_to_key_pos[MAX_FRAMES];
static unsigned short g_frame_to_key_rot[MAX_FRAMES];
static unsigned short g_frame_to_key_scl[MAX_FRAMES];
// Per-frame scratch space (3, 4 and 1 floats per frame respectively).
static int g_kept[MAX_FRAMES];
static float g_tmp_pos[3 * MAX_FRAMES];
static float g_tmp_rot[4 * MAX_FRAMES];
static float g_tmp_scl[MAX_FRAMES];
// Quantized key output; element types match the anm_clip key arrays.
static unsigned short g_keys_pos_x[MAX_FRAMES * MAX_JOINTS];
static unsigned short g_keys_pos_y[MAX_FRAMES * MAX_JOINTS];
static unsigned short g_keys_pos_z[MAX_FRAMES * MAX_JOINTS];
static unsigned short g_keys_scl[MAX_FRAMES * MAX_JOINTS];
static unsigned g_keys_rot[MAX_FRAMES * MAX_JOINTS];
// Skeleton data: one int per joint, and one 4x4 matrix per node.
static int g_bone_indices[MAX_JOINTS];
static int g_parents[MAX_ALL_JOINTS];
static float g_globals[MAX_ALL_JOINTS * MAT4_SIZE];
// Explicit stack of intervals with room for one entry per frame.
static interval_t g_stack[MAX_FRAMES];
// 32-bit FNV-1a hash of a NUL-terminated string: starting from the offset
// basis, each byte is XOR-ed into the hash, which is then multiplied by the
// FNV prime (modulo 2^32). The terminator is not hashed.
static uint32_t
fnv1a_32(const char *str) {
    uint32_t h = 0x811c9dc5u;
    for (const unsigned char *p = (const unsigned char *)str; *p != '\0'; ++p) {
        h = (h ^ *p) * 0x01000193u;
    }
    return h;
}
// Writes the 4x4 identity matrix into m (16 floats).
//
// The parameter is declared as a restrict-qualified pointer: `mat4 restrict`
// would qualify the float elements of the array type, which C does not allow.
static void
mat4_set_identity(float *restrict m) {
    for (int i = 0; i < 16; ++i) {
        // Diagonal elements sit at indices 0, 5, 10 and 15.
        m[i] = (i % 5 == 0) ? 1.0f : 0.0f;
    }
}
// Multiplies two 4x4 matrices: out[i*4 + j] = sum over k of
// a[i*4 + k] * b[k*4 + j].
//
// Read as row-major arrays this is out = a * b; for the column-major layout
// produced by mat4_from_trs() the same arithmetic is out = b * a.
// `out` must not overlap `a` or `b`.
//
// The parameters are restrict-qualified pointers; `mat4 restrict` would
// qualify the float elements of the array type, which C does not allow.
static void
mat4_mul(const float *restrict a, const float *restrict b, float *restrict out) {
    for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
            float acc = 0.0f;
            for (int k = 0; k < 4; ++k) {
                acc += a[i * 4 + k] * b[k * 4 + j];
            }
            out[i * 4 + j] = acc;
        }
    }
}
// Inverts an affine 4x4 matrix whose 3x3 part occupies elements
// 0..2 / 4..6 / 8..10 and whose translation occupies elements 12..14.
// Elements 3, 7, 11 and 15 of the input are not read; the result always has
// 0, 0, 0, 1 there.
//
// If the 3x3 determinant is (nearly) zero, `out` is set to the identity
// matrix as a fallback. `out` must not overlap `m`.
//
// The parameters are restrict-qualified pointers; `mat4 restrict` would
// qualify the float elements of the array type, which C does not allow.
static void
mat4_invert(const float *restrict m, float *restrict out) {
    const float det = m[0] * (m[5] * m[10] - m[9] * m[6]) -
                      m[4] * (m[1] * m[10] - m[9] * m[2]) +
                      m[8] * (m[1] * m[6] - m[5] * m[2]);
    if (fabsf(det) < 1e-8f) {
        mat4_set_identity(out); // fallback for singular input
        return;
    }
    const float idet = 1.0f / det;
    // Inverse of the 3x3 part: adjugate scaled by 1/det.
    out[0] = (m[5] * m[10] - m[6] * m[9]) * idet;
    out[1] = (m[9] * m[2] - m[1] * m[10]) * idet;
    out[2] = (m[1] * m[6] - m[5] * m[2]) * idet;
    out[4] = (m[6] * m[8] - m[4] * m[10]) * idet;
    out[5] = (m[0] * m[10] - m[8] * m[2]) * idet;
    out[6] = (m[4] * m[2] - m[0] * m[6]) * idet;
    out[8] = (m[4] * m[9] - m[5] * m[8]) * idet;
    out[9] = (m[8] * m[1] - m[0] * m[9]) * idet;
    out[10] = (m[0] * m[5] - m[4] * m[1]) * idet;
    out[3] = out[7] = out[11] = 0.0f;
    out[15] = 1.0f;
    // Inverse translation: -(inverse 3x3) * t.
    out[12] = -(out[0] * m[12] + out[4] * m[13] + out[8] * m[14]);
    out[13] = -(out[1] * m[12] + out[5] * m[13] + out[9] * m[14]);
    out[14] = -(out[2] * m[12] + out[6] * m[13] + out[10] * m[14]);
}
// Builds a 4x4 matrix from a translation, a rotation quaternion and a
// per-axis scale.
//
//   pos: 3 floats (x, y, z)
//   rot: 4 floats (x, y, z, w), expected to be unit length
//   scl: 3 floats (x, y, z)
//   out: 16 floats; the scaled rotation axes go to elements 0..2, 4..6 and
//        8..10, the translation to 12..14, and 0, 0, 0, 1 to 3, 7, 11, 15.
//
// The parameters are restrict-qualified pointers; the former
// `const float restrict pos[3]` qualified the float element type, which C
// does not allow.
static void
mat4_from_trs(const float *restrict pos, const float *restrict rot,
              const float *restrict scl, float *restrict out) {
    const float xx = rot[0] * rot[0];
    const float xy = rot[0] * rot[1];
    const float xz = rot[0] * rot[2];
    const float xw = rot[0] * rot[3];
    const float yy = rot[1] * rot[1];
    const float yz = rot[1] * rot[2];
    const float yw = rot[1] * rot[3];
    const float zz = rot[2] * rot[2];
    const float zw = rot[2] * rot[3];
    // Rotation matrix entries rmRC (row R, column C).
    const float rm00 = 1.0f - 2.0f * (yy + zz);
    const float rm01 = 2.0f * (xy - zw);
    const float rm02 = 2.0f * (xz + yw);
    const float rm10 = 2.0f * (xy + zw);
    const float rm11 = 1.0f - 2.0f * (xx + zz);
    const float rm12 = 2.0f * (yz - xw);
    const float rm20 = 2.0f * (xz - yw);
    const float rm21 = 2.0f * (yz + xw);
    const float rm22 = 1.0f - 2.0f * (xx + yy);
    // Column c of the rotation, scaled by scl[c].
    out[0] = rm00 * scl[0];
    out[1] = rm10 * scl[0];
    out[2] = rm20 * scl[0];
    out[4] = rm01 * scl[1];
    out[5] = rm11 * scl[1];
    out[6] = rm21 * scl[1];
    out[8] = rm02 * scl[2];
    out[9] = rm12 * scl[2];
    out[10] = rm22 * scl[2];
    out[3] = out[7] = out[11] = 0.0f;
    out[12] = pos[0];
    out[13] = pos[1];
    out[14] = pos[2];
    out[15] = 1.0f;
}
static void
mat4_decompose(const mat4 restrict m, float restrict pos[3],
float restrict rot[4], float restrict scl[3]) {
pos[0] = m[12];
pos[1] = m[13];
pos[2] = m[14];
scl[0] = sqrtf(m[0] * m[0] + m[1] * m[1] + m[2] * m[2]);
scl[1] = sqrtf(m[4] * m[4] + m[5] * m[5] + m[6] * m[6]);
scl[2] = sqrtf(m[8] * m[8] + m[9] * m[9] + m[10] * m[10]);
if (fabsf(scl[0]) < 1e-8f) {
scl[0] = 1.0f;
}
if (fabsf(scl[1]) < 1e-8f) {
scl[1] = 1.0f;
}
if (fabsf(scl[2]) < 1e-8f) {
scl[2] = 1.0f;
}
float r[3][3];
r[0][0] = m[0] / scl[0]; r[0][1] = m[4] / scl[1]; r[0][2] = m[8] / scl[2];
r[1][0] = m[1] / scl[0]; r[1][1] = m[5] / scl[1]; r[1][2] = m[9] / scl[2];
r[2][0] = m[2] / scl[0]; r[2][1] = m[6] / scl[1]; r[2][2] = m[10] / scl[2];
float trace = r[0][0] + r[1][1] + r[2][2];
if (trace > 0.0f) {
float s = 0.5f / sqrtf(trace + 1.0f);
rot[3] = 0.5f / s;
rot[0] = (r[2][1] - r[1][2]) * s;
rot[1] = (r[0][2] - r[2][0]) * s;
rot[2] = (r[1][0] - r[0][1]) * s;
} else if (r[0][0] > r[1][1] && r[0][0] > r[2][2]) {
float s = 0.5f / sqrtf(1.0f + r[0][0] - r[1][1] - r[2][2]);
rot[0] = 0.5f / s;
rot[1] = (r[1][0] + r[0][1]) * s;
rot[2] = (r[0][2] + r[2][0]) * s;
rot[3] = (r[2][1] - r[1][2]) * s;
} else if (r[1][1] > r[2][2]) {
float s = 0.5f / sqrtf(1.0f + r[1][1] - r[0][0] - r[2][2]);
rot[0] = (r[1][0] + r[0][1]) * s;
rot[1] = 0.5f / s;
rot[2] = (r[1][2] + r[2][1]) * s;
rot[3] = (r[0][2] - r[2][0]) * s;
} else {
float s = 0.5f / sqrtf(1.0f + r[2][2] - r[0][0] - r[1][1]);
rot[0] = (r[0][2] + r[2][0]) * s;
rot[1] = (r[1][2] + r[2][1]) * s;
rot[2] = 0.5f / s;
rot[3] = (r[1][0] - r[0][1]) * s;
}
float len = sqrtf(rot[0] * rot[0] + rot[1] * rot[1] + rot[2] * rot[2] + rot[3] * rot[3]);
if (len > 1e-8f) {
float inv_len = 1.0f / len;
rot[0] *= inv_len;
rot[1] *= inv_len;
rot[2] *= inv_len;
rot[3] *= inv_len;
}
}
static void
slerp(const float restrict *q0, const float restrict *q1,
float t, float restrict *qout) {
float q1_flip[4] = { q1[0], q1[1], q1[2], q1[3] };
float dot = q0[0] * q1[0] + q0[1] * q1[1] + q0[2] * q1[2] + q0[3] * q1[3];
if (dot < 0.0f) {
q1_flip[0] = -q1[0];
q1_flip[1] = -q1[1];
q1_flip[2] = -q1[2];
q1_flip[3] = -q1[3];
dot = -dot;
}
if (dot > 0.9995f) {
qout[0] = q0[0] + t * (q1_flip[0] - q0[0]);
qout[1] = q0[1] + t * (q1_flip[1] - q0[1]);
qout[2] = q0[2] + t * (q1_flip[2] - q0[2]);
qout[3] = q0[3] + t * (q1_flip[3] - q0[3]);
} else {
float omega = acosf(dot);
float so = sinf(omega);
float a = sinf(omega * (1.0f - t)) / so;
float b = sinf(omega * t) / so;
qout[0] = a * q0[0] + b * q1_flip[0];
qout[1] = a * q0[1] + b * q1_flip[1];
qout[2] = a * q0[2] + b * q1_flip[2];
qout[3] = a * q0[3] + b * q1_flip[3];
}
float norm = sqrtf(qout[0] * qout[0] + qout[1] * qout[1] + qout[2] * qout[2] + qout[3] * qout[3]);
if (norm > 0.0f) {
float inv_norm = 1.0f / norm;
qout[0] *= inv_norm;
qout[1] *= inv_norm;
qout[2] *= inv_norm;
qout[3] *= inv_norm;
}
}
// Samples the translation or scale channel of `anim` that targets `node` at
// `time` and writes the three components to *x, *y, *z.
//
// - If several channels match, a warning is printed and the last one wins.
// - Without a matching channel, or when the channel has no keys, the identity
//   value is returned: (0, 0, 0) for translation and (1, 1, 1) for scale
//   (the scale default used to be (0, 0, 1)).
// - `time` is clamped to the channel's first/last key time; the bracketing
//   keys are found by binary search and blended linearly, regardless of any
//   interpolation mode the sampler may declare.
// - Two keys with the same time stamp yield the earlier key's value instead
//   of dividing by zero.
//
// The output parameters are restrict-qualified pointers; the former
// `float restrict *x` qualified the float type, which C does not allow.
static void
eval_vec3(const cgltf_animation *anim, const cgltf_node *node,
          double time, cgltf_target_path_type path,
          float *restrict x, float *restrict y, float *restrict z) {
    const float identity = (path == CGLTF_TARGET_PATH_SCALE ? 1.0f : 0.0f);

    cgltf_channel *chan = NULL;
    for (size_t ci = 0; ci < anim->channels_count; ++ci) {
        cgltf_channel *c = &anim->channels[ci];
        if (c->target_node == node && c->target_path == path) {
            if (chan) {
                fprintf(stderr, "Multiple channels for %s on node %s\n",
                        (path == CGLTF_TARGET_PATH_TRANSLATION ? "translation" : "scale"),
                        node->name ? node->name : "unnamed");
            }
            chan = c;
        }
    }
    if (!chan) {
        *x = identity;
        *y = identity;
        *z = identity;
        return;
    }

    cgltf_sampler *samp = &anim->samplers[chan->sampler_index];
    cgltf_accessor *times_acc = samp->input;
    cgltf_accessor *data_acc = samp->output;
    const size_t count = times_acc->count;
    if (count == 0) {
        *x = identity;
        *y = identity;
        *z = identity;
        return;
    }

    // Clamp the sample time to the keyed range.
    float tmin_f = 0.0f;
    float tmax_f = 0.0f;
    cgltf_accessor_read_float(times_acc, 0, 0, &tmin_f);
    cgltf_accessor_read_float(times_acc, count - 1, 0, &tmax_f);
    time = fmax(time, (double)tmin_f);
    time = fmin(time, (double)tmax_f);

    // Binary search for the first key (within 0..count-1) whose time is
    // greater than `time`; the key before it starts the interval.
    size_t i = 0;
    if (count > 1) {
        size_t low = 0;
        size_t high = count - 1;
        while (low < high) {
            const size_t mid = low + (high - low) / 2;
            float tmid_f = 0.0f;
            cgltf_accessor_read_float(times_acc, mid, 0, &tmid_f);
            if ((double)tmid_f <= time) {
                low = mid + 1;
            } else {
                high = mid;
            }
        }
        // Never step below key 0 (low - 1 would wrap for low == 0).
        i = (low > 0) ? low - 1 : 0;
    }

    const bool at_end = (i + 1 >= count);

    float t0_f = 0.0f;
    cgltf_accessor_read_float(times_acc, i, 0, &t0_f);
    float factor = 0.0f;
    if (!at_end) {
        float t1_f = t0_f;
        cgltf_accessor_read_float(times_acc, i + 1, 0, &t1_f);
        const double span = (double)t1_f - (double)t0_f;
        if (span > 0.0) {
            factor = (float)((time - (double)t0_f) / span);
        }
    }

    float vx0 = 0.0f, vy0 = 0.0f, vz0 = 0.0f;
    cgltf_accessor_read_float(data_acc, i, 0, &vx0);
    cgltf_accessor_read_float(data_acc, i, 1, &vy0);
    cgltf_accessor_read_float(data_acc, i, 2, &vz0);
    if (!at_end) {
        float vx1 = vx0, vy1 = vy0, vz1 = vz0;
        cgltf_accessor_read_float(data_acc, i + 1, 0, &vx1);
        cgltf_accessor_read_float(data_acc, i + 1, 1, &vy1);
        cgltf_accessor_read_float(data_acc, i + 1, 2, &vz1);
        vx0 += factor * (vx1 - vx0);
        vy0 += factor * (vy1 - vy0);
        vz0 += factor * (vz1 - vz0);
    }
    *x = vx0;
    *y = vy0;
    *z = vz0;
}
/*
 * Samples the rotation channel that `anim` applies to `node` at `time`
 * seconds and stores the quaternion in *x, *y, *z, *w.
 *
 * - When several rotation channels match, a warning is printed and the last
 *   one wins.
 * - When no channel matches, or the channel has no keys, the identity
 *   quaternion (0,0,0,1) is written.
 * - `time` is clamped to the first/last key time; between two keys the result
 *   comes from slerp() (the sampler's interpolation mode is not read).
 *
 * NOTE(review): cgltf_channel, cgltf_sampler, sampler_index and the
 * (accessor, index, component, out) argument order used with
 * cgltf_accessor_read_float are presumably declared earlier in this file --
 * confirm against the cgltf.h actually in use.
 */
static void
eval_quat(const cgltf_animation *anim, const cgltf_node *node,
          double time, float *restrict x, float *restrict y,
          float *restrict z, float *restrict w) {
    cgltf_channel *chan = NULL;
    for (size_t ci = 0; ci < anim->channels_count; ++ci) {
        cgltf_channel *c = &anim->channels[ci];
        if (c->target_node == node && c->target_path == CGLTF_TARGET_PATH_ROTATION) {
            if (chan) {
                fprintf(stderr, "Multiple rotation channels on node %s\n", node->name ? node->name : "unnamed");
            }
            chan = c;
        }
    }
    if (!chan) {
        *x = 0.0f;
        *y = 0.0f;
        *z = 0.0f;
        *w = 1.0f;
        return;
    }

    cgltf_sampler *samp = &anim->samplers[chan->sampler_index];
    cgltf_accessor *times_acc = samp->input;
    cgltf_accessor *data_acc = samp->output;
    const size_t count = times_acc->count;
    if (count == 0) {
        *x = 0.0f;
        *y = 0.0f;
        *z = 0.0f;
        *w = 1.0f;
        return;
    }

    /* Clamp the query time to the keyed range. */
    float tmin_f = 0.0f;
    float tmax_f = 0.0f;
    cgltf_accessor_read_float(times_acc, 0, 0, &tmin_f);
    cgltf_accessor_read_float(times_acc, count - 1, 0, &tmax_f);
    time = fmax(time, (double)tmin_f);
    time = fmin(time, (double)tmax_f);

    /* Binary search for the first key strictly after `time`; the key before
     * it opens the interval that is interpolated. */
    size_t i = 0;
    if (count > 1) {
        size_t low = 0;
        size_t high = count - 1;
        while (low < high) {
            size_t mid = low + (high - low) / 2;
            float tmid_f = 0.0f;
            cgltf_accessor_read_float(times_acc, mid, 0, &tmid_f);
            if ((double)tmid_f <= time) {
                low = mid + 1;
            } else {
                high = mid;
            }
        }
        /* Guard the subtraction so an unexpected low == 0 cannot wrap. */
        i = (low > 0) ? low - 1 : 0;
    }

    float t0_f = 0.0f;
    cgltf_accessor_read_float(times_acc, i, 0, &t0_f);
    const double t0 = (double)t0_f;
    const bool at_end = (i == count - 1);
    float factor = 0.0f;
    if (!at_end) {
        float t1_f = t0_f;
        cgltf_accessor_read_float(times_acc, i + 1, 0, &t1_f);
        /* Duplicate key times would divide by zero; keep the left key then. */
        const double span = (double)t1_f - t0;
        if (span > 0.0) {
            factor = (float)((time - t0) / span);
        }
    }

    float q0[4] = {0.0f, 0.0f, 0.0f, 1.0f};
    for (int c = 0; c < 4; ++c) {
        cgltf_accessor_read_float(data_acc, i, c, &q0[c]);
    }
    if (!at_end) {
        float q1[4] = {0};
        for (int c = 0; c < 4; ++c) {
            cgltf_accessor_read_float(data_acc, i + 1, c, &q1[c]);
        }
        float qout[4];
        slerp(q0, q1, factor, qout);
        for (int c = 0; c < 4; ++c) {
            q0[c] = qout[c];
        }
    }
    *x = q0[0];
    *y = q0[1];
    *z = q0[2];
    *w = q0[3];
}
/*
 * Fills `xf` with the translation, rotation and scale that `anim` gives
 * `node` at `time` seconds. The transform is reset to identity first, then
 * the translation, scale and rotation samplers are invoked in that order
 * (the order matters only for the warnings they may print).
 */
static void
eval_transform(const cgltf_animation *anim, const cgltf_node *node,
               double time, transform_t *xf) {
    /* identity: zero translation, unit quaternion, unit scale */
    xf->tx = 0.0f;
    xf->ty = 0.0f;
    xf->tz = 0.0f;

    xf->rx = 0.0f;
    xf->ry = 0.0f;
    xf->rz = 0.0f;
    xf->rw = 1.0f;

    xf->sx = 1.0f;
    xf->sy = 1.0f;
    xf->sz = 1.0f;

    eval_vec3(anim, node, time, CGLTF_TARGET_PATH_TRANSLATION,
              &xf->tx, &xf->ty, &xf->tz);
    eval_vec3(anim, node, time, CGLTF_TARGET_PATH_SCALE,
              &xf->sx, &xf->sy, &xf->sz);
    eval_quat(anim, node, time,
              &xf->rx, &xf->ry, &xf->rz, &xf->rw);
}
/* Marks bit `idx` in the bitset; bits are grouped 64 per array element. */
static inline void
set_bit(bitset_t *bs, size_t idx) {
    const size_t word = idx / 64;
    const size_t shift = idx % 64;
    bs->bits[word] |= (1ULL << shift);
}
/* Returns true when bit `idx` of the bitset is set (64 bits per element). */
static inline bool
test_bit(const bitset_t *bs, size_t idx) {
    const size_t word = idx / 64;
    const size_t shift = idx % 64;
    if ((bs->bits[word] & (1ULL << shift)) != 0) {
        return true;
    }
    return false;
}
/*
 * qsort() comparator for ascending int order.
 * Returns a negative, zero or positive value when *a is less than, equal to
 * or greater than *b. Uses comparisons instead of subtraction because
 * `*a - *b` overflows (undefined behaviour) for operands that are far apart.
 */
static int
cmp_int(const void *a, const void *b) {
    const int lhs = *(const int *)a;
    const int rhs = *(const int *)b;
    return (lhs > rhs) - (lhs < rhs);
}
static void
rdp_simplify(const float *data, int num, int dim, float eps,
int restrict *kept, int restrict *count_out) {
if (num <= 0) {
*count_out = 0;
return;
}
if (num == 1) {
kept[0] = 0;
*count_out = 1;
return;
}
int stack_top = 0;
g_stack[stack_top++] = (interval_t){0, num - 1};
int kept_size = 0;
while (stack_top > 0) {
interval_t iv = g_stack[--stack_top];
int start = iv.start;
int end = iv.end;
if (end - start < 1) {
if (kept_size == 0 || kept[kept_size - 1] != start) {
kept[kept_size++] = start;
}
continue;
}
float dx[DIM_MAX];
float len2 = 0.0f;
for (int d = 0; d < dim; ++d) {
dx[d] = data[end * dim + d] - data[start * dim + d];
len2 += dx[d] * dx[d];
}
if (len2 <= 0.0f) {
if (kept_size == 0 || kept[kept_size - 1] != start) {
kept[kept_size++] = start;
}
if (kept_size == 0 || kept[kept_size - 1] != end) {
kept[kept_size++] = end;
}
continue;
}
float max_dist2 = 0.0f;
int max_i = start;
float inv_len2 = 1.0f / len2;
for (int i = start + 1; i < end; ++i) {
float t_num = 0.0f;
for (int d = 0; d < dim; ++d) {
t_num += (data[i * dim + d] - data[start * dim + d]) * dx[d];
}
float t = fmaxf(0.0f, fminf(1.0f, t_num * inv_len2));
float dist2 = 0.0f;
for (int d = 0; d < dim; ++d) {
float proj = data[start * dim + d] + t * dx[d];
float dif = data[i * dim + d] - proj;
dist2 += dif * dif;
}
if (dist2 > max_dist2) {
max_dist2 = dist2;
max_i = i;
}
}
float max_dist = sqrtf(max_dist2);
if (max_dist <= eps) {
if (kept_size == 0 || kept[kept_size - 1] != start) {
kept[kept_size++] = start;
}
if (kept_size == 0 || kept[kept_size - 1] != end) {
kept[kept_size++] = end;
}
} else {
// Push right first, then left to simulate recursion order
g_stack[stack_top++] = (interval_t){max_i, end};
g_stack[stack_top++] = (interval_t){start, max_i};
}
}
// sort and unique
qsort(kept, kept_size, sizeof(int), cmp_int);
int unique_size = 0;
for (int i = 0; i < kept_size; ++i) {
if (i == 0 || kept[i] != kept[i - 1]) {
kept[unique_size++] = kept[i];
}
}
*count_out = unique_size;
}
/*
 * Packs a quaternion (x, y, z, w in qin[0..3]) into 32 bits.
 *
 * The component with the largest magnitude is not stored; its index seeds the
 * result and ends up in the two top bits. Each remaining component, in index
 * order, contributes 10 bits: one flag telling whether its sign differs from
 * the sign of the dropped component, followed by a 9-bit magnitude scaled by
 * sqrt(2) and clamped to 511.
 */
static unsigned
qenc(const float *qin) {
    const unsigned mag_max = 511u;

    /* magnitudes, and the index of the largest one (first wins on ties) */
    float magnitude[4];
    int largest = 0;
    for (int c = 0; c < 4; ++c) {
        magnitude[c] = fabsf(qin[c]);
        if (c > 0 && magnitude[c] > magnitude[largest]) {
            largest = c;
        }
    }

    const unsigned largest_neg = (qin[largest] < 0.0f);
    unsigned packed = (unsigned)largest;
    for (int c = 0; c < 4; ++c) {
        if (c != largest) {
            unsigned flip = ((qin[c] < 0.0f) ^ largest_neg);
            unsigned mag = (unsigned)(mag_max * (magnitude[c] * 1.414213562f) + 0.5f);
            mag = (mag > mag_max) ? mag_max : mag;
            packed = (packed << 10u) | (flip << 9u) | mag;
        }
    }
    return packed;
}
/* Returns the index of `node` in skin->joints, or -1 when it is not a joint. */
static int
anm_skin_joint_index(const cgltf_skin *skin, const cgltf_node *node) {
    for (size_t j = 0; j < skin->joints_count; ++j) {
        if (skin->joints[j] == node) {
            return (int)j;
        }
    }
    return -1;
}

/* Maps `val` into 0..65535 using (val - off) / scl, rounded and clamped. */
static unsigned short
anm_quantize_u16(float val, float off, float scl) {
    float q = (val - off) / scl;
    return (unsigned short)roundf(fmaxf(0.0f, fminf(65535.0f, q)));
}

/*
 * Writes `cnt` 16-bit values as the C array anm_<clip_name>_<suffix>,
 * `per_row` values per line, either as 0x%04x (hex) or as %hu.
 */
static void
anm_emit_u16_table(FILE *out, const char *clip_name, const char *suffix,
                   const unsigned short *vals, size_t cnt, size_t per_row,
                   bool hex) {
    fprintf(out, "static const unsigned short anm_%s_%s[] = {\n", clip_name, suffix);
    for (size_t row = 0; row < cnt; row += per_row) {
        for (size_t col = 0; col < per_row && (row + col) < cnt; ++col) {
            if (col > 0) {
                fputc(' ', out);
            }
            if (hex) {
                fprintf(out, "0x%04x,", vals[row + col]);
            } else {
                fprintf(out, "%hu,", vals[row + col]);
            }
        }
        fputc('\n', out);
    }
    fprintf(out, "};\n\n");
}

/* Writes `cnt` 32-bit values as the C array anm_<clip_name>_<suffix>. */
static void
anm_emit_u32_table(FILE *out, const char *clip_name, const char *suffix,
                   const unsigned *vals, size_t cnt, size_t per_row) {
    fprintf(out, "static const unsigned anm_%s_%s[] = {\n", clip_name, suffix);
    for (size_t row = 0; row < cnt; row += per_row) {
        for (size_t col = 0; col < per_row && (row + col) < cnt; ++col) {
            if (col > 0) {
                fputc(' ', out);
            }
            fprintf(out, "0x%08x,", vals[row + col]);
        }
        fputc('\n', out);
    }
    fprintf(out, "};\n\n");
}

/*
 * Animation clip converter.
 *
 * Usage: <gltf_file> <output_c_file> [clip_name] [joint_name ...]
 *
 * Steps:
 *   1. load the glTF file and take its first skin,
 *   2. pick the animation named clip_name (or the first animation when no
 *      clip name is given; the generated symbols then use the name "def"),
 *   3. resample every joint at 60 frames per second, rebuild the global
 *      matrices and derive the local transform of each selected joint,
 *   4. reduce the frames to key frames per track with rdp_simplify(),
 *   5. quantize the keys and write them as C arrays plus one anm_clip
 *      initializer to output_c_file.
 *
 * Returns 0 on success and 1 on any failure.
 */
extern int
main(int argc, char **argv) {
    if (argc < 3) {
        fprintf(stderr, "Usage: %s <gltf_file> <output_c_file> [clip_name] [joint_name1 [joint_name2 ...]]\n", argv[0]);
        fprintf(stderr, "If clip_name is provided, joint names follow it. Omitting joint names uses all skin joints.\n");
        return 1;
    }
    char *gltf_path = argv[1];
    char *out_path = argv[2];
    /* "def" only names the generated symbols; it is not searched for. */
    const bool clip_given = (argc > 3);
    char *clip_name = clip_given ? argv[3] : "def";
    int first_joint_arg = clip_given ? 4 : 3;
    int num_joint_args = argc - first_joint_arg;
    if (num_joint_args > MAX_JOINTS) {
        fprintf(stderr, "Too many joint names provided: %d (max %d)\n", num_joint_args, MAX_JOINTS);
        return 1;
    }
    cgltf_data *scene = 0;
    cgltf_options options = { 0 };
    cgltf_result res = cgltf_parse_file(&options, gltf_path, &scene);
    if (res != cgltf_result_success) {
        fprintf(stderr, "Failed to parse GLTF: %d\n", res);
        return 1;
    }
    res = cgltf_load_buffers(&options, scene, gltf_path);
    if (res != cgltf_result_success) {
        fprintf(stderr, "Failed to load GLTF buffers: %d\n", res);
        cgltf_free(scene);
        return 1;
    }
    cgltf_skin *skin = (scene->skins_count > 0) ? scene->skins : NULL;
    if (!skin || skin->joints_count == 0) {
        fprintf(stderr, "No skin or joints found\n");
        cgltf_free(scene);
        return 1;
    }
    int g_all_cnt = (int)skin->joints_count;
    if (g_all_cnt > MAX_ALL_JOINTS) {
        fprintf(stderr, "Too many all joints: %d (max %d)\n", g_all_cnt, MAX_ALL_JOINTS);
        cgltf_free(scene);
        return 1;
    }

    /* Build parents: a joint is a root (-1) unless another joint lists it
     * as a child. */
    memset(g_parents, -1, sizeof(g_parents));
    for (int k = 0; k < g_all_cnt; ++k) {
        cgltf_node *node_k = skin->joints[k];
        for (size_t c = 0; c < node_k->children_count; ++c) {
            int cidx = anm_skin_joint_index(skin, node_k->children[c]);
            if (cidx != -1) {
                g_parents[cidx] = k;
            }
        }
    }

    /* Selected hashes */
    uint32_t selected_hashes[MAX_JOINTS] = {0};
    for (int s = 0; s < num_joint_args; ++s) {
        selected_hashes[s] = fnv1a_32(argv[first_joint_arg + s]);
    }

    /* Map to selected (selected joints keep the skin's joint order) */
    int sel_cnt = 0;
    unsigned long long found[MAX_JOINTS/64] = {0};
    if (num_joint_args == 0) {
        for (int aj = 0; aj < g_all_cnt; ++aj) {
            g_bone_indices[aj] = aj;
            sel_cnt++;
        }
    } else {
        for (int aj = 0; aj < g_all_cnt; ++aj) {
            const char *n = skin->joints[aj]->name ? skin->joints[aj]->name : "";
            uint32_t h = fnv1a_32(n);
            for (int s = 0; s < num_joint_args; ++s) {
                if (selected_hashes[s] == h && !(found[s/64] & (1llu << (s & 63)))) {
                    /* OR the bit in: plain assignment would forget every
                     * other name already matched in this word. */
                    found[s/64] |= (1llu << (s & 63));
                    g_bone_indices[sel_cnt] = aj;
                    sel_cnt++;
                    break;
                }
            }
        }
        int missing = 0;
        for (int s = 0; s < num_joint_args; ++s) {
            if (!(found[s/64] & (1llu << (s & 63)))) {
                missing++;
            }
        }
        if (missing > 0) {
            fprintf(stderr, "Warning: %d skeleton joint names not found in skin\n", missing);
        }
    }
    int joint_cnt = sel_cnt;
    if (joint_cnt == 0 || joint_cnt > MAX_JOINTS) {
        fprintf(stderr, "Invalid selected joint count: %d (must be 1-%d)\n", joint_cnt, MAX_JOINTS);
        cgltf_free(scene);
        return 1;
    }
    if (num_joint_args > 0) {
        printf("Using %d selected joints (out of %d provided; %d total in skin)\n", joint_cnt, num_joint_args, g_all_cnt);
    }

    /* Pick the animation: by name when one was given, else the first one. */
    cgltf_animation *anim = 0;
    if (clip_given && clip_name[0] != '\0') {
        for (size_t s = 0; s < scene->animations_count; ++s) {
            cgltf_animation *candidate = &scene->animations[s];
            if (candidate->name && strcmp(candidate->name, clip_name) == 0) {
                anim = candidate;
                break;
            }
        }
    } else if (scene->animations_count > 0) {
        anim = &scene->animations[0];
    }
    if (!anim) {
        fprintf(stderr, "No animation available\n");
        cgltf_free(scene);
        return 1;
    }

    /* Clip duration = latest key time over all samplers. */
    double duration_sec = 0.0;
    int has_anim_data = 0;
    for (size_t s = 0; s < anim->samplers_count; ++s) {
        cgltf_accessor *times_acc = anim->samplers[s].input;
        if (times_acc && times_acc->count > 0) {
            has_anim_data = 1;
            float last_time_f;
            cgltf_accessor_read_float(times_acc, times_acc->count - 1, 0, &last_time_f);
            double last = (double)last_time_f;
            if (last > duration_sec) {
                duration_sec = last;
            }
        }
    }
    if (!has_anim_data || duration_sec <= 0.0) {
        duration_sec = 0.0;
    }
    double fps = 60.0;
    int frame_cnt = 1;
    if (duration_sec > 0.0) {
        /* Round in double precision and clamp before narrowing to int. */
        double frames = round(duration_sec * fps) + 1.0;
        frame_cnt = (frames > (double)MAX_FRAMES) ? MAX_FRAMES : (int)frames;
    }
    double dt_sec = (frame_cnt > 1) ? duration_sec / (frame_cnt - 1.0) : 0.0;

    /* Per-frame scratch: local TRS of every skin joint. */
    float all_local_pos[MAX_ALL_JOINTS * 3];
    float all_local_rot[MAX_ALL_JOINTS * 4];
    float all_local_scl[MAX_ALL_JOINTS * 3];
    for (int i = 0; i < frame_cnt; ++i) {
        double time_sec = (i == frame_cnt - 1) ? duration_sec : i * dt_sec;
        /* Compute all local TRS */
        for (int aj = 0; aj < g_all_cnt; ++aj) {
            cgltf_node *node = skin->joints[aj];
            transform_t xf;
            eval_transform(anim, node, time_sec, &xf);
            all_local_pos[aj * 3 + 0] = xf.tx;
            all_local_pos[aj * 3 + 1] = xf.ty;
            all_local_pos[aj * 3 + 2] = xf.tz;
            all_local_rot[aj * 4 + 0] = xf.rx;
            all_local_rot[aj * 4 + 1] = xf.ry;
            all_local_rot[aj * 4 + 2] = xf.rz;
            all_local_rot[aj * 4 + 3] = xf.rw;
            all_local_scl[aj * 3 + 0] = xf.sx;
            all_local_scl[aj * 3 + 1] = xf.sy;
            all_local_scl[aj * 3 + 2] = xf.sz;
        }
        /* Compute globals */
        {
            mat4 id;
            mat4_set_identity(id);
            /* Root joints: global = identity * local. They also seed the
             * traversal stack. */
            int stack[MAX_ALL_JOINTS];
            int stack_top = 0;
            for (int r = 0; r < g_all_cnt; ++r) {
                if (g_parents[r] != -1) {
                    continue;
                }
                float lpos[3] = {all_local_pos[r * 3 + 0], all_local_pos[r * 3 + 1], all_local_pos[r * 3 + 2]};
                float lrot[4] = {all_local_rot[r * 4 + 0], all_local_rot[r * 4 + 1], all_local_rot[r * 4 + 2], all_local_rot[r * 4 + 3]};
                float lscl[3] = {all_local_scl[r * 3 + 0], all_local_scl[r * 3 + 1], all_local_scl[r * 3 + 2]};
                mat4 local_m;
                mat4_from_trs(lpos, lrot, lscl, local_m);
                mat4 root_g;
                mat4_mul(id, local_m, root_g);
                memcpy(g_globals + r * MAT4_SIZE, root_g, sizeof(mat4));
                stack[stack_top++] = r;
            }
            /* Iterative depth-first traversal: child = parent * local. */
            while (stack_top > 0) {
                int idx = stack[--stack_top];
                cgltf_node *node = skin->joints[idx];
                /* Reverse child order so pops follow recursion order. */
                for (int cc = (int)node->children_count - 1; cc >= 0; --cc) {
                    int cidx = anm_skin_joint_index(skin, node->children[cc]);
                    if (cidx == -1) {
                        continue;
                    }
                    float lpos[3] = {all_local_pos[cidx * 3 + 0], all_local_pos[cidx * 3 + 1], all_local_pos[cidx * 3 + 2]};
                    float lrot[4] = {all_local_rot[cidx * 4 + 0], all_local_rot[cidx * 4 + 1], all_local_rot[cidx * 4 + 2], all_local_rot[cidx * 4 + 3]};
                    float lscl[3] = {all_local_scl[cidx * 3 + 0], all_local_scl[cidx * 3 + 1], all_local_scl[cidx * 3 + 2]};
                    mat4 local_m;
                    mat4_from_trs(lpos, lrot, lscl, local_m);
                    mat4 parent_g;
                    memcpy(parent_g, g_globals + idx * MAT4_SIZE, sizeof(mat4));
                    mat4 g;
                    mat4_mul(parent_g, local_m, g);
                    memcpy(g_globals + cidx * MAT4_SIZE, g, sizeof(mat4));
                    /* Push child to stack (safe since MAX_ALL_JOINTS limit) */
                    stack[stack_top++] = cidx;
                }
            }
        }
        /* Back-compute locals for selected joints: inverse(parent) * global,
         * stored joint-major at [joint * frame_cnt + frame]. */
        for (int j = 0; j < joint_cnt; ++j) {
            int idx = g_bone_indices[j];
            int off = j * frame_cnt + i;
            mat4 g;
            memcpy(g, g_globals + idx * MAT4_SIZE, sizeof(mat4));
            float lpos[3], lrot[4], lscl[3];
            if (g_parents[idx] == -1) {
                mat4_decompose(g, lpos, lrot, lscl);
            } else {
                int pidx = g_parents[idx];
                mat4 pg;
                memcpy(pg, g_globals + pidx * MAT4_SIZE, sizeof(mat4));
                mat4 parent_inv;
                mat4_invert(pg, parent_inv);
                mat4 local_m;
                mat4_mul(parent_inv, g, local_m);
                mat4_decompose(local_m, lpos, lrot, lscl);
            }
            g_pos_x[off] = lpos[0];
            g_pos_y[off] = lpos[1];
            g_pos_z[off] = lpos[2];
            g_rot_x[off] = lrot[0];
            g_rot_y[off] = lrot[1];
            g_rot_z[off] = lrot[2];
            g_rot_w[off] = lrot[3];
            /* Only uniform scale is stored: average the three axes. */
            g_scl_s[off] = (lscl[0] + lscl[1] + lscl[2]) / 3.0f;
        }
    }

    /* Simplify: a frame becomes a key for a track as soon as any joint
     * needs it on that track. */
    float eps_pos = 0.01f;
    float eps_rot = 0.02f;
    float eps_scl = 0.001f;
    memset(g_used_pos_bits.bits, 0, sizeof(g_used_pos_bits.bits));
    memset(g_used_rot_bits.bits, 0, sizeof(g_used_rot_bits.bits));
    memset(g_used_scl_bits.bits, 0, sizeof(g_used_scl_bits.bits));
    for (int j = 0; j < joint_cnt; ++j) {
        int kept_count;
        /* Position */
        for (int f = 0; f < frame_cnt; ++f) {
            int off = j * frame_cnt + f;
            g_tmp_pos[f * 3 + 0] = g_pos_x[off];
            g_tmp_pos[f * 3 + 1] = g_pos_y[off];
            g_tmp_pos[f * 3 + 2] = g_pos_z[off];
        }
        rdp_simplify(g_tmp_pos, frame_cnt, 3, eps_pos, g_kept, &kept_count);
        for (int kk = 0; kk < kept_count; ++kk) {
            set_bit(&g_used_pos_bits, g_kept[kk]);
        }
        /* Rotation */
        for (int f = 0; f < frame_cnt; ++f) {
            int off = j * frame_cnt + f;
            g_tmp_rot[f * 4 + 0] = g_rot_x[off];
            g_tmp_rot[f * 4 + 1] = g_rot_y[off];
            g_tmp_rot[f * 4 + 2] = g_rot_z[off];
            g_tmp_rot[f * 4 + 3] = g_rot_w[off];
        }
        rdp_simplify(g_tmp_rot, frame_cnt, 4, eps_rot, g_kept, &kept_count);
        for (int kk = 0; kk < kept_count; ++kk) {
            set_bit(&g_used_rot_bits, g_kept[kk]);
        }
        /* Scale */
        for (int f = 0; f < frame_cnt; ++f) {
            int off = j * frame_cnt + f;
            g_tmp_scl[f] = g_scl_s[off];
        }
        rdp_simplify(g_tmp_scl, frame_cnt, 1, eps_scl, g_kept, &kept_count);
        for (int kk = 0; kk < kept_count; ++kk) {
            set_bit(&g_used_scl_bits, g_kept[kk]);
        }
    }

    /* Build key lists (ascending frame numbers per track) */
    int num_pos_keys = 0;
    int num_rot_keys = 0;
    int num_scl_keys = 0;
    for (int f = 0; f < frame_cnt; ++f) {
        if (test_bit(&g_used_pos_bits, f)) {
            g_key_pos_list[num_pos_keys++] = f;
        }
        if (test_bit(&g_used_rot_bits, f)) {
            g_key_rot_list[num_rot_keys++] = f;
        }
        if (test_bit(&g_used_scl_bits, f)) {
            g_key_scl_list[num_scl_keys++] = f;
        }
    }

    /* Compute frame_to_key: index of the last key at or before each frame
     * (0 when there is none). */
    for (int i = 0; i < frame_cnt; ++i) {
        int k = num_pos_keys - 1;
        while (k >= 0 && g_key_pos_list[k] > i) {
            --k;
        }
        g_frame_to_key_pos[i] = (unsigned short)(k < 0 ? 0 : k);
        k = num_rot_keys - 1;
        while (k >= 0 && g_key_rot_list[k] > i) {
            --k;
        }
        g_frame_to_key_rot[i] = (unsigned short)(k < 0 ? 0 : k);
        k = num_scl_keys - 1;
        while (k >= 0 && g_key_scl_list[k] > i) {
            --k;
        }
        g_frame_to_key_scl[i] = (unsigned short)(k < 0 ? 0 : k);
    }

    /* Compute quantization params: one shared offset/step for all position
     * axes and one for scale; rotations use qenc() and need none. */
    float pos_scl_val = 1.0f;
    float scl_scl_val = 1.0f;
    float rot_scl_val = 1.0f;
    float pos_off = 0.0f;
    float scl_off = 0.0f;
    float rot_off = 0.0f;
    if (num_pos_keys > 0) {
        float pmin = INFINITY;
        float pmax = -INFINITY;
        for (int kk = 0; kk < num_pos_keys; ++kk) {
            int f = g_key_pos_list[kk];
            for (int j = 0; j < joint_cnt; ++j) {
                int off = j * frame_cnt + f;
                pmin = fminf(pmin, g_pos_x[off]);
                pmin = fminf(pmin, g_pos_y[off]);
                pmin = fminf(pmin, g_pos_z[off]);
                pmax = fmaxf(pmax, g_pos_x[off]);
                pmax = fmaxf(pmax, g_pos_y[off]);
                pmax = fmaxf(pmax, g_pos_z[off]);
            }
        }
        float range = pmax - pmin;
        pos_scl_val = (range > 0.0f) ? range / 65535.0f : 1.0f;
        pos_off = pmin;
    }
    if (num_scl_keys > 0) {
        float smin = INFINITY;
        float smax = -INFINITY;
        for (int kk = 0; kk < num_scl_keys; ++kk) {
            int f = g_key_scl_list[kk];
            for (int j = 0; j < joint_cnt; ++j) {
                int off = j * frame_cnt + f;
                smin = fminf(smin, g_scl_s[off]);
                smax = fmaxf(smax, g_scl_s[off]);
            }
        }
        float range = smax - smin;
        scl_scl_val = (range > 0.0f) ? range / 65535.0f : 1.0f;
        scl_off = smin;
    }

    /* Generate keys, stored key-major at [key * joint_cnt + joint]. */
    for (int kk = 0; kk < num_pos_keys; ++kk) {
        int f = g_key_pos_list[kk];
        for (int j = 0; j < joint_cnt; ++j) {
            int idx = kk * joint_cnt + j;
            int off = j * frame_cnt + f;
            g_keys_pos_x[idx] = anm_quantize_u16(g_pos_x[off], pos_off, pos_scl_val);
            g_keys_pos_y[idx] = anm_quantize_u16(g_pos_y[off], pos_off, pos_scl_val);
            g_keys_pos_z[idx] = anm_quantize_u16(g_pos_z[off], pos_off, pos_scl_val);
        }
    }
    for (int kk = 0; kk < num_scl_keys; ++kk) {
        int f = g_key_scl_list[kk];
        for (int j = 0; j < joint_cnt; ++j) {
            int idx = kk * joint_cnt + j;
            int off = j * frame_cnt + f;
            g_keys_scl[idx] = anm_quantize_u16(g_scl_s[off], scl_off, scl_scl_val);
        }
    }
    for (int kk = 0; kk < num_rot_keys; ++kk) {
        int f = g_key_rot_list[kk];
        for (int j = 0; j < joint_cnt; ++j) {
            int idx = kk * joint_cnt + j;
            int off = j * frame_cnt + f;
            float qin[4] = {g_rot_x[off], g_rot_y[off], g_rot_z[off], g_rot_w[off]};
            g_keys_rot[idx] = qenc(qin);
        }
    }

    /* Output to C file */
    FILE *out = fopen(out_path, "w");
    if (!out) {
        perror("Failed to open output file");
        cgltf_free(scene);
        return 1;
    }
    size_t total_pos = (size_t)num_pos_keys * joint_cnt;
    size_t total_rot = (size_t)num_rot_keys * joint_cnt;
    size_t total_scl = (size_t)num_scl_keys * joint_cnt;
    anm_emit_u16_table(out, clip_name, "pos_x", g_keys_pos_x, total_pos, 8, true);
    anm_emit_u16_table(out, clip_name, "pos_y", g_keys_pos_y, total_pos, 8, true);
    anm_emit_u16_table(out, clip_name, "pos_z", g_keys_pos_z, total_pos, 8, true);
    anm_emit_u32_table(out, clip_name, "rot", g_keys_rot, total_rot, 8);
    anm_emit_u16_table(out, clip_name, "scl", g_keys_scl, total_scl, 8, true);
    anm_emit_u16_table(out, clip_name, "frame_to_key_pos", g_frame_to_key_pos, (size_t)frame_cnt, 16, false);
    anm_emit_u16_table(out, clip_name, "frame_to_key_rot", g_frame_to_key_rot, (size_t)frame_cnt, 16, false);
    anm_emit_u16_table(out, clip_name, "frame_to_key_scl", g_frame_to_key_scl, (size_t)frame_cnt, 16, false);
    fprintf(out, "static const struct anm_clip anm_%s_clip = {\n", clip_name);
    fprintf(out, " .joint_cnt = %d,\n", joint_cnt);
    fprintf(out, " .frame_cnt = %d,\n", frame_cnt);
    fprintf(out, " .off = {%.6ff, %.6ff, %.6ff},\n", pos_off, rot_off, scl_off);
    fprintf(out, " .scl = {%.6ff, %.6ff, %.6ff},\n", pos_scl_val, rot_scl_val, scl_scl_val);
    fprintf(out, " .keys = {\n");
    fprintf(out, " .pos_x = anm_%s_pos_x,\n", clip_name);
    fprintf(out, " .pos_y = anm_%s_pos_y,\n", clip_name);
    fprintf(out, " .pos_z = anm_%s_pos_z,\n", clip_name);
    fprintf(out, " .rot_pos = anm_%s_rot,\n", clip_name);
    fprintf(out, " .scl_s = anm_%s_scl,\n", clip_name);
    fprintf(out, " },\n");
    fprintf(out, " .blks = {\n");
    fprintf(out, " .frame_to_key_pos = anm_%s_frame_to_key_pos,\n", clip_name);
    fprintf(out, " .frame_to_key_rot = anm_%s_frame_to_key_rot,\n", clip_name);
    fprintf(out, " .frame_to_key_scl = anm_%s_frame_to_key_scl,\n", clip_name);
    fprintf(out, " },\n");
    fprintf(out, "};\n");
    /* fclose flushes; a failure here means the file is incomplete. */
    if (fclose(out) != 0) {
        perror("Failed to write output file");
        cgltf_free(scene);
        return 1;
    }
    cgltf_free(scene);
    printf("Generated animation clip '%s' with %d frames, pos_keys=%d, rot_keys=%d, scl_keys=%d\n",
           clip_name, frame_cnt, num_pos_keys, num_rot_keys, num_scl_keys);
    return 0;
}
#include "cgltf.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <assert.h>
#include <float.h>
#include <stdbool.h>
#include <stdint.h>
#include <ctype.h>
/* Largest number of skin joints the skeleton tool accepts; also sizes its
 * fixed arrays. */
#define MAX_JOINTS 256
/* 4x4 float matrix as 16 contiguous floats; the helpers below keep the
 * translation in elements 12..14. */
typedef float mat4[16];
/*
 * 32-bit FNV-1a hash of a NUL-terminated string.
 * A NULL pointer is hashed like the empty string, so callers can pass
 * optional names (e.g. unnamed nodes) without checking first.
 */
static uint32_t
fnv1a_32(const char *str) {
    uint32_t hash = 0x811c9dc5u; /* FNV offset basis */
    if (!str) {
        return hash;
    }
    for (const unsigned char *s = (const unsigned char *)str; *s != '\0'; ++s) {
        hash ^= *s;
        hash *= 0x01000193u; /* FNV prime */
    }
    return hash;
}
/* Overwrites all 16 elements of `m` with the identity matrix. */
static void
mat4_id(mat4 m) {
    for (int i = 0; i < 16; ++i) {
        /* diagonal elements sit at 0, 5, 10 and 15 */
        m[i] = (i % 5 == 0) ? 1.0f : 0.0f;
    }
}
/*
 * 4x4 matrix product: out[i*4 + j] = sum over k of a[i*4 + k] * b[k*4 + j].
 *
 * a, b  input matrices, 16 floats each
 * out   receives the product; must not overlap a or b
 *
 * The parameters are spelled as pointers because `restrict` may only qualify
 * a pointer type, not the mat4 array typedef; mat4 arguments convert to them
 * implicitly.
 */
static void
mat4_mul(const float *restrict a, const float *restrict b, float *restrict out) {
    for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
            float acc = 0.0f;
            for (int k = 0; k < 4; ++k) {
                acc += a[i * 4 + k] * b[k * 4 + j];
            }
            out[i * 4 + j] = acc;
        }
    }
}
/*
 * Inverts an affine matrix: the 3x3 block in elements 0-2, 4-6, 8-10 plus
 * the translation in elements 12..14. Elements 3, 7, 11 and 15 of `m` are
 * not read; the result gets 0, 0, 0, 1 there.
 *
 * m    matrix to invert, 16 floats
 * out  receives the inverse; must not overlap m
 *
 * Returns 1 on success. When the 3x3 determinant is smaller than 1e-8 in
 * magnitude the matrix is treated as singular: `out` is set to identity and
 * 0 is returned.
 *
 * The parameters are spelled as pointers because `restrict` may only qualify
 * a pointer type, not the mat4 array typedef; mat4 arguments convert to them
 * implicitly.
 */
static int
mat4_try_invert(const float *restrict m, float *restrict out) {
    float det = m[0] * (m[5] * m[10] - m[9] * m[6]) -
                m[4] * (m[1] * m[10] - m[9] * m[2]) +
                m[8] * (m[1] * m[6] - m[5] * m[2]);
    if (fabsf(det) < 1e-8f) {
        /* fallback: identity */
        for (int i = 0; i < 16; ++i) {
            out[i] = (i % 5 == 0) ? 1.0f : 0.0f;
        }
        return 0;
    }
    float idet = 1.0f / det;
    /* adjugate of the 3x3 block divided by the determinant */
    out[0] = (m[5] * m[10] - m[6] * m[9]) * idet;
    out[1] = (m[9] * m[2] - m[1] * m[10]) * idet;
    out[2] = (m[1] * m[6] - m[5] * m[2]) * idet;
    out[4] = (m[6] * m[8] - m[4] * m[10]) * idet;
    out[5] = (m[0] * m[10] - m[8] * m[2]) * idet;
    out[6] = (m[4] * m[2] - m[0] * m[6]) * idet;
    out[8] = (m[4] * m[9] - m[5] * m[8]) * idet;
    out[9] = (m[8] * m[1] - m[0] * m[9]) * idet;
    out[10] = (m[0] * m[5] - m[4] * m[1]) * idet;
    out[3] = out[7] = out[11] = 0.0f;
    out[15] = 1.0f;
    /* inverse translation = -(inverse 3x3) * translation */
    out[12] = -(out[0] * m[12] + out[4] * m[13] + out[8] * m[14]);
    out[13] = -(out[1] * m[12] + out[5] * m[13] + out[9] * m[14]);
    out[14] = -(out[2] * m[12] + out[6] * m[13] + out[10] * m[14]);
    return 1;
}
/*
 * Builds a transform matrix from a quaternion, a translation and a
 * per-axis scale.
 *
 * rot  quaternion as x, y, z, w
 * pos  translation, written to out[12..14]
 * scl  scale; scl[c] multiplies the three elements starting at out[c * 4]
 * out  receives the matrix; elements 3, 7, 11 become 0 and element 15 is 1
 */
static void
mat4_compose(const float *restrict rot, const float *restrict pos,
             const float *restrict scl, mat4 out) {
    const float qx = rot[0];
    const float qy = rot[1];
    const float qz = rot[2];
    const float qw = rot[3];

    /* pairwise quaternion products */
    const float xx = qx * qx, xy = qx * qy, xz = qx * qz, xw = qx * qw;
    const float yy = qy * qy, yz = qy * qz, yw = qy * qw;
    const float zz = qz * qz, zw = qz * qw;

    /* rotation matrix, indexed [row][column] */
    const float rm[3][3] = {
        { 1.0f - 2.0f * (yy + zz), 2.0f * (xy - zw),        2.0f * (xz + yw)        },
        { 2.0f * (xy + zw),        1.0f - 2.0f * (xx + zz), 2.0f * (yz - xw)        },
        { 2.0f * (xz - yw),        2.0f * (yz + xw),        1.0f - 2.0f * (xx + yy) },
    };

    /* each group of four output elements holds one scaled matrix column */
    for (int c = 0; c < 3; ++c) {
        for (int r = 0; r < 3; ++r) {
            out[c * 4 + r] = rm[r][c] * scl[c];
        }
        out[c * 4 + 3] = 0.0f;
    }
    for (int r = 0; r < 3; ++r) {
        out[12 + r] = pos[r];
    }
    out[15] = 1.0f;
}
/* Prints the first three elements of each group of four floats in `m` as
 * "{ { a, b, c }, { ... }, { ... }, { ... } }". */
static void
skl_emit_mat4x3(FILE *out, const float *m) {
    fprintf(out, "{");
    for (int col = 0; col < 4; ++col) {
        fprintf(out, " { %.6ff, %.6ff, %.6ff }%s",
                m[col * 4 + 0], m[col * 4 + 1], m[col * 4 + 2],
                (col < 3) ? "," : "");
    }
    fprintf(out, " }");
}

/*
 * Skeleton converter.
 *
 * Usage: <gltf_file> <output_c_file> <skeleton_name> [skin_name]
 *
 * Loads the glTF file, picks the named skin (or the first one), walks the
 * node hierarchy depth-first from the skin's root and writes one
 * `struct bone` initializer per joint (parent index, name hash, name,
 * skin_to_world matrix and local pose matrix) to output_c_file as the array
 * skl_<skeleton_name>.
 *
 * The first joint reached on each path is treated as the origin: it gets an
 * identity local matrix and parent -1. Nodes above it that are not skin
 * joints are walked through; non-joint nodes below a joint end that branch.
 *
 * Returns 0 on success and 1 on any failure.
 */
extern int
main(int argc, char **argv) {
    if (argc < 4) {
        fprintf(stderr, "Usage: %s <gltf_file> <output_c_file> <skeleton_name> [skin_name]\n", argv[0]);
        return 1;
    }
    const char *gltf_path = argv[1];
    const char *out_path = argv[2];
    const char *skeleton_name = argv[3];
    const char *skin_name = (argc > 4) ? argv[4] : 0;
    /* printable stand-in so messages never pass NULL to %s */
    const char *skin_label = (skin_name && skin_name[0] != '\0') ? skin_name : "default";

    /* load and parse the gltf file */
    cgltf_data *data = 0;
    cgltf_options opt = {0};
    cgltf_result res = cgltf_parse_file(&opt, gltf_path, &data);
    if (res != cgltf_result_success) {
        fprintf(stderr, "[GLTF]: Failed to parse GLTF: %d\n", res);
        return 1;
    }
    res = cgltf_load_buffers(&opt, data, gltf_path);
    if (res != cgltf_result_success) {
        fprintf(stderr, "[GLTF]: Failed to load GLTF buffers: %d\n", res);
        cgltf_free(data);
        return 1;
    }

    /* try to find the skin */
    const cgltf_skin *skn = 0;
    if (skin_name && skin_name[0] != '\0') {
        for (size_t s = 0u; s < data->skins_count; ++s) {
            const cgltf_skin *cur = &data->skins[s];
            if (cur->name && !strcmp(cur->name, skin_name)) {
                skn = cur;
                break;
            }
        }
        if (!skn) {
            fprintf(stderr, "[GLTF]: Specified skin '%s' not found; falling back to first skin.\n", skin_name);
        }
    }
    if (!skn && data->skins_count > 0) {
        skn = &data->skins[0];
    }
    if (!skn || skn->joints_count == 0) {
        fprintf(stderr, "[GLTF]: No skin or joints found\n");
        cgltf_free(data);
        return 1;
    }
    if (skn->joints_count > MAX_JOINTS) {
        fprintf(stderr, "[GLTF]: Too many joints: %d (max %d)\n", (int)skn->joints_count, MAX_JOINTS);
        cgltf_free(data);
        return 1;
    }
    const size_t skin_jnt_cnt = skn->joints_count;

    /* try to find skin root node: a parentless joint, else the parentless
     * parent of a joint */
    const cgltf_node *root = 0;
    for (size_t i = 0; i < skin_jnt_cnt; ++i) {
        const cgltf_node *node = skn->joints[i];
        if (!node->parent) {
            root = node;
            break;
        }
    }
    if (!root) {
        for (size_t i = 0; i < skin_jnt_cnt; ++i) {
            const cgltf_node *node = skn->joints[i];
            if (node->parent && !node->parent->parent) {
                root = node->parent;
                break;
            }
        }
    }
    if (!root) {
        fprintf(stderr, "[GLTF]: Couldn't find the root node in skin: '%s'\n", skin_label);
        cgltf_free(data);
        return 1;
    }

    /* iterate over hierarchy */
    int bone_cnt = 0;
    struct skel_bone {
        int parent;
        unsigned hash;
        const char *name;
        mat4 local;
        mat4 skin_to_world;
    }
    skeleton[MAX_JOINTS];
    {
        int stk_top = 0;
        struct stk_elm {
            int parent;
            const cgltf_node *node;
            mat4 skin_to_world;
        } stk[MAX_JOINTS];
        {
            stk[stk_top].parent = -1;
            stk[stk_top].node = root;
            mat4_id(stk[stk_top].skin_to_world);
            stk_top++;
        }
        mat4 node_to_world[MAX_JOINTS];
        while (stk_top > 0) {
            const struct stk_elm elm = stk[--stk_top];
            const cgltf_node *cur = elm.node;

            /* is the current node one of the skin's joints? */
            bool is_joint = false;
            for (size_t j = 0; j < skin_jnt_cnt; ++j) {
                if (skn->joints[j] == cur) {
                    is_joint = true;
                    break;
                }
            }
            /* what the children inherit; unchanged for pass-through nodes */
            int child_parent = elm.parent;
            mat4 child_space;
            memcpy(child_space, elm.skin_to_world, sizeof(mat4));

            if (!is_joint) {
                /* skip non-skin joints; only nodes above the first joint are
                 * walked through so a non-joint root still reaches the bones */
                if (elm.parent != -1) {
                    continue;
                }
            } else {
                if (bone_cnt >= MAX_JOINTS) {
                    fprintf(stderr, "[GLTF]: Too many joints: %d (max %d)\n", bone_cnt + 1, MAX_JOINTS);
                    cgltf_free(data);
                    return 1;
                }
                int bone_idx = bone_cnt++;
                struct skel_bone *bone = &skeleton[bone_idx];
                bone->parent = elm.parent;
                bone->name = cur->name ? cur->name : "";
                bone->hash = fnv1a_32(bone->name);
                mat4 jnt_to_world;
                if (elm.parent == -1) {
                    memcpy(bone->local, elm.skin_to_world, sizeof(mat4));
                    mat4_id(jnt_to_world);
                } else {
                    float pos[3] = {0,0,0};
                    if (cur->has_translation) {
                        pos[0] = cur->translation[0];
                        pos[1] = cur->translation[1];
                        pos[2] = cur->translation[2];
                    }
                    float rot[4] = {0,0,0,1.0f};
                    if (cur->has_rotation) {
                        rot[0] = cur->rotation[0];
                        rot[1] = cur->rotation[1];
                        rot[2] = cur->rotation[2];
                        rot[3] = cur->rotation[3];
                    }
                    float scl[3] = {1.0f,1.0f,1.0f};
                    if (cur->has_scale) {
                        scl[0] = cur->scale[0];
                        scl[1] = cur->scale[1];
                        scl[2] = cur->scale[2];
                    }
                    mat4_compose(rot, pos, scl, bone->local);
                    mat4_mul(bone->local, node_to_world[elm.parent], jnt_to_world);
                }
                memcpy(node_to_world[bone_idx], jnt_to_world, sizeof(mat4));
                /* we want to have the skin to bone matrix so first create the
                 * world to skin matrix */
                mat4 world_to_skin;
                if (!mat4_try_invert(elm.skin_to_world, world_to_skin)) {
                    fprintf(stderr, "[GLTF]: failed to invert skin to world matrix in skin '%s' at joint '%s'\n", skin_label, bone->name);
                    cgltf_free(data);
                    return 1;
                }
                /* concatenate the bone to world and the world to skin
                 * matrices to one bone to skin matrix */
                mat4 bone_to_skin;
                mat4_mul(jnt_to_world, world_to_skin, bone_to_skin);
                if (!mat4_try_invert(bone_to_skin, bone->skin_to_world)) {
                    fprintf(stderr, "[GLTF]: failed to invert the bone to skin matrix in skin '%s' at joint '%s'\n", skin_label, bone->name);
                    cgltf_free(data);
                    return 1;
                }
                child_parent = bone_idx;
                memcpy(child_space, world_to_skin, sizeof(mat4));
            }

            /* process all children (reverse to keep correct sequence through
             * stack); i runs count..1 so the index is i - 1 */
            for (size_t i = cur->children_count; i > 0; --i) {
                if (stk_top >= MAX_JOINTS) {
                    fprintf(stderr, "[GLTF]: node hierarchy too wide for traversal stack (max %d)\n", MAX_JOINTS);
                    cgltf_free(data);
                    return 1;
                }
                struct stk_elm *nxt = &stk[stk_top++];
                nxt->node = cur->children[i - 1];
                nxt->parent = child_parent;
                memcpy(nxt->skin_to_world, child_space, sizeof(mat4));
            }
        }
    }
    if (bone_cnt == 0) {
        fprintf(stderr, "[GLTF]: No skin or joints found\n");
        cgltf_free(data);
        return 1;
    }

    /* Output */
    FILE *out = fopen(out_path, "w");
    if (!out) {
        perror("Failed to open output file");
        cgltf_free(data);
        return 1;
    }
    fprintf(out, "static const struct bone skl_%s[] = {\n", skeleton_name);
    for (int i = 0; i < bone_cnt; ++i) {
        const struct skel_bone *bone = &skeleton[i];
        fprintf(out, " { .parent = %d, .hash = 0x%08x, .name = \"%s\", .skin_to_world = ", bone->parent, bone->hash, bone->name);
        skl_emit_mat4x3(out, bone->skin_to_world);
        fprintf(out, ",\n .pose = ");
        skl_emit_mat4x3(out, bone->local);
        /* trailing comma keeps consecutive array elements separated */
        fprintf(out, " },\n");
    }
    fprintf(out, "};\n");
    if (fclose(out) != 0) {
        perror("Failed to write output file");
        cgltf_free(data);
        return 1;
    }
    /* report before freeing: the skin name lives inside `data` */
    const char *skin_used = skn->name ? skn->name : "default";
    printf("Generated skeleton '%s' with %d bones from skin '%s'\n", skeleton_name, bone_cnt, skin_used);
    cgltf_free(data);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment