Skip to content

Instantly share code, notes, and snippets.

@ripatel-fd
Created August 19, 2025 22:49
Show Gist options
  • Select an option

  • Save ripatel-fd/19969d53cd0fd0b1c5be98ecf6090b37 to your computer and use it in GitHub Desktop.

Select an option

Save ripatel-fd/19969d53cd0fd0b1c5be98ecf6090b37 to your computer and use it in GitHub Desktop.
AVX512 barf
lthash[ 0x0 ] = (ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0x55555555, d[ 0x0 ] ), 0 ) ) +
(ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0xaaaaaaaa, d[ 0x0 ] ), 0 ) );
lthash[ 0x1 ] = (ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0x55555555, d[ 0x1 ] ), 0 ) ) +
(ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0xaaaaaaaa, d[ 0x1 ] ), 0 ) );
lthash[ 0x2 ] = (ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0x55555555, d[ 0x2 ] ), 0 ) ) +
(ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0xaaaaaaaa, d[ 0x2 ] ), 0 ) );
lthash[ 0x3 ] = (ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0x55555555, d[ 0x3 ] ), 0 ) ) +
(ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0xaaaaaaaa, d[ 0x3 ] ), 0 ) );
lthash[ 0x4 ] = (ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0x55555555, d[ 0x4 ] ), 0 ) ) +
(ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0xaaaaaaaa, d[ 0x4 ] ), 0 ) );
lthash[ 0x5 ] = (ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0x55555555, d[ 0x5 ] ), 0 ) ) +
(ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0xaaaaaaaa, d[ 0x5 ] ), 0 ) );
lthash[ 0x6 ] = (ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0x55555555, d[ 0x6 ] ), 0 ) ) +
(ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0xaaaaaaaa, d[ 0x6 ] ), 0 ) );
lthash[ 0x7 ] = (ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0x55555555, d[ 0x7 ] ), 0 ) ) +
(ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0xaaaaaaaa, d[ 0x7 ] ), 0 ) );
lthash[ 0x8 ] = (ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0x55555555, d[ 0x0 ] ), 0 ) ) +
(ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0xaaaaaaaa, d[ 0x0 ] ), 0 ) );
lthash[ 0x9 ] = (ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0x55555555, d[ 0x1 ] ), 0 ) ) +
(ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0xaaaaaaaa, d[ 0x1 ] ), 0 ) );
lthash[ 0xa ] = (ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0x55555555, d[ 0x2 ] ), 0 ) ) +
(ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0xaaaaaaaa, d[ 0x2 ] ), 0 ) );
lthash[ 0xb ] = (ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0x55555555, d[ 0x3 ] ), 0 ) ) +
(ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0xaaaaaaaa, d[ 0x3 ] ), 0 ) );
lthash[ 0xc ] = (ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0x55555555, d[ 0x4 ] ), 0 ) ) +
(ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0xaaaaaaaa, d[ 0x4 ] ), 0 ) );
lthash[ 0xd ] = (ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0x55555555, d[ 0x5 ] ), 0 ) ) +
(ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0xaaaaaaaa, d[ 0x5 ] ), 0 ) );
lthash[ 0xe ] = (ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0x55555555, d[ 0x6 ] ), 0 ) ) +
(ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0xaaaaaaaa, d[ 0x6 ] ), 0 ) );
lthash[ 0xf ] = (ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0x55555555, d[ 0x7 ] ), 0 ) ) +
(ushort)_mm256_reduce_add_epi16( _mm512_extracti64x4_epi64( _mm512_maskz_mov_epi16( 0xaaaaaaaa, d[ 0x7 ] ), 0 ) );
__m512i s0[8] = {
_mm512_add_epi16(_mm512_shufflelo_epi16(_mm512_shufflehi_epi16(d[0], 0xb1), 0xb1), _mm512_shufflelo_epi16(_mm512_shufflehi_epi16(d[0], 0x1b), 0x1b)),
_mm512_add_epi16(_mm512_shufflelo_epi16(_mm512_shufflehi_epi16(d[1], 0xb1), 0xb1), _mm512_shufflelo_epi16(_mm512_shufflehi_epi16(d[1], 0x1b), 0x1b)),
_mm512_add_epi16(_mm512_shufflelo_epi16(_mm512_shufflehi_epi16(d[2], 0xb1), 0xb1), _mm512_shufflelo_epi16(_mm512_shufflehi_epi16(d[2], 0x1b), 0x1b)),
_mm512_add_epi16(_mm512_shufflelo_epi16(_mm512_shufflehi_epi16(d[3], 0xb1), 0xb1), _mm512_shufflelo_epi16(_mm512_shufflehi_epi16(d[3], 0x1b), 0x1b)),
_mm512_add_epi16(_mm512_shufflelo_epi16(_mm512_shufflehi_epi16(d[4], 0xb1), 0xb1), _mm512_shufflelo_epi16(_mm512_shufflehi_epi16(d[4], 0x1b), 0x1b)),
_mm512_add_epi16(_mm512_shufflelo_epi16(_mm512_shufflehi_epi16(d[5], 0xb1), 0xb1), _mm512_shufflelo_epi16(_mm512_shufflehi_epi16(d[5], 0x1b), 0x1b)),
_mm512_add_epi16(_mm512_shufflelo_epi16(_mm512_shufflehi_epi16(d[6], 0xb1), 0xb1), _mm512_shufflelo_epi16(_mm512_shufflehi_epi16(d[6], 0x1b), 0x1b)),
_mm512_add_epi16(_mm512_shufflelo_epi16(_mm512_shufflehi_epi16(d[7], 0xb1), 0xb1), _mm512_shufflelo_epi16(_mm512_shufflehi_epi16(d[7], 0x1b), 0x1b)),
};
__m512i s1[8] = {
_mm512_add_epi16(s0[0], _mm512_shuffle_epi32(s0[0], 0x4e)),
_mm512_add_epi16(s0[1], _mm512_shuffle_epi32(s0[1], 0x4e)),
_mm512_add_epi16(s0[2], _mm512_shuffle_epi32(s0[2], 0x4e)),
_mm512_add_epi16(s0[3], _mm512_shuffle_epi32(s0[3], 0x4e)),
_mm512_add_epi16(s0[4], _mm512_shuffle_epi32(s0[4], 0x4e)),
_mm512_add_epi16(s0[5], _mm512_shuffle_epi32(s0[5], 0x4e)),
_mm512_add_epi16(s0[6], _mm512_shuffle_epi32(s0[6], 0x4e)),
_mm512_add_epi16(s0[7], _mm512_shuffle_epi32(s0[7], 0x4e)),
};
__m512i s2[8] = {
_mm512_add_epi16(s1[0], _mm512_shuffle_i32x4(s1[0], s1[0], 0xb1)),
_mm512_add_epi16(s1[1], _mm512_shuffle_i32x4(s1[1], s1[1], 0xb1)),
_mm512_add_epi16(s1[2], _mm512_shuffle_i32x4(s1[2], s1[2], 0xb1)),
_mm512_add_epi16(s1[3], _mm512_shuffle_i32x4(s1[3], s1[3], 0xb1)),
_mm512_add_epi16(s1[4], _mm512_shuffle_i32x4(s1[4], s1[4], 0xb1)),
_mm512_add_epi16(s1[5], _mm512_shuffle_i32x4(s1[5], s1[5], 0xb1)),
_mm512_add_epi16(s1[6], _mm512_shuffle_i32x4(s1[6], s1[6], 0xb1)),
_mm512_add_epi16(s1[7], _mm512_shuffle_i32x4(s1[7], s1[7], 0xb1)),
};
/* Transpose 8x16 matrix into 16x8 matrix */
wu_t l[ 16 ] = {
_mm512_extracti64x4_epi64( d[0], 0 ),
_mm512_extracti64x4_epi64( d[1], 0 ),
_mm512_extracti64x4_epi64( d[2], 0 ),
_mm512_extracti64x4_epi64( d[3], 0 ),
_mm512_extracti64x4_epi64( d[4], 0 ),
_mm512_extracti64x4_epi64( d[5], 0 ),
_mm512_extracti64x4_epi64( d[6], 0 ),
_mm512_extracti64x4_epi64( d[7], 0 ),
_mm512_extracti64x4_epi64( d[0], 1 ),
_mm512_extracti64x4_epi64( d[1], 1 ),
_mm512_extracti64x4_epi64( d[2], 1 ),
_mm512_extracti64x4_epi64( d[3], 1 ),
_mm512_extracti64x4_epi64( d[4], 1 ),
_mm512_extracti64x4_epi64( d[5], 1 ),
_mm512_extracti64x4_epi64( d[6], 1 ),
_mm512_extracti64x4_epi64( d[7], 1 )
};
/* Transpose each 8x8 block */
/* wu_transpose_8x8 is defined outside this snippet.  Each call passes
   the same eight l[] entries twice; presumably the first eight
   arguments are the input rows and the last eight receive the output
   rows, i.e. an in-place transpose -- TODO confirm against the helper's
   definition. */
/* First block: l[0x0..0x7] */
wu_transpose_8x8( l[0x0], l[0x1], l[0x2], l[0x3], l[0x4], l[0x5], l[0x6], l[0x7],
l[0x0], l[0x1], l[0x2], l[0x3], l[0x4], l[0x5], l[0x6], l[0x7] );
/* Second block: l[0x8..0xf] */
wu_transpose_8x8( l[0x8], l[0x9], l[0xa], l[0xb], l[0xc], l[0xd], l[0xe], l[0xf],
l[0x8], l[0x9], l[0xa], l[0xb], l[0xc], l[0xd], l[0xe], l[0xf] );
/* Reduce-add into d[0] */
l[0x0] = wh_add( l[0x0], l[0x1] ); /* sum(l[0 1]) */
l[0x2] = wh_add( l[0x2], l[0x3] ); /* sum(l[2 3]) */
l[0x4] = wh_add( l[0x4], l[0x5] ); /* sum(l[4 5]) */
l[0x6] = wh_add( l[0x6], l[0x7] ); /* sum(l[6 7]) */
l[0x8] = wh_add( l[0x8], l[0x9] ); /* sum(l[8 9]) */
l[0xa] = wh_add( l[0xa], l[0xb] ); /* sum(l[a b]) */
l[0xc] = wh_add( l[0xc], l[0xd] ); /* sum(l[c d]) */
l[0xe] = wh_add( l[0xe], l[0xf] ); /* sum(l[e f]) */
l[0x0] = wh_add( l[0x0], l[0x2] ); /* sum(l[0 1 2 3]) */
l[0x4] = wh_add( l[0x4], l[0x6] ); /* sum(l[4 5 6 7]) */
l[0x8] = wh_add( l[0x8], l[0xa] ); /* sum(l[8 9 a b]) */
l[0xc] = wh_add( l[0xc], l[0xe] ); /* sum(l[c d e f]) */
l[0x0] = wh_add( l[0x0], l[0x4] ); /* sum(l[0 1 2 3 4 5 6 7]) */
l[0x8] = wh_add( l[0x8], l[0xc] ); /* sum(l[8 9 a b c d e f]) */
l[0x0] = wh_add( l[0x0], l[0x8] ); /* sum(l[0 1 2 3 4 5 6 7 8 9 a b c d e f]) */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment