Skip to content

Instantly share code, notes, and snippets.

@lambday
Created May 13, 2019 14:13
Show Gist options
  • Select an option

  • Save lambday/5ee397939382ebebab8f22191c909e13 to your computer and use it in GitHub Desktop.

Select an option

Save lambday/5ee397939382ebebab8f22191c909e13 to your computer and use it in GitHub Desktop.
simd test
// Type your code here, or load an example.
#include <algorithm>
#include <numeric>
#include <vector>

// AVX intrinsics live in <immintrin.h> (there is no <cimmintrin> header).
// Only pulled in when the compiler is actually targeting AVX (e.g. -mavx),
// so the file still builds on non-AVX and non-x86 targets.
#if defined(__AVX__)
#include <immintrin.h>
#endif

using std::accumulate;
using std::iota;
using std::vector;
// Builds the input used by the tests: 10000 floats holding 0, 1, 2, ..., 9999
// in ascending order.
vector<float> generate_data()
{
    constexpr int element_count = 10000;
    vector<float> values(element_count);
    // Each slot receives the running counter, which is then bumped by one.
    std::generate(values.begin(), values.end(),
                  [next = 0.0f]() mutable { return next++; });
    return values;
}
// Baseline: sums the generated data from left to right with the standard
// accumulate algorithm, carrying the running total as a double.
double test1()
{
    const vector<float> data = generate_data();
    const double initial = 0.0;
    return accumulate(data.begin(), data.end(), initial,
                      [](double running, float item) { return running + item; });
}
// Declared ahead of its use: C++ needs a declaration of simd_accumulate to be
// visible at the call inside test2. The iterators and the initial value are
// taken by value so that temporaries such as v.begin() and the literal 0.0
// can be passed directly.
double simd_accumulate(vector<float>::iterator begin,
                       vector<float>::iterator end,
                       double res);

// SIMD counterpart of test1: sums the generated data via simd_accumulate,
// starting from 0.0, and returns the total as a double.
double test2()
{
    vector<float> v = generate_data();
    return simd_accumulate(v.begin(), v.end(), 0.0);
}
using iterator_type = vector<float>::iterator;

// Sums the floats in [begin, end) in double precision, starting from `res`.
//
// When this file is compiled with AVX enabled (__AVX__ defined, e.g. -mavx),
// four floats at a time are widened to doubles and added into four partial
// sums held in one 256-bit register; elements left over when the length is not
// a multiple of four are added one by one. Without AVX the whole range is
// summed by the scalar loop at the end.
//
// The partial sums are combined in a different order than a left-to-right
// std::accumulate, so for inputs whose intermediate sums are not exactly
// representable the result may differ in the last bits.
//
// begin, end: iterators into the same vector<float>; an empty range yields res.
// res:        initial value of the sum.
// returns:    res plus the sum of all elements in the range.
double simd_accumulate(iterator_type begin, iterator_type end, double res)
{
    using index_type = vector<float>::difference_type;

    const index_type length = end - begin;
    if (length <= 0) {
        return res; // nothing to add; also avoids dereferencing `begin` below
    }

    const float* const data = &*begin; // vector elements are stored contiguously
    index_type i = 0;

#if defined(__AVX__)
    // Four partial sums; lane 0 starts at `res`, the others at zero.
    __m256d sum = _mm256_setr_pd(res, 0.0, 0.0, 0.0);
    for (; i + 4 <= length; i += 4) {
        // Unaligned load: vector storage is not guaranteed to be 16/32-byte aligned.
        const __m128 four_floats = _mm_loadu_ps(data + i);
        sum = _mm256_add_pd(sum, _mm256_cvtps_pd(four_floats));
    }

    // Horizontal reduction. sum holds four doubles, say: m, n, p, q.
    const __m128d low = _mm256_castpd256_pd128(sum);      // m, n
    const __m128d high = _mm256_extractf128_pd(sum, 1);   // p, q
    const __m128d pairs = _mm_add_pd(low, high);          // m+p, n+q
    const __m128d upper = _mm_unpackhi_pd(pairs, pairs);  // n+q, n+q
    double total = _mm_cvtsd_f64(_mm_add_sd(pairs, upper)); // m+n+p+q
#else
    double total = res;
#endif

    // Scalar path: the tail after the vector loop, or every element without AVX.
    for (; i < length; ++i) {
        total += data[i];
    }
    return total;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment