lambday/simd.cpp

## simd.cpp
// Type your code here, or load an example.
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

// NOTE: the original `#include <cimmintrin>` named a header that does not exist.
// The AVX intrinsics are declared in <immintrin.h>, which is x86-only, so it is
// pulled in only when the compiler is targeting AVX (e.g. -mavx / -march=native).
#if defined(__AVX__)
#include <immintrin.h>
#endif

using std::vector;
using std::iota;
using std::accumulate;

/// Produces the benchmark input: 10000 consecutive floats 0.0f, 1.0f, ..., 9999.0f.
///
/// @return a freshly built vector holding the ascending sequence
std::vector<float> generate_data()
{
    constexpr int element_count = 10000;

    std::vector<float> values(element_count);
    // Fill in place with 0, 1, 2, ... (each value is exactly representable as float).
    std::iota(std::begin(values), std::end(values), 0.0f);
    return values;
}

/// Baseline measurement: sums the generated data with std::accumulate.
///
/// @return the sum of every element produced by generate_data()
double test1()
{
    const std::vector<float> samples = generate_data();
    // A double seed makes std::accumulate carry the running sum as a double.
    const double seed = 0.0;
    return std::accumulate(samples.cbegin(), samples.cend(), seed);
}

double test2()
{
    vector<float> v = generate_data();
    return simd_accumulate(v.begin(), v.end(), 0.0);
}

using iterator_type = std::vector<float>::iterator;

/// Sums the floats in [begin, end) in double precision, starting from `res`.
///
/// When this translation unit is compiled for AVX (`__AVX__` defined), four
/// floats per iteration are widened to doubles and added into four independent
/// partial sums, which are folded into one scalar at the end. Leftover elements
/// (and every element on non-AVX builds) go through a plain scalar loop.
/// The AVX path adds in a different order than a left-to-right
/// std::accumulate, so the result may differ in the last bits whenever the
/// intermediate sums are not exactly representable.
///
/// @param begin iterator to the first element to add
/// @param end   iterator one past the last element to add
/// @param res   initial value of the sum
/// @return `res` plus the sum of all elements in [begin, end)
double simd_accumulate(iterator_type begin, iterator_type end, double res)
{
    if (begin == end) {
        return res; // nothing to add; also keeps &*begin below well-defined
    }

    const float* const data = &*begin; // std::vector storage is contiguous
    const std::size_t count = static_cast<std::size_t>(end - begin);
    std::size_t pos = 0;

#if defined(__AVX__)
    __m256d partial = _mm256_setzero_pd(); // four running double sums

    for (; pos + 4 <= count; pos += 4) {
        // Unaligned load: nothing guarantees that data + pos is 16-byte aligned.
        const __m128 four_floats = _mm_loadu_ps(data + pos);
        partial = _mm256_add_pd(partial, _mm256_cvtps_pd(four_floats));
    }

    // partial holds {m, n, p, q}; fold it down to the scalar m + n + p + q.
    const __m128d low = _mm256_castpd256_pd128(partial);    // {m, n}
    const __m128d high = _mm256_extractf128_pd(partial, 1); // {p, q}
    const __m128d pairs = _mm_add_pd(low, high);            // {m+p, n+q}
    const __m128d upper = _mm_unpackhi_pd(pairs, pairs);    // {n+q, n+q}
    res += _mm_cvtsd_f64(_mm_add_sd(pairs, upper));
#endif

    // Scalar tail: the last (count % 4) elements, or the whole range without AVX.
    for (; pos < count; ++pos) {
        res += data[pos];
    }
    return res;
}

/// Original, misspelled name of simd_accumulate, kept so that code written
/// against it keeps compiling. Forwards to simd_accumulate unchanged.
double simd_acucmulate(iterator_type begin, iterator_type end, double res)
{
    return simd_accumulate(begin, end, res);
}
	// Type your code here, or load an example.
	#include <vector>
	#include <algorithm>
	#include <numeric>
	#include <cimmintrin>

	using std::vector;
	using std::iota;
	using std::accumulate;

	/// Produces the benchmark input: 10000 consecutive floats 0.0f, 1.0f, ..., 9999.0f.
	///
	/// @return a freshly built vector holding the ascending sequence
	std::vector<float> generate_data()
	{
	    constexpr int element_count = 10000;

	    std::vector<float> values(element_count);
	    // Fill in place with 0, 1, 2, ... (each value is exactly representable as float).
	    std::iota(std::begin(values), std::end(values), 0.0f);
	    return values;
	}

	/// Baseline measurement: sums the generated data with std::accumulate.
	///
	/// @return the sum of every element produced by generate_data()
	double test1()
	{
	    const std::vector<float> samples = generate_data();
	    // A double seed makes std::accumulate carry the running sum as a double.
	    const double seed = 0.0;
	    return std::accumulate(samples.cbegin(), samples.cend(), seed);
	}

	double test2()
	{
	vector<float> v = generate_data();
	return simd_accumulate(v.begin(), v.end(), 0.0);
	}

	using iterator_type = std::vector<float>::iterator;

	/// Sums the floats in [begin, end) in double precision, starting from `res`.
	///
	/// When this translation unit is compiled for AVX (`__AVX__` defined), four
	/// floats per iteration are widened to doubles and added into four independent
	/// partial sums, which are folded into one scalar at the end. Leftover elements
	/// (and every element on non-AVX builds) go through a plain scalar loop.
	/// The AVX path adds in a different order than a left-to-right
	/// std::accumulate, so the result may differ in the last bits whenever the
	/// intermediate sums are not exactly representable.
	///
	/// @param begin iterator to the first element to add
	/// @param end   iterator one past the last element to add
	/// @param res   initial value of the sum
	/// @return `res` plus the sum of all elements in [begin, end)
	double simd_accumulate(iterator_type begin, iterator_type end, double res)
	{
	    if (begin == end) {
	        return res; // nothing to add; also keeps &*begin below well-defined
	    }

	    const float* const data = &*begin; // std::vector storage is contiguous
	    const std::size_t count = static_cast<std::size_t>(end - begin);
	    std::size_t pos = 0;

	#if defined(__AVX__)
	    __m256d partial = _mm256_setzero_pd(); // four running double sums

	    for (; pos + 4 <= count; pos += 4) {
	        // Unaligned load: nothing guarantees that data + pos is 16-byte aligned.
	        const __m128 four_floats = _mm_loadu_ps(data + pos);
	        partial = _mm256_add_pd(partial, _mm256_cvtps_pd(four_floats));
	    }

	    // partial holds {m, n, p, q}; fold it down to the scalar m + n + p + q.
	    const __m128d low = _mm256_castpd256_pd128(partial);    // {m, n}
	    const __m128d high = _mm256_extractf128_pd(partial, 1); // {p, q}
	    const __m128d pairs = _mm_add_pd(low, high);            // {m+p, n+q}
	    const __m128d upper = _mm_unpackhi_pd(pairs, pairs);    // {n+q, n+q}
	    res += _mm_cvtsd_f64(_mm_add_sd(pairs, upper));
	#endif

	    // Scalar tail: the last (count % 4) elements, or the whole range without AVX.
	    for (; pos < count; ++pos) {
	        res += data[pos];
	    }
	    return res;
	}

	/// Original, misspelled name of simd_accumulate, kept so that code written
	/// against it keeps compiling. Forwards to simd_accumulate unchanged.
	double simd_acucmulate(iterator_type begin, iterator_type end, double res)
	{
	    return simd_accumulate(begin, end, res);
	}
No results found