Created
May 13, 2019 14:13
-
-
Save lambday/5ee397939382ebebab8f22191c909e13 to your computer and use it in GitHub Desktop.
simd test
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Type your code here, or load an example. | |
#include <algorithm>
#include <numeric>
#include <vector>
#if defined(__AVX__)
#include <immintrin.h> // AVX intrinsics (the header is <immintrin.h>; <cimmintrin> does not exist)
#endif
using std::vector;
using std::iota;
using std::accumulate;
/// Builds the input data set: 10000 floats holding 0, 1, 2, ..., 9999 in
/// ascending order.
///
/// @return A freshly allocated vector with the sequence described above.
std::vector<float> generate_data()
{
    constexpr int element_count = 10000;
    std::vector<float> values(element_count);

    // Same effect as std::iota: store the running value, then step it by one.
    float next_value = 0.0f;
    for (float& slot : values) {
        slot = next_value;
        ++next_value;
    }
    return values;
}
| double test1() | |
| { | |
| vector<float> v = generate_data(); | |
| return accumulate(v.begin(), v.end(), 0.0); | |
| } | |
| double test2() | |
| { | |
| vector<float> v = generate_data(); | |
| return simd_accumulate(v.begin(), v.end(), 0.0); | |
| } | |
using iterator_type = std::vector<float>::iterator;

/// Sums the floats in [begin, end) in double precision, starting from `res`.
///
/// When the translation unit is compiled with AVX enabled (`__AVX__` defined,
/// e.g. `-mavx`), four floats are processed per iteration: they are loaded,
/// widened to four doubles and added lane-wise to a 256-bit accumulator.
/// Elements that do not fill a whole group of four, or all elements when AVX
/// is not enabled, are added by a plain scalar loop.
///
/// @param begin Iterator to the first element to add.
/// @param end   Iterator one past the last element to add.
/// @param res   Initial value of the sum.
/// @return `res` plus the sum of every element in the range; `res` itself for
///         an empty range.
///
/// @note The AVX path adds elements in a different order than a sequential
///       loop, so results may differ in the last bits for inputs whose
///       partial sums are not exactly representable.
double simd_accumulate(iterator_type begin, iterator_type end, double res)
{
    using diff_t = std::vector<float>::difference_type;

    const diff_t length = end - begin;
    if (length <= 0) {
        return res; // empty range: begin must not be dereferenced
    }

    // std::vector stores its elements contiguously, so a raw pointer to the
    // first element addresses the whole range.
    const float* data = &*begin;
    double total = res;
    diff_t i = 0;

#if defined(__AVX__)
    // Four partial sums, one per 64-bit lane: m, n, p, q.
    __m256d lanes = _mm256_setzero_pd();
    for (; i + 4 <= length; i += 4) {
        // Unaligned load: vector storage is not guaranteed to be 16/32-byte aligned.
        const __m128 four_floats = _mm_loadu_ps(data + i);
        lanes = _mm256_add_pd(lanes, _mm256_cvtps_pd(four_floats));
    }

    // Horizontal reduction of the four lanes to one scalar.
    const __m128d low_half = _mm256_castpd256_pd128(lanes);      // m,   n
    const __m128d high_half = _mm256_extractf128_pd(lanes, 1);   // p,   q
    const __m128d pair_sums = _mm_add_pd(low_half, high_half);   // m+p, n+q
    const __m128d upper = _mm_unpackhi_pd(pair_sums, pair_sums); // n+q, n+q
    total += _mm_cvtsd_f64(_mm_add_sd(pair_sums, upper));        // m+n+p+q
#endif

    // Scalar tail (and complete fallback when AVX is not enabled).
    for (; i < length; ++i) {
        total += static_cast<double>(data[i]);
    }
    return total;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment