Last active
November 14, 2025 16:51
-
-
Save 19h/f4d19f6154bd966ae494d4032ccf1a32 to your computer and use it in GitHub Desktop.
ida vs angr
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // avx_demo.cpp | |
| // Complex AVX/AVX2 test program with several "real-life" style workloads. | |
| #include <immintrin.h> | |
| #include <chrono> | |
| #include <cstdint> | |
| #include <cmath> | |
| #include <iostream> | |
| #include <vector> | |
| #include <cstring> | |
| #include <string> | |
| #if !defined(__AVX__) | |
| # error "This demo requires AVX support (compile with -mavx)." | |
| #endif | |
| #if !defined(__AVX2__) | |
| # warning "AVX2 not enabled; some integer AVX2 ops are not used in this demo." | |
| #endif | |
| #if !defined(__FMA__) | |
| # warning "FMA not enabled; FMA intrinsics will be emulated by the compiler." | |
| #endif | |
// Scope-bound stopwatch.
// Construction records the current time; destruction writes the elapsed
// time, in milliseconds, into the double supplied by the caller.
struct ScopedTimer
{
    // steady_clock is guaranteed monotonic, so the measured interval cannot go
    // backwards if the system time is adjusted while the timer is running.
    using clock = std::chrono::steady_clock;

    std::string label;       // caller-supplied name; stored but not read by the timer
    clock::time_point start; // time captured in the constructor
    double &out_ms;          // destination for the elapsed milliseconds

    // lbl:    name for this measurement.
    // ms_ref: receives the elapsed time when the timer is destroyed; it must
    //         outlive the timer.
    ScopedTimer(const std::string &lbl, double &ms_ref)
        : label(lbl), start(clock::now()), out_ms(ms_ref)
    {
    }

    // A copy would make two destructors write the same output slot.
    ScopedTimer(const ScopedTimer &) = delete;
    ScopedTimer &operator=(const ScopedTimer &) = delete;

    ~ScopedTimer()
    {
        // Convert straight to fractional milliseconds instead of truncating to
        // whole microseconds first.
        const std::chrono::duration<double, std::milli> elapsed = clock::now() - start;
        out_ms = elapsed.count();
    }
};
// Advances a 32-bit linear congruential generator by one step.
// `state` is updated in place (arithmetic wraps modulo 2^32) and the new
// state is also returned as the generated value.
static inline uint32_t lcg_next(uint32_t &state)
{
    const uint32_t multiplier = 1664525u;
    const uint32_t increment = 1013904223u;
    state = multiplier * state + increment;
    return state;
}
| static void fill_random(std::vector<float> &v, uint32_t seed = 0x12345678u) | |
| { | |
| uint32_t s = seed; | |
| for (float &x : v) | |
| { | |
| uint32_t r = lcg_next(s); | |
| // Map to [-1.0, 1.0] | |
| x = (static_cast<int32_t>(r) / 2147483648.0f); | |
| } | |
| } | |
// Reduces the eight float lanes of a __m256 to a single scalar sum.
// Order of additions: the two 128-bit halves are added lane-wise, then
// adjacent pairs, then the two pair sums.
static inline float hsum256_ps(__m256 v)
{
    const __m128 upper = _mm256_extractf128_ps(v, 1);
    const __m128 lower = _mm256_castps256_ps128(v);
    // quad[j] = v[j] + v[j + 4]
    const __m128 quad = _mm_add_ps(lower, upper);
    // movehdup copies the odd elements down, so after the add
    // element 0 holds quad[0] + quad[1] and element 2 holds quad[2] + quad[3].
    const __m128 pairs = _mm_add_ps(quad, _mm_movehdup_ps(quad));
    // movehl brings element 2 down to element 0; only element 0 is used below.
    const __m128 total = _mm_add_ss(pairs, _mm_movehl_ps(pairs, pairs));
    return _mm_cvtss_f32(total);
}
| // ============================================================================ | |
| // Workload 1: SAXPY and cosine similarity (vectorized finance/signal-processing) | |
| // ============================================================================ | |
// Scalar reference SAXPY: y[i] = a * x[i] + y[i] for i in [0, n).
// Updates `y` in place and returns the float sum of the updated `y`,
// accumulated front to back, as a checksum.
static float saxpy_scalar(float a, const float *x, float *y, size_t n)
{
    // Pass 1: apply the update.
    size_t idx = 0;
    while (idx < n)
    {
        y[idx] = a * x[idx] + y[idx];
        ++idx;
    }

    // Pass 2: checksum of the result.
    float total = 0.0f;
    for (const float *p = y, *end = y + n; p != end; ++p)
        total += *p;
    return total;
}
// AVX SAXPY: y[i] = a * x[i] + y[i] for i in [0, n).
// Processes eight floats per iteration with unaligned loads/stores, then
// finishes the remaining (n % 8) elements with scalar code.
// Updates `y` in place and returns the float sum of the updated `y`.
//
// The fused multiply-add is only used when the translation unit is built
// with FMA enabled; otherwise a separate multiply and add is used so that
// an AVX-only build (-mavx without -mfma) still compiles.
static float saxpy_avx(float a, const float *x, float *y, size_t n)
{
    const size_t step = 8;
    const __m256 av = _mm256_set1_ps(a);
    size_t i = 0;
    for (; i + step <= n; i += step)
    {
        const __m256 vx = _mm256_loadu_ps(x + i);
        const __m256 vy = _mm256_loadu_ps(y + i);
#if defined(__FMA__)
        const __m256 r = _mm256_fmadd_ps(av, vx, vy); // av*vx + vy, single rounding
#else
        const __m256 r = _mm256_add_ps(_mm256_mul_ps(av, vx), vy);
#endif
        _mm256_storeu_ps(y + i, r);
    }
    // tail: fewer than 8 elements left
    for (; i < n; ++i)
        y[i] = a * x[i] + y[i];

    // checksum, accumulated in the same order as the scalar reference
    float acc = 0.0f;
    for (size_t j = 0; j < n; ++j)
        acc += y[j];
    return acc;
}
// Scalar reference cosine similarity of x[0..n) and y[0..n).
// The dot product and both squared norms are accumulated in double.
// Returns dot / sqrt(|x|^2 * |y|^2) as a float, or 0 when that
// denominator is exactly zero (e.g. n == 0 or an all-zero input).
static float cosine_similarity_scalar(const float *x, const float *y, size_t n)
{
    double sum_xy = 0.0;
    double sum_xx = 0.0;
    double sum_yy = 0.0;
    for (size_t k = 0; k != n; ++k)
    {
        const double a = x[k];
        const double b = y[k];
        sum_xy += a * b;
        sum_xx += a * a;
        sum_yy += b * b;
    }

    const double denom = std::sqrt(sum_xx * sum_yy);
    return (denom == 0.0) ? 0.0f : static_cast<float>(sum_xy / denom);
}
| // Cosine similarity AVX | |
| static float cosine_similarity_avx(const float *x, const float *y, size_t n) | |
| { | |
| const size_t step = 8; | |
| size_t i = 0; | |
| __m256 dotv = _mm256_setzero_ps(); | |
| __m256 nxv = _mm256_setzero_ps(); | |
| __m256 nyv = _mm256_setzero_ps(); | |
| for (; i + step <= n; i += step) | |
| { | |
| __m256 vx = _mm256_loadu_ps(x + i); | |
| __m256 vy = _mm256_loadu_ps(y + i); | |
| __m256 prod = _mm256_mul_ps(vx, vy); | |
| dotv = _mm256_add_ps(dotv, prod); | |
| nxv = _mm256_fmadd_ps(vx, vx, nxv); | |
| nyv = _mm256_fmadd_ps(vy, vy, nyv); | |
| } | |
| float dot = hsum256_ps(dotv); | |
| float nx = hsum256_ps(nxv); | |
| float ny = hsum256_ps(nyv); | |
| for (; i < n; ++i) | |
| { | |
| float xi = x[i]; | |
| float yi = y[i]; | |
| dot += xi * yi; | |
| nx += xi * xi; | |
| ny += yi * yi; | |
| } | |
| double denom = std::sqrt(static_cast<double>(nx) * static_cast<double>(ny)); | |
| if (denom == 0.0) | |
| return 0.0f; | |
| return static_cast<float>(dot / denom); | |
| } | |
| // ============================================================================ | |
| // Workload 2: 2D image blur on float "image" (e.g., 1080p grayscale) | |
| // new[y,x] = 0.2 * (center + left + right + up + down) | |
| // ============================================================================ | |
// Row-major float image: `data` holds w * h values, row y starting at
// offset y * w.
struct ImageF
{
    int w;                   // width in pixels
    int h;                   // height in pixels
    std::vector<float> data; // w * h values, zero-initialised by the constructor

    ImageF(int width, int height)
        : w(width), h(height), data(static_cast<size_t>(width) * height)
    {
    }

    // Pointer to the first pixel of row `y` (no bounds checking).
    float *row(int y)
    {
        const size_t offset = static_cast<size_t>(y) * w;
        return data.data() + offset;
    }

    // Read-only variant of row().
    const float *row(int y) const
    {
        const size_t offset = static_cast<size_t>(y) * w;
        return data.data() + offset;
    }
};
| // Scalar blur | |
| static float blur5_scalar(const ImageF &src, ImageF &dst) | |
| { | |
| const int w = src.w; | |
| const int h = src.h; | |
| // Copy border as-is | |
| std::memcpy(dst.row(0), src.row(0), sizeof(float) * w); | |
| std::memcpy(dst.row(h - 1), src.row(h - 1), sizeof(float) * w); | |
| for (int y = 1; y < h - 1; ++y) | |
| { | |
| dst.row(y)[0] = src.row(y)[0]; | |
| dst.row(y)[w - 1] = src.row(y)[w - 1]; | |
| } | |
| float checksum = 0.0f; | |
| for (int y = 1; y < h - 1; ++y) | |
| { | |
| const float *row_c = src.row(y); | |
| const float *row_u = src.row(y - 1); | |
| const float *row_d = src.row(y + 1); | |
| float *row_o = dst.row(y); | |
| for (int x = 1; x < w - 1; ++x) | |
| { | |
| float val = row_c[x] | |
| + row_c[x - 1] | |
| + row_c[x + 1] | |
| + row_u[x] | |
| + row_d[x]; | |
| val *= 0.2f; | |
| row_o[x] = val; | |
| checksum += val; | |
| } | |
| } | |
| return checksum; | |
| } | |
| // AVX blur: processes interior pixels in 8-wide blocks | |
| static float blur5_avx(const ImageF &src, ImageF &dst) | |
| { | |
| const int w = src.w; | |
| const int h = src.h; | |
| std::memcpy(dst.row(0), src.row(0), sizeof(float) * w); | |
| std::memcpy(dst.row(h - 1), src.row(h - 1), sizeof(float) * w); | |
| for (int y = 1; y < h - 1; ++y) | |
| { | |
| dst.row(y)[0] = src.row(y)[0]; | |
| dst.row(y)[w - 1] = src.row(y)[w - 1]; | |
| } | |
| const __m256 scale = _mm256_set1_ps(0.2f); | |
| float checksum = 0.0f; | |
| for (int y = 1; y < h - 1; ++y) | |
| { | |
| const float *row_c = src.row(y); | |
| const float *row_u = src.row(y - 1); | |
| const float *row_d = src.row(y + 1); | |
| float *row_o = dst.row(y); | |
| int x = 1; | |
| const int max_x = w - 1; | |
| const int vec_end = max_x - 8 + 1; // last x for a full 8-wide block | |
| for (; x <= vec_end; x += 8) | |
| { | |
| __m256 vc = _mm256_loadu_ps(row_c + x); | |
| __m256 vl = _mm256_loadu_ps(row_c + x - 1); | |
| __m256 vr = _mm256_loadu_ps(row_c + x + 1); | |
| __m256 vu = _mm256_loadu_ps(row_u + x); | |
| __m256 vd = _mm256_loadu_ps(row_d + x); | |
| __m256 sum = _mm256_add_ps(vc, vl); | |
| sum = _mm256_add_ps(sum, vr); | |
| sum = _mm256_add_ps(sum, vu); | |
| sum = _mm256_add_ps(sum, vd); | |
| sum = _mm256_mul_ps(sum, scale); | |
| _mm256_storeu_ps(row_o + x, sum); | |
| checksum += hsum256_ps(sum); | |
| } | |
| // tail in this row | |
| for (; x < max_x; ++x) | |
| { | |
| float val = row_c[x] | |
| + row_c[x - 1] | |
| + row_c[x + 1] | |
| + row_u[x] | |
| + row_d[x]; | |
| val *= 0.2f; | |
| row_o[x] = val; | |
| checksum += val; | |
| } | |
| } | |
| return checksum; | |
| } | |
| // ============================================================================ | |
| // Workload 3: Complex multiply + FIR convolution with AVX/FMA | |
| // Complex data in SoA form (real[] and imag[] arrays) | |
| // ============================================================================ | |
// Complex sequence in structure-of-arrays layout: element i is
// re[i] + j * im[i]. Both arrays are kept at the same length by the
// constructor and resize().
struct ComplexSoA
{
    std::vector<float> re; // real parts
    std::vector<float> im; // imaginary parts

    // Creates `n` zero-valued complex samples (none by default).
    ComplexSoA(size_t n = 0)
        : re(n), im(n)
    {
    }

    // Resizes both component arrays to `n` elements.
    void resize(size_t n)
    {
        re.resize(n);
        im.resize(n);
    }

    // Number of complex samples, taken from the real-part array.
    size_t size() const
    {
        return re.size();
    }
};
| static void fill_complex(ComplexSoA &c, uint32_t seed = 0xCAFEBABEu) | |
| { | |
| uint32_t s = seed; | |
| for (size_t i = 0; i < c.size(); ++i) | |
| { | |
| uint32_t r1 = lcg_next(s); | |
| uint32_t r2 = lcg_next(s); | |
| c.re[i] = (static_cast<int32_t>(r1) / 2147483648.0f); | |
| c.im[i] = (static_cast<int32_t>(r2) / 2147483648.0f); | |
| } | |
| } | |
| // Complex multiply scalar: out = a * b | |
| static float complex_mul_scalar(const ComplexSoA &a, const ComplexSoA &b, ComplexSoA &out) | |
| { | |
| const size_t n = a.size(); | |
| out.resize(n); | |
| float checksum = 0.0f; | |
| for (size_t i = 0; i < n; ++i) | |
| { | |
| float ar = a.re[i]; | |
| float ai = a.im[i]; | |
| float br = b.re[i]; | |
| float bi = b.im[i]; | |
| float zr = ar * br - ai * bi; | |
| float zi = ar * bi + ai * br; | |
| out.re[i] = zr; | |
| out.im[i] = zi; | |
| checksum += zr * 0.5f + zi * 0.25f; | |
| } | |
| return checksum; | |
| } | |
| // Complex multiply AVX/FMA: out = a * b (SoA) | |
| static float complex_mul_avx(const ComplexSoA &a, const ComplexSoA &b, ComplexSoA &out) | |
| { | |
| const size_t n = a.size(); | |
| out.resize(n); | |
| const size_t step = 8; | |
| size_t i = 0; | |
| float checksum = 0.0f; | |
| for (; i + step <= n; i += step) | |
| { | |
| __m256 ar = _mm256_loadu_ps(a.re.data() + i); | |
| __m256 ai = _mm256_loadu_ps(a.im.data() + i); | |
| __m256 br = _mm256_loadu_ps(b.re.data() + i); | |
| __m256 bi = _mm256_loadu_ps(b.im.data() + i); | |
| // zr = ar*br - ai*bi | |
| __m256 zr = _mm256_fmsub_ps(ar, br, _mm256_mul_ps(ai, bi)); | |
| // zi = ar*bi + ai*br | |
| __m256 zi = _mm256_fmadd_ps(ar, bi, _mm256_mul_ps(ai, br)); | |
| _mm256_storeu_ps(out.re.data() + i, zr); | |
| _mm256_storeu_ps(out.im.data() + i, zi); | |
| // simple checksum: linear combination | |
| __m256 c1 = _mm256_set1_ps(0.5f); | |
| __m256 c2 = _mm256_set1_ps(0.25f); | |
| __m256 tmp = _mm256_add_ps(_mm256_mul_ps(zr, c1), _mm256_mul_ps(zi, c2)); | |
| checksum += hsum256_ps(tmp); | |
| } | |
| for (; i < n; ++i) | |
| { | |
| float ar = a.re[i]; | |
| float ai = a.im[i]; | |
| float br = b.re[i]; | |
| float bi = b.im[i]; | |
| float zr = ar * br - ai * bi; | |
| float zi = ar * bi + ai * br; | |
| out.re[i] = zr; | |
| out.im[i] = zi; | |
| checksum += zr * 0.5f + zi * 0.25f; | |
| } | |
| return checksum; | |
| } | |
| // FIR convolution (scalar) on ComplexSoA: | |
| // y[k] = sum_{i=0..L-1} h[i] * x[k-i], real-valued taps h. | |
| static float complex_fir_scalar(const ComplexSoA &x, const std::vector<float> &h, ComplexSoA &y) | |
| { | |
| const size_t n = x.size(); | |
| const size_t L = h.size(); | |
| y.resize(n); | |
| float checksum = 0.0f; | |
| for (size_t k = 0; k < n; ++k) | |
| { | |
| float acc_re = 0.0f; | |
| float acc_im = 0.0f; | |
| for (size_t i = 0; i < L; ++i) | |
| { | |
| if (k < i) | |
| break; | |
| float tap = h[i]; | |
| size_t idx = k - i; | |
| acc_re += tap * x.re[idx]; | |
| acc_im += tap * x.im[idx]; | |
| } | |
| y.re[k] = acc_re; | |
| y.im[k] = acc_im; | |
| checksum += acc_re * 0.75f + acc_im * 0.33f; | |
| } | |
| return checksum; | |
| } | |
| // FIR convolution AVX on ComplexSoA with real taps, unrolled over taps | |
| // Uses AVX for inner products over 8 samples at a time. | |
| static float complex_fir_avx(const ComplexSoA &x, const std::vector<float> &h, ComplexSoA &y) | |
| { | |
| const size_t n = x.size(); | |
| const size_t L = h.size(); | |
| y.resize(n); | |
| float checksum = 0.0f; | |
| for (size_t k = 0; k < n; ++k) | |
| { | |
| __m256 acc_re_vec = _mm256_setzero_ps(); | |
| __m256 acc_im_vec = _mm256_setzero_ps(); | |
| size_t i = 0; | |
| // Vectorized over taps in chunks of 8, but respecting bounds k-i >= 0. | |
| for (; i + 8 <= L; i += 8) | |
| { | |
| if (k + 1 < i + 8) | |
| break; // would underflow indexes | |
| // taps h[i..i+7] | |
| __m256 ht = _mm256_loadu_ps(h.data() + i); | |
| // indices x[k-i], reversed window: | |
| // idx0 = k-i | |
| // idx1 = k-(i+1) | |
| // ... | |
| // This reverse pattern is not contiguous, so here we use a simplified | |
| // strategy: approximate by convolving over forward indexes when possible. | |
| // For a realistic case you'd pre-reverse h or x into a contiguous buffer. | |
| size_t base = k - i - 7; | |
| if (base + 8 > n) | |
| continue; | |
| __m256 xr = _mm256_loadu_ps(x.re.data() + base); | |
| __m256 xi = _mm256_loadu_ps(x.im.data() + base); | |
| acc_re_vec = _mm256_fmadd_ps(xr, ht, acc_re_vec); | |
| acc_im_vec = _mm256_fmadd_ps(xi, ht, acc_im_vec); | |
| } | |
| float acc_re = hsum256_ps(acc_re_vec); | |
| float acc_im = hsum256_ps(acc_im_vec); | |
| // scalar remainder over taps (including underflow-safe region) | |
| for (; i < L; ++i) | |
| { | |
| if (k < i) | |
| break; | |
| float tap = h[i]; | |
| size_t idx = k - i; | |
| acc_re += tap * x.re[idx]; | |
| acc_im += tap * x.im[idx]; | |
| } | |
| y.re[k] = acc_re; | |
| y.im[k] = acc_im; | |
| checksum += acc_re * 0.75f + acc_im * 0.33f; | |
| } | |
| return checksum; | |
| } | |
| // ============================================================================ | |
| // Utility: soft clip / limiter using AVX compare + blend | |
| // y = clip(x, -threshold, +threshold) | |
| // ============================================================================ | |
// Scalar reference limiter: clamps each x[i] in place.
// Values above `threshold` become `threshold`; otherwise values below
// `-threshold` become `-threshold`; everything else is left unchanged.
// Returns the float sum of the clamped values.
static float soft_clip_scalar(float *x, size_t n, float threshold)
{
    const float upper = threshold;
    const float lower = -threshold;

    float total = 0.0f;
    for (float *p = x, *end = x + n; p != end; ++p)
    {
        const float in = *p;
        // The upper bound is tested first, as in the branch form.
        const float out = (in > upper) ? upper
                        : (in < lower) ? lower
                                       : in;
        *p = out;
        total += out;
    }
    return total;
}
| static float soft_clip_avx(float *x, size_t n, float threshold) | |
| { | |
| const size_t step = 8; | |
| size_t i = 0; | |
| __m256 th = _mm256_set1_ps(threshold); | |
| __m256 nth = _mm256_set1_ps(-threshold); | |
| float sum = 0.0f; | |
| for (; i + step <= n; i += step) | |
| { | |
| __m256 v = _mm256_loadu_ps(x + i); | |
| // clamp high: v = min(v, th) | |
| __m256 v_hi = _mm256_min_ps(v, th); | |
| // clamp low: v = max(v_hi, -th) | |
| __m256 v_clamped = _mm256_max_ps(v_hi, nth); | |
| _mm256_storeu_ps(x + i, v_clamped); | |
| sum += hsum256_ps(v_clamped); | |
| } | |
| for (; i < n; ++i) | |
| { | |
| float v = x[i]; | |
| if (v > threshold) | |
| v = threshold; | |
| else if (v < -threshold) | |
| v = -threshold; | |
| x[i] = v; | |
| sum += v; | |
| } | |
| return sum; | |
| } | |
| // ============================================================================ | |
| // Main | |
| // ============================================================================ | |
| int main() | |
| { | |
| // 1) Vector workloads: SAXPY + cosine similarity | |
| const size_t N_vec = 1u << 22; // 4M floats (~16 MiB per vector) | |
| std::vector<float> x(N_vec), y(N_vec), y2(N_vec), z(N_vec), z2(N_vec); | |
| fill_random(x, 0x11111111u); | |
| fill_random(y, 0x22222222u); | |
| fill_random(z, 0x33333333u); | |
| z2 = z; // copy for AVX path | |
| std::cout << "=== Workload 1: SAXPY + cosine similarity ===\n"; | |
| double ms_saxpy_scalar = 0.0, ms_saxpy_avx = 0.0; | |
| float saxpy_cs_scalar = 0.0f, saxpy_cs_avx = 0.0f; | |
| { | |
| y2 = y; | |
| ScopedTimer t("saxpy_scalar", ms_saxpy_scalar); | |
| saxpy_cs_scalar = saxpy_scalar(1.2345f, x.data(), y2.data(), N_vec); | |
| } | |
| { | |
| y2 = y; | |
| ScopedTimer t("saxpy_avx", ms_saxpy_avx); | |
| saxpy_cs_avx = saxpy_avx(1.2345f, x.data(), y2.data(), N_vec); | |
| } | |
| double ms_cos_scalar = 0.0, ms_cos_avx = 0.0; | |
| float cos_scalar = 0.0f, cos_avx = 0.0f; | |
| { | |
| ScopedTimer t("cosine_scalar", ms_cos_scalar); | |
| cos_scalar = cosine_similarity_scalar(x.data(), z.data(), N_vec); | |
| } | |
| { | |
| ScopedTimer t("cosine_avx", ms_cos_avx); | |
| cos_avx = cosine_similarity_avx(x.data(), z2.data(), N_vec); | |
| } | |
| std::cout << "SAXPY scalar: checksum=" << saxpy_cs_scalar << " time=" << ms_saxpy_scalar << " ms\n"; | |
| std::cout << "SAXPY AVX : checksum=" << saxpy_cs_avx << " time=" << ms_saxpy_avx << " ms\n"; | |
| std::cout << "Cosine scalar: value=" << cos_scalar << " time=" << ms_cos_scalar << " ms\n"; | |
| std::cout << "Cosine AVX : value=" << cos_avx << " time=" << ms_cos_avx << " ms\n"; | |
| std::cout << "--------------------------------------------------------\n\n"; | |
| // 2) Image blur (1080p) | |
| const int W = 1920; | |
| const int H = 1080; | |
| ImageF img(W, H); | |
| ImageF blur_ref(W, H); | |
| ImageF blur_avx(W, H); | |
| fill_random(img.data, 0xA5A5A5A5u); | |
| std::cout << "=== Workload 2: 2D 5-point blur on 1080p image ===\n"; | |
| double ms_blur_scalar = 0.0, ms_blur_avx = 0.0; | |
| float cs_blur_scalar = 0.0f, cs_blur_avx = 0.0f; | |
| { | |
| ScopedTimer t("blur_scalar", ms_blur_scalar); | |
| cs_blur_scalar = blur5_scalar(img, blur_ref); | |
| } | |
| { | |
| ScopedTimer t("blur_avx", ms_blur_avx); | |
| cs_blur_avx = blur5_avx(img, blur_avx); | |
| } | |
| std::cout << "Blur scalar: checksum=" << cs_blur_scalar << " time=" << ms_blur_scalar << " ms\n"; | |
| std::cout << "Blur AVX : checksum=" << cs_blur_avx << " time=" << ms_blur_avx << " ms\n"; | |
| // Quick consistency check: difference in checksum | |
| std::cout << "Checksum delta (AVX - scalar): " | |
| << (cs_blur_avx - cs_blur_scalar) << "\n"; | |
| std::cout << "--------------------------------------------------------\n\n"; | |
| // 3) Complex workloads | |
| const size_t N_cplx = 1u << 18; // 262,144 complex samples | |
| ComplexSoA a(N_cplx), b(N_cplx), c_ref, c_avx, fir_ref, fir_avx; | |
| fill_complex(a, 0x1234ABCDu); | |
| fill_complex(b, 0x9876FEDCu); | |
| // FIR taps (e.g. 16-tap low-pass prototype) | |
| const size_t L = 16; | |
| std::vector<float> taps(L); | |
| for (size_t i = 0; i < L; ++i) | |
| { | |
| // simple symmetric shape | |
| float x_rel = (static_cast<float>(i) - (L - 1) / 2.0f) / (L / 2.0f); | |
| float win = 0.5f - 0.5f * std::cos(3.14159265358979323846f * (i + 0.5f) / L); | |
| taps[i] = win * std::exp(-x_rel * x_rel); | |
| } | |
| std::cout << "=== Workload 3: Complex multiply + FIR convolution ===\n"; | |
| double ms_cmul_scalar = 0.0, ms_cmul_avx = 0.0; | |
| float cs_cmul_scalar = 0.0f, cs_cmul_avx = 0.0f; | |
| { | |
| ScopedTimer t("complex_mul_scalar", ms_cmul_scalar); | |
| cs_cmul_scalar = complex_mul_scalar(a, b, c_ref); | |
| } | |
| { | |
| ScopedTimer t("complex_mul_avx", ms_cmul_avx); | |
| cs_cmul_avx = complex_mul_avx(a, b, c_avx); | |
| } | |
| double ms_fir_scalar = 0.0, ms_fir_avx = 0.0; | |
| float cs_fir_scalar = 0.0f, cs_fir_avx = 0.0f; | |
| { | |
| ScopedTimer t("complex_fir_scalar", ms_fir_scalar); | |
| cs_fir_scalar = complex_fir_scalar(a, taps, fir_ref); | |
| } | |
| { | |
| ScopedTimer t("complex_fir_avx", ms_fir_avx); | |
| cs_fir_avx = complex_fir_avx(a, taps, fir_avx); | |
| } | |
| std::cout << "Complex mul scalar: checksum=" << cs_cmul_scalar << " time=" << ms_cmul_scalar << " ms\n"; | |
| std::cout << "Complex mul AVX : checksum=" << cs_cmul_avx << " time=" << ms_cmul_avx << " ms\n"; | |
| std::cout << "FIR scalar : checksum=" << cs_fir_scalar << " time=" << ms_fir_scalar << " ms\n"; | |
| std::cout << "FIR AVX : checksum=" << cs_fir_avx << " time=" << ms_fir_avx << " ms\n"; | |
| std::cout << "Delta cmul checksum (AVX - scalar): " << (cs_cmul_avx - cs_cmul_scalar) << "\n"; | |
| std::cout << "Delta FIR checksum (AVX - scalar): " << (cs_fir_avx - cs_fir_scalar) << "\n"; | |
| std::cout << "--------------------------------------------------------\n\n"; | |
| // 4) Soft clipping on FIR output (just to exercise AVX clamp / min / max) | |
| std::cout << "=== Workload 4: Soft clip / limiter on FIR output ===\n"; | |
| double ms_clip_scalar = 0.0, ms_clip_avx = 0.0; | |
| float cs_clip_scalar = 0.0f, cs_clip_avx = 0.0f; | |
| // Pack FIR real part into separate working buffer | |
| std::vector<float> fir_real = fir_ref.re; | |
| std::vector<float> fir_real2 = fir_real; | |
| { | |
| ScopedTimer t("soft_clip_scalar", ms_clip_scalar); | |
| cs_clip_scalar = soft_clip_scalar(fir_real.data(), fir_real.size(), 0.8f); | |
| } | |
| { | |
| ScopedTimer t("soft_clip_avx", ms_clip_avx); | |
| cs_clip_avx = soft_clip_avx(fir_real2.data(), fir_real2.size(), 0.8f); | |
| } | |
| std::cout << "Soft clip scalar: checksum=" << cs_clip_scalar << " time=" << ms_clip_scalar << " ms\n"; | |
| std::cout << "Soft clip AVX : checksum=" << cs_clip_avx << " time=" << ms_clip_avx << " ms\n"; | |
| std::cout << "Delta clip checksum (AVX - scalar): " | |
| << (cs_clip_avx - cs_clip_scalar) << "\n"; | |
| std::cout << "\nDone.\n"; | |
| return 0; | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| int __fastcall main(int argc, const char **argv, const char **envp) | |
| { | |
| __int64 v9; // rdx | |
| __int64 v10; // rdx | |
| __int64 v11; // rdx | |
| __int64 v12; // rdx | |
| __int64 v14; // rcx | |
| int v17; // eax | |
| __int64 v20; // rcx | |
| int v21; // eax | |
| __int64 v24; // rcx | |
| int v26; // eax | |
| __int64 v34; // rdx | |
| __int64 v50; // rdx | |
| __int64 v62; // rax | |
| double v110; // xmm4_8 | |
| __int64 v113; // rbx | |
| double v114; // xmm4_8 | |
| __int64 v116; // rax | |
| double v117; // xmm4_8 | |
| __int64 v120; // rbx | |
| double v121; // xmm4_8 | |
| __int64 v123; // rax | |
| double v124; // xmm4_8 | |
| __int64 v127; // rbx | |
| double v128; // xmm4_8 | |
| __int64 v130; // rax | |
| double v131; // xmm4_8 | |
| __int64 v134; // rbx | |
| double v135; // xmm4_8 | |
| __int64 v137; // rax | |
| __int64 v138; // rdx | |
| __int64 v139; // rdx | |
| __int64 v140; // rdx | |
| char *v142; // rcx | |
| int v144; // eax | |
| int v146; // ebx | |
| size_t v147; // r14 | |
| int v148; // ecx | |
| __int64 v149; // rbx | |
| char *v150; // r12 | |
| int v151; // r15d | |
| int v153; // ecx | |
| __int64 v157; // r15 | |
| unsigned int v158; // r11d | |
| char *v160; // rsi | |
| char *v161; // rdx | |
| size_t v162; // r12 | |
| __int64 v163; // r14 | |
| int v187; // r9d | |
| __int64 v188; // rdi | |
| int v189; // r10d | |
| signed int v202; // edi | |
| int v218; // edi | |
| __int64 v235; // r15 | |
| int v236; // ecx | |
| __int64 v240; // rbx | |
| __int64 v243; // r15 | |
| __int64 v244; // r9 | |
| int v260; // r11d | |
| int v262; // r10d | |
| int v265; // edi | |
| __int64 v278; // rdi | |
| int v279; // r10d | |
| int v280; // r9d | |
| double v305; // xmm4_8 | |
| __int64 v307; // rbx | |
| double v308; // xmm4_8 | |
| __int64 v310; // rax | |
| double v311; // xmm4_8 | |
| __int64 v313; // rbx | |
| double v314; // xmm4_8 | |
| __int64 v316; // rax | |
| double v317; // xmm4_8 | |
| __int64 v321; // rax | |
| __int64 v322; // rdx | |
| __int64 v340; // r15 | |
| unsigned __int64 v341; // r13 | |
| unsigned __int64 v370; // r9 | |
| __int64 v443; // rcx | |
| unsigned __int64 v444; // r12 | |
| bool v445; // cf | |
| bool v446; // zf | |
| bool v448; // r11 | |
| unsigned __int64 v449; // r14 | |
| unsigned __int64 v455; // r11 | |
| unsigned __int64 v456; // rdx | |
| unsigned __int64 v520; // r15 | |
| unsigned __int64 v521; // r14 | |
| __int64 v526; // rdx | |
| unsigned __int64 v533; // rdi | |
| __int64 v557; // rdx | |
| double v559; // xmm4_8 | |
| __int64 v561; // rbx | |
| double v562; // xmm4_8 | |
| __int64 v564; // rax | |
| double v565; // xmm4_8 | |
| __int64 v567; // rbx | |
| double v568; // xmm4_8 | |
| __int64 v570; // rax | |
| double v571; // xmm4_8 | |
| __int64 v573; // rbx | |
| double v574; // xmm4_8 | |
| __int64 v576; // rax | |
| double v577; // xmm4_8 | |
| __int64 v579; // rbx | |
| double v580; // xmm4_8 | |
| __int64 v582; // rax | |
| double v583; // xmm4_8 | |
| __int64 v587; // rax | |
| double v588; // xmm4_8 | |
| __int64 v592; // rax | |
| __int64 v594; // rsi | |
| unsigned __int64 v595; // rcx | |
| unsigned __int64 v596; // rdi | |
| __int64 v597; // rax | |
| unsigned __int64 v598; // rdx | |
| __int64 v603; // rcx | |
| unsigned __int64 v623; // rcx | |
| unsigned __int64 v624; // rax | |
| unsigned __int64 v625; // rdx | |
| bool v626; // cc | |
| __int64 v642; // rax | |
| unsigned __int64 v647; // rcx | |
| unsigned __int64 v652; // rsi | |
| unsigned __int64 v665; // rdx | |
| unsigned __int64 v681; // rdi | |
| unsigned __int64 v685; // r8 | |
| unsigned __int64 v691; // rdx | |
| bool v692; // cc | |
| int v698; // edx | |
| __int64 v709; // rdx | |
| double v716; // xmm4_8 | |
| __int64 v718; // rbx | |
| double v719; // xmm4_8 | |
| __int64 v721; // rax | |
| double v722; // xmm4_8 | |
| __int64 v724; // rbx | |
| double v725; // xmm4_8 | |
| __int64 v727; // rax | |
| double v728; // xmm4_8 | |
| __int64 v732; // rax | |
| int v854; // ecx | |
| int v860; // esi | |
| char *v864; // [rsp-490h] [rbp-490h] | |
| __int64 v865; // [rsp-488h] [rbp-488h] | |
| unsigned int v866; // [rsp-450h] [rbp-450h] | |
| int v867; // [rsp-44Ch] [rbp-44Ch] | |
| int v868; // [rsp-438h] [rbp-438h] | |
| __int64 v869; // [rsp-438h] [rbp-438h] | |
| unsigned __int64 v870; // [rsp-438h] [rbp-438h] | |
| char *v871; // [rsp-430h] [rbp-430h] | |
| int v872; // [rsp-430h] [rbp-430h] | |
| __int64 v873; // [rsp-428h] [rbp-428h] | |
| __int64 v874; // [rsp-428h] [rbp-428h] | |
| __int64 v875; // [rsp-420h] [rbp-420h] | |
| __int64 v876; // [rsp-420h] [rbp-420h] | |
| int v877; // [rsp-418h] [rbp-418h] | |
| __int64 v878; // [rsp-418h] [rbp-418h] | |
| __int64 v879; // [rsp-410h] [rbp-410h] | |
| int v880; // [rsp-408h] [rbp-408h] | |
| unsigned __int64 v881; // [rsp-408h] [rbp-408h] | |
| __int64 v882; // [rsp-400h] [rbp-400h] | |
| int v883; // [rsp-3F0h] [rbp-3F0h] | |
| unsigned int v884; // [rsp-3E8h] [rbp-3E8h] | |
| __int64 v885; // [rsp-3E0h] [rbp-3E0h] | |
| int v886; // [rsp-3E0h] [rbp-3E0h] | |
| __int64 v887; // [rsp-3D0h] [rbp-3D0h] | |
| int v888; // [rsp-3C8h] [rbp-3C8h] | |
| __int64 v889; // [rsp-3C0h] [rbp-3C0h] | |
| char *v890; // [rsp-3C0h] [rbp-3C0h] | |
| __int64 v891; // [rsp-3C0h] [rbp-3C0h] | |
| char *v892; // [rsp-3B8h] [rbp-3B8h] | |
| __int64 v893; // [rsp-3B8h] [rbp-3B8h] | |
| __int64 v894; // [rsp-3B8h] [rbp-3B8h] | |
| int v895; // [rsp-3B0h] [rbp-3B0h] | |
| __int64 v896; // [rsp-3B0h] [rbp-3B0h] | |
| __int64 v897; // [rsp-3B0h] [rbp-3B0h] | |
| __int64 v898; // [rsp-3A8h] [rbp-3A8h] | |
| __int64 v899; // [rsp-3A0h] [rbp-3A0h] | |
| unsigned __int64 v900; // [rsp-398h] [rbp-398h] | |
| __int64 v901; // [rsp-390h] [rbp-390h] | |
| __int64 v902; // [rsp-380h] [rbp-380h] BYREF | |
| __int64 v903; // [rsp-378h] [rbp-378h] BYREF | |
| __int64 v904; // [rsp-370h] [rbp-370h] BYREF | |
| __int64 v905; // [rsp-368h] [rbp-368h] BYREF | |
| __int64 v906; // [rsp-360h] [rbp-360h] BYREF | |
| __int64 v907; // [rsp-358h] [rbp-358h] BYREF | |
| __int64 v908; // [rsp-350h] [rbp-350h] BYREF | |
| __int64 v909; // [rsp-348h] [rbp-348h] BYREF | |
| __int64 v910; // [rsp-340h] [rbp-340h] BYREF | |
| __int64 v911; // [rsp-338h] [rbp-338h] BYREF | |
| __int64 v912; // [rsp-330h] [rbp-330h] BYREF | |
| __int64 v913; // [rsp-328h] [rbp-328h] BYREF | |
| __int64 v914; // [rsp-320h] [rbp-320h] BYREF | |
| __int64 v915; // [rsp-318h] [rbp-318h] | |
| __int64 v916; // [rsp-300h] [rbp-300h] BYREF | |
| __int64 v917; // [rsp-2F8h] [rbp-2F8h] | |
| _QWORD v918[4]; // [rsp-2E0h] [rbp-2E0h] BYREF | |
| __int64 v919; // [rsp-2C0h] [rbp-2C0h] BYREF | |
| __int64 v920; // [rsp-2B8h] [rbp-2B8h] | |
| _QWORD v921[4]; // [rsp-2A0h] [rbp-2A0h] BYREF | |
| __int64 v922; // [rsp-280h] [rbp-280h] BYREF | |
| __int64 v923; // [rsp-278h] [rbp-278h] | |
| __int64 v924; // [rsp-260h] [rbp-260h] BYREF | |
| __int64 v925; // [rsp-258h] [rbp-258h] | |
| __int64 v926; // [rsp-240h] [rbp-240h] BYREF | |
| __int64 v927; // [rsp-238h] [rbp-238h] | |
| __int64 v928; // [rsp-220h] [rbp-220h] | |
| char *v929; // [rsp-218h] [rbp-218h] BYREF | |
| char *v930; // [rsp-210h] [rbp-210h] | |
| __int64 v931; // [rsp-200h] [rbp-200h] | |
| void *v932[3]; // [rsp-1F8h] [rbp-1F8h] BYREF | |
| __int64 v933; // [rsp-1E0h] [rbp-1E0h] | |
| void *v934[3]; // [rsp-1D8h] [rbp-1D8h] BYREF | |
| __int64 v935; // [rsp-1C0h] [rbp-1C0h] BYREF | |
| __int64 v936; // [rsp-1B8h] [rbp-1B8h] | |
| __int64 v937; // [rsp-1A8h] [rbp-1A8h] BYREF | |
| _QWORD v938[3]; // [rsp-190h] [rbp-190h] BYREF | |
| __int64 v939; // [rsp-178h] [rbp-178h] BYREF | |
| _QWORD v940[3]; // [rsp-160h] [rbp-160h] BYREF | |
| _QWORD v941[3]; // [rsp-148h] [rbp-148h] BYREF | |
| _QWORD v942[3]; // [rsp-130h] [rbp-130h] BYREF | |
| _QWORD v943[3]; // [rsp-118h] [rbp-118h] BYREF | |
| _QWORD v944[3]; // [rsp-100h] [rbp-100h] BYREF | |
| _QWORD v945[3]; // [rsp-E8h] [rbp-E8h] BYREF | |
| _QWORD v946[3]; // [rsp-D0h] [rbp-D0h] BYREF | |
| _QWORD v947[3]; // [rsp-B8h] [rbp-B8h] BYREF | |
| _BYTE v948[32]; // [rsp-A0h] [rbp-A0h] BYREF | |
| _QWORD v949[14]; // [rsp-80h] [rbp-80h] BYREF | |
| _QWORD v950[2]; // [rsp-10h] [rbp-10h] BYREF | |
| _UNKNOWN *retaddr; // [rsp+0h] [rbp+0h] | |
| char v952; // [rsp+8h] [rbp+8h] BYREF | |
| v950[1] = retaddr; | |
| _RBP = v950; | |
| v949[9] = &v952; | |
| v949[7] = __readfsqword(0x28u); | |
| std::vector<float>::vector(&v914, 0x400000, envp); | |
| std::vector<float>::vector(&v916, 0x400000, v9); | |
| std::vector<float>::vector(v918, 0x400000, v10); | |
| std::vector<float>::vector(&v919, 0x400000, v11); | |
| std::vector<float>::vector(v921, 0x400000, v12); | |
| _RBX = v914; | |
| v14 = v915; | |
| if ( v915 != v914 ) | |
| { | |
| __asm { vmovss xmm1, cs:dword_6004 } | |
| _RDX = v914; | |
| v17 = 286331153; | |
| do | |
| { | |
| __asm { vxorps xmm3, xmm3, xmm3 } | |
| _RDX += 4; | |
| v17 = 1664525 * v17 + 1013904223; | |
| __asm | |
| { | |
| vcvtsi2ss xmm0, xmm3, eax | |
| vmulss xmm0, xmm0, xmm1 | |
| vmovss dword ptr [rdx-4], xmm0 | |
| } | |
| } | |
| while ( v14 != _RDX ); | |
| } | |
| _RDX = v916; | |
| v20 = v917; | |
| v21 = 572662306; | |
| __asm { vmovss xmm1, cs:dword_6004 } | |
| if ( v917 != v916 ) | |
| { | |
| do | |
| { | |
| __asm { vxorps xmm3, xmm3, xmm3 } | |
| _RDX += 4; | |
| v21 = 1664525 * v21 + 1013904223; | |
| __asm | |
| { | |
| vcvtsi2ss xmm0, xmm3, eax | |
| vmulss xmm0, xmm0, xmm1 | |
| vmovss dword ptr [rdx-4], xmm0 | |
| } | |
| } | |
| while ( _RDX != v20 ); | |
| } | |
| v24 = v920; | |
| if ( v920 != v919 ) | |
| { | |
| __asm { vmovss xmm1, cs:dword_6004 } | |
| _RDX = v919; | |
| v26 = 858993459; | |
| do | |
| { | |
| __asm { vxorps xmm3, xmm3, xmm3 } | |
| _RDX += 4; | |
| v26 = 1664525 * v26 + 1013904223; | |
| __asm | |
| { | |
| vcvtsi2ss xmm0, xmm3, eax | |
| vmulss xmm0, xmm0, xmm1 | |
| vmovss dword ptr [rdx-4], xmm0 | |
| } | |
| } | |
| while ( v24 != _RDX ); | |
| } | |
| std::vector<float>::operator=(v921, &v919, *(double *)&_XMM0, *(double *)&_XMM1); | |
| std::operator<<<std::char_traits<char>>(&std::cout, "=== Workload 1: SAXPY + cosine similarity ===\n"); | |
| v902 = 0; | |
| v903 = 0; | |
| std::vector<float>::operator=(v918, &v916, *(double *)&_XMM0, *(double *)&_XMM1); | |
| std::string::basic_string<std::allocator<char>>(v948, "saxpy_scalar"); | |
| ScopedTimer::ScopedTimer(v949, v948, &v902); | |
| std::string::_M_dispose(v948); | |
| _RAX = v918[0]; | |
| _RDX = 0; | |
| if ( (unsigned __int64)(v918[0] - (_RBX + 4)) <= 0x18 ) | |
| { | |
| __asm { vmovss xmm1, cs:dword_6014 } | |
| do | |
| { | |
| __asm | |
| { | |
| vmovss xmm0, dword ptr [rbx+rdx*4] | |
| vfmadd213ss xmm0, xmm1, dword ptr [rax+rdx*4] | |
| vmovss dword ptr [rax+rdx*4], xmm0 | |
| } | |
| ++_RDX; | |
| } | |
| while ( _RDX != 0x400000 ); | |
| } | |
| else | |
| { | |
| __asm { vbroadcastss ymm1, cs:dword_6014 } | |
| do | |
| { | |
| __asm | |
| { | |
| vmovups ymm0, ymmword ptr [rbx+rdx] | |
| vfmadd213ps ymm0, ymm1, ymmword ptr [rax+rdx] | |
| vmovups ymmword ptr [rax+rdx], ymm0 | |
| } | |
| _RDX += 32; | |
| } | |
| while ( _RDX != 0x1000000 ); | |
| __asm { vzeroupper } | |
| } | |
| _R13D = 0; | |
| v34 = _RAX + 0x1000000; | |
| __asm { vmovd xmm0, r13d } | |
| do | |
| { | |
| __asm { vaddss xmm0, xmm0, dword ptr [rax] } | |
| _RAX += 32; | |
| __asm | |
| { | |
| vaddss xmm0, xmm0, dword ptr [rax-1Ch] | |
| vaddss xmm0, xmm0, dword ptr [rax-18h] | |
| vaddss xmm0, xmm0, dword ptr [rax-14h] | |
| vaddss xmm0, xmm0, dword ptr [rax-10h] | |
| vaddss xmm0, xmm0, dword ptr [rax-0Ch] | |
| vaddss xmm0, xmm0, dword ptr [rax-8] | |
| vaddss xmm0, xmm0, dword ptr [rax-4] | |
| } | |
| } | |
| while ( _RAX != v34 ); | |
| __asm { vmovd r13d, xmm0 } | |
| ScopedTimer::~ScopedTimer((ScopedTimer *)v949); | |
| std::vector<float>::operator=(v918, &v916, *(double *)&_XMM0, *(double *)&_XMM1); | |
| std::string::basic_string<std::allocator<char>>(v948, "saxpy_avx"); | |
| ScopedTimer::ScopedTimer(v949, v948, &v903); | |
| std::string::_M_dispose(v948); | |
| _RAX = v918[0]; | |
| _RDX = 8; | |
| __asm { vbroadcastss ymm1, cs:dword_6014 } | |
| do | |
| { | |
| __asm | |
| { | |
| vmovups ymm0, ymmword ptr [rbx+rdx*4-20h] | |
| vfmadd213ps ymm0, ymm1, ymmword ptr [rax+rdx*4-20h] | |
| vmovups ymmword ptr [rax+rdx*4-20h], ymm0 | |
| } | |
| _RDX += 8; | |
| } | |
| while ( _RDX != 4194312 ); | |
| _R14D = 0; | |
| v50 = _RAX + 0x1000000; | |
| __asm { vmovd xmm0, r14d } | |
| do | |
| { | |
| __asm { vaddss xmm0, xmm0, dword ptr [rax] } | |
| _RAX += 32; | |
| __asm | |
| { | |
| vaddss xmm0, xmm0, dword ptr [rax-1Ch] | |
| vaddss xmm0, xmm0, dword ptr [rax-18h] | |
| vaddss xmm0, xmm0, dword ptr [rax-14h] | |
| vaddss xmm0, xmm0, dword ptr [rax-10h] | |
| vaddss xmm0, xmm0, dword ptr [rax-0Ch] | |
| vaddss xmm0, xmm0, dword ptr [rax-8] | |
| vaddss xmm0, xmm0, dword ptr [rax-4] | |
| } | |
| } | |
| while ( v50 != _RAX ); | |
| __asm | |
| { | |
| vmovd r14d, xmm0 | |
| vzeroupper | |
| } | |
| ScopedTimer::~ScopedTimer((ScopedTimer *)v949); | |
| v904 = 0; | |
| v905 = 0; | |
| std::string::basic_string<std::allocator<char>>(v948, "cosine_scalar"); | |
| ScopedTimer::ScopedTimer(v949, v948, &v904); | |
| std::string::_M_dispose(v948); | |
| __asm { vxorpd xmm4, xmm4, xmm4 } | |
| v62 = 0; | |
| __asm | |
| { | |
| vmovsd xmm2, xmm4, xmm4 | |
| vmovsd xmm3, xmm4, xmm4 | |
| } | |
| do | |
| { | |
| __asm | |
| { | |
| vxorpd xmm7, xmm7, xmm7 | |
| vcvtss2sd xmm1, xmm7, dword ptr [rbx+rax*4] | |
| vcvtss2sd xmm0, xmm7, dword ptr [r12+rax*4] | |
| } | |
| ++v62; | |
| __asm | |
| { | |
| vfmadd231sd xmm3, xmm1, xmm0 | |
| vfmadd231sd xmm2, xmm1, xmm1 | |
| vfmadd231sd xmm4, xmm0, xmm0 | |
| } | |
| } | |
| while ( v62 != 0x400000 ); | |
| __asm | |
| { | |
| vmulsd xmm0, xmm2, xmm4; x | |
| vxorpd xmm1, xmm1, xmm1 | |
| vucomisd xmm1, xmm0 | |
| } | |
| __asm | |
| { | |
| vsqrtsd xmm0, xmm0, xmm0 | |
| vxorpd xmm1, xmm1, xmm1 | |
| vucomisd xmm0, xmm1 | |
| } | |
| if ( __SETP__(v62, 0x400000) ) | |
| { | |
| __asm | |
| { | |
| vdivsd xmm3, xmm3, xmm0 | |
| vcvtsd2ss xmm3, xmm3, xmm3 | |
| vmovd r12d, xmm3 | |
| } | |
| } | |
| else | |
| { | |
| _R12D = 0; | |
| } | |
| ScopedTimer::~ScopedTimer((ScopedTimer *)v949); | |
| std::string::basic_string<std::allocator<char>>(v948, "cosine_avx"); | |
| ScopedTimer::ScopedTimer(v949, v948, &v905); | |
| std::string::_M_dispose(v948); | |
| __asm { vxorps xmm2, xmm2, xmm2 } | |
| _RDX = v921[0]; | |
| _RAX = 0; | |
| __asm | |
| { | |
| vmovaps ymm3, ymm2 | |
| vmovaps ymm4, ymm2 | |
| } | |
| while ( 1 ) | |
| { | |
| _RAX += 8; | |
| if ( _RAX == 4194312 ) | |
| break; | |
| __asm | |
| { | |
| vmovups ymm1, ymmword ptr [rbx+rax*4-20h] | |
| vmovups ymm0, ymmword ptr [rdx+rax*4-20h] | |
| vfmadd231ps ymm3, ymm1, ymm1 | |
| vfmadd231ps ymm4, ymm1, ymm0 | |
| vfmadd231ps ymm2, ymm0, ymm0 | |
| } | |
| } | |
| __asm | |
| { | |
| vmovaps xmm0, xmm4 | |
| vextractf128 xmm4, ymm4, 1 | |
| vaddps xmm4, xmm0, xmm4 | |
| vmovshdup xmm0, xmm4 | |
| vaddps xmm4, xmm4, xmm0 | |
| vmovhlps xmm0, xmm0, xmm4 | |
| vaddss xmm4, xmm4, xmm0 | |
| vmovaps xmm0, xmm3 | |
| vextractf128 xmm3, ymm3, 1 | |
| vaddps xmm0, xmm0, xmm3 | |
| vmovshdup xmm1, xmm0 | |
| vaddps xmm0, xmm0, xmm1 | |
| vmovhlps xmm1, xmm1, xmm0 | |
| vaddss xmm0, xmm0, xmm1 | |
| vmovaps xmm1, xmm2 | |
| vextractf128 xmm2, ymm2, 1 | |
| vaddps xmm1, xmm1, xmm2 | |
| vcvtss2sd xmm0, xmm0, xmm0 | |
| vmovshdup xmm2, xmm1 | |
| vaddps xmm1, xmm1, xmm2 | |
| vmovhlps xmm2, xmm2, xmm1 | |
| vaddss xmm1, xmm1, xmm2 | |
| vcvtss2sd xmm1, xmm1, xmm1 | |
| vmulsd xmm0, xmm0, xmm1; x | |
| vxorpd xmm1, xmm1, xmm1 | |
| vucomisd xmm1, xmm0 | |
| } | |
| __asm | |
| { | |
| vsqrtsd xmm0, xmm0, xmm0 | |
| vzeroupper | |
| vxorpd xmm1, xmm1, xmm1 | |
| vucomisd xmm0, xmm1 | |
| } | |
| if ( __SETP__(_RAX, 4194312) ) | |
| { | |
| __asm | |
| { | |
| vcvtss2sd xmm4, xmm4, xmm4 | |
| vdivsd xmm0, xmm4, xmm0 | |
| vcvtsd2ss xmm3, xmm0, xmm0 | |
| vmovd r15d, xmm3 | |
| } | |
| } | |
| else | |
| { | |
| _R15D = 0; | |
| } | |
| ScopedTimer::~ScopedTimer((ScopedTimer *)v949); | |
| std::__ostream_insert<char,std::char_traits<char>>( | |
| &std::cout, | |
| "SAXPY scalar: checksum=", | |
| 24, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v110); | |
| __asm { vmovd xmm3, r13d } | |
| __asm { vcvtss2sd xmm0, xmm3, xmm3 } | |
| v113 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0); | |
| std::__ostream_insert<char,std::char_traits<char>>( | |
| v113, | |
| " time=", | |
| 7, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v114); | |
| __asm { vmovsd xmm0, qword ptr [rbp-370h] } | |
| v116 = std::ostream::_M_insert<double>(v113, *(double *)&_XMM0); | |
| std::operator<<<std::char_traits<char>>(v116, " ms\n"); | |
| std::__ostream_insert<char,std::char_traits<char>>( | |
| &std::cout, | |
| "SAXPY AVX : checksum=", | |
| 24, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v117); | |
| __asm { vmovd xmm3, r14d } | |
| __asm { vcvtss2sd xmm0, xmm3, xmm3 } | |
| v120 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0); | |
| std::__ostream_insert<char,std::char_traits<char>>( | |
| v120, | |
| " time=", | |
| 7, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v121); | |
| __asm { vmovsd xmm0, qword ptr [rbp-368h] } | |
| v123 = std::ostream::_M_insert<double>(v120, *(double *)&_XMM0); | |
| std::operator<<<std::char_traits<char>>(v123, " ms\n"); | |
| std::__ostream_insert<char,std::char_traits<char>>( | |
| &std::cout, | |
| "Cosine scalar: value=", | |
| 21, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v124); | |
| __asm { vmovd xmm3, r12d } | |
| __asm { vcvtss2sd xmm0, xmm3, xmm3 } | |
| v127 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0); | |
| std::__ostream_insert<char,std::char_traits<char>>( | |
| v127, | |
| " time=", | |
| 7, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v128); | |
| __asm { vmovsd xmm0, qword ptr [rbp-360h] } | |
| v130 = std::ostream::_M_insert<double>(v127, *(double *)&_XMM0); | |
| std::operator<<<std::char_traits<char>>(v130, " ms\n"); | |
| std::__ostream_insert<char,std::char_traits<char>>( | |
| &std::cout, | |
| "Cosine AVX : value=", | |
| 21, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v131); | |
| __asm { vmovd xmm3, r15d } | |
| __asm { vcvtss2sd xmm0, xmm3, xmm3 } | |
| v134 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0); | |
| std::__ostream_insert<char,std::char_traits<char>>( | |
| v134, | |
| " time=", | |
| 7, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v135); | |
| __asm { vmovsd xmm0, qword ptr [rbp-358h] } | |
| v137 = std::ostream::_M_insert<double>(v134, *(double *)&_XMM0); | |
| std::operator<<<std::char_traits<char>>(v137, " ms\n"); | |
| std::operator<<<std::char_traits<char>>(&std::cout, "--------------------------------------------------------\n\n"); | |
| v928 = 0x43800000780LL; | |
| std::vector<float>::vector(&v929, 2073600, v138); | |
| v931 = 0x43800000780LL; | |
| std::vector<float>::vector(v932, 2073600, v139); | |
| v933 = 0x43800000780LL; | |
| std::vector<float>::vector(v934, 2073600, v140); | |
| _R13 = v929; | |
| v142 = v930; | |
| if ( v929 != v930 ) | |
| { | |
| __asm { vmovss xmm1, cs:dword_6004 } | |
| _RDX = v929; | |
| v144 = -1515870811; | |
| do | |
| { | |
| __asm { vxorps xmm3, xmm3, xmm3 } | |
| _RDX += 4; | |
| v144 = 1664525 * v144 + 1013904223; | |
| __asm | |
| { | |
| vcvtsi2ss xmm0, xmm3, eax | |
| vmulss xmm0, xmm0, xmm1 | |
| vmovss dword ptr [rdx-4], xmm0 | |
| } | |
| } | |
| while ( _RDX != v142 ); | |
| } | |
| std::operator<<<std::char_traits<char>>(&std::cout, "=== Workload 2: 2D 5-point blur on 1080p image ===\n"); | |
| v906 = 0; | |
| v907 = 0; | |
| std::string::basic_string<std::allocator<char>>(v948, "blur_scalar"); | |
| ScopedTimer::ScopedTimer(v949, v948, &v906); | |
| std::string::_M_dispose(v948); | |
| v146 = HIDWORD(v928); | |
| v147 = 4LL * (int)v928; | |
| LODWORD(v899) = v928; | |
| v898 = (int)v928; | |
| v892 = (char *)v932[0]; | |
| memcpy(v932[0], _R13, v147); | |
| v148 = v146 - 1; | |
| v149 = v146 - 1; | |
| v888 = v148; | |
| v150 = &_R13[v148 * v147]; | |
| v887 = (int)v931; | |
| v151 = v931; | |
| v885 = 4LL * (int)v931; | |
| memcpy(&v892[v148 * v885], v150, v147); | |
| if ( v888 > 1 ) | |
| { | |
| if ( v151 == 1 && (_RDX = _R13 + 4, v153 = 1, _RAX = v892 + 4, (_DWORD)v899 == 1) ) | |
| { | |
| do | |
| { | |
| __asm { vmovss xmm0, dword ptr [rdx] } | |
| ++v153; | |
| _RDX += 4; | |
| _RAX += 4; | |
| __asm | |
| { | |
| vmovss dword ptr [rax-4], xmm0 | |
| vmovss xmm0, dword ptr [rdx-4] | |
| vmovss dword ptr [rax-4], xmm0 | |
| } | |
| } | |
| while ( v888 != v153 ); | |
| v871 = &_R13[v147]; | |
| _RCX = &v892[v885]; | |
| } | |
| else | |
| { | |
| _RDI = v898; | |
| v860 = 1; | |
| v871 = &_R13[v147]; | |
| _RDX = &_R13[v147]; | |
| _RCX = &v892[v885]; | |
| _RAX = &v892[v885]; | |
| do | |
| { | |
| __asm { vmovss xmm0, dword ptr [rdx] } | |
| ++v860; | |
| __asm | |
| { | |
| vmovss dword ptr [rax], xmm0 | |
| vmovss xmm0, dword ptr [rdx+rdi*4-4] | |
| } | |
| _RDX += v147; | |
| __asm { vmovss dword ptr [rax+rdi*4-4], xmm0 } | |
| _RAX += v885; | |
| } | |
| while ( v888 != v860 ); | |
| } | |
| LODWORD(v900) = 0; | |
| __asm { vmovss xmm1, dword ptr cs:qword_6480 } | |
| v865 = v149; | |
| v895 = 1; | |
| v884 = v899 - 3; | |
| v157 = (unsigned int)(v899 - 3) + 2LL; | |
| v877 = v899 - 2; | |
| v158 = (v899 - 2) & 0xFFFFFFF8; | |
| v880 = v899 - 1; | |
| v875 = 32LL * ((unsigned int)(v899 - 2) >> 3); | |
| _RAX = v871; | |
| __asm { vbroadcastss ymm2, xmm1 } | |
| __asm { vshufps xmm3, xmm1, xmm1, 0 } | |
| v867 = v158 + 1; | |
| v160 = &v871[v147]; | |
| v866 = v158; | |
| v872 = v899 - v158; | |
| v868 = v899 - v158 - 2; | |
| v901 = v887; | |
| v161 = _R13; | |
| v864 = v150; | |
| v162 = v147; | |
| v163 = v898; | |
| while ( 1 ) | |
| { | |
| ++v895; | |
| v889 = v163; | |
| v163 += v898; | |
| if ( (int)v899 > 2 ) | |
| { | |
| if ( v884 <= 2 | |
| || (_R8 = _RCX + 4, | |
| _R9 = _RAX + 4, | |
| (unsigned __int64)(_RCX - _RAX) <= 0x20 || (unsigned __int64)(_RCX + 4 - (v161 + 8)) <= 0x18) | |
| || (unsigned __int64)(_R8 - (v160 + 8)) <= 0x18 ) | |
| { | |
| _RDI = 1; | |
| do | |
| { | |
| __asm | |
| { | |
| vmovss xmm0, dword ptr [rax+rdi*4] | |
| vaddss xmm0, xmm0, dword ptr [rax+rdi*4-4] | |
| vaddss xmm0, xmm0, dword ptr [rax+rdi*4+4] | |
| vaddss xmm0, xmm0, dword ptr [rdx+rdi*4] | |
| vaddss xmm0, xmm0, dword ptr [rsi+rdi*4] | |
| vmulss xmm0, xmm0, xmm1 | |
| vaddss xmm6, xmm0, dword ptr [rbp-388h] | |
| vmovss dword ptr [rbp-388h], xmm6 | |
| vmovss dword ptr [rcx+rdi*4], xmm0 | |
| } | |
| ++_RDI; | |
| } | |
| while ( _RDI != v157 ); | |
| goto LABEL_53; | |
| } | |
| if ( v884 <= 6 ) | |
| { | |
| v187 = v877; | |
| v188 = 0; | |
| v189 = 1; | |
| } | |
| else | |
| { | |
| _RDI = 0; | |
| do | |
| { | |
| __asm | |
| { | |
| vmovups ymm5, ymmword ptr [r9+rdi] | |
| vaddps ymm0, ymm5, ymmword ptr [rax+rdi] | |
| vaddps ymm0, ymm0, ymmword ptr [rbx+rdi] | |
| vaddps ymm0, ymm0, ymmword ptr [r11+rdi] | |
| vaddps ymm0, ymm0, ymmword ptr [r10+rdi] | |
| vmovss xmm4, dword ptr [rbp-388h] | |
| vmulps ymm0, ymm0, ymm2 | |
| vaddss xmm4, xmm4, xmm0 | |
| vshufps xmm6, xmm0, xmm0, 55h ; 'U' | |
| vshufps xmm5, xmm0, xmm0, 0FFh | |
| vmovups ymmword ptr [r8+rdi], ymm0 | |
| } | |
| _RDI += 32; | |
| __asm | |
| { | |
| vaddss xmm4, xmm4, xmm6 | |
| vunpckhps xmm6, xmm0, xmm0 | |
| vextractf128 xmm0, ymm0, 1 | |
| vaddss xmm4, xmm4, xmm6 | |
| vaddss xmm4, xmm4, xmm5 | |
| vshufps xmm5, xmm0, xmm0, 55h ; 'U' | |
| vaddss xmm4, xmm4, xmm0 | |
| vaddss xmm4, xmm4, xmm5 | |
| vunpckhps xmm5, xmm0, xmm0 | |
| vshufps xmm0, xmm0, xmm0, 0FFh | |
| vaddss xmm4, xmm4, xmm5 | |
| vaddss xmm5, xmm4, xmm0 | |
| vmovss dword ptr [rbp-388h], xmm5 | |
| } | |
| } | |
| while ( v875 != _RDI ); | |
| if ( (v877 & 7) == 0 ) | |
| goto LABEL_53; | |
| v187 = v868; | |
| if ( (unsigned int)(v872 - 3) <= 2 ) | |
| { | |
| v202 = v867; | |
| LABEL_50: | |
| _R9 = 4LL * v202; | |
| v882 = v202 + 1LL; | |
| __asm { vmovss xmm0, dword ptr [rax+r9-4] } | |
| __asm { vaddss xmm0, xmm0, dword ptr [rbx] } | |
| _R11 = &_RAX[_R9 + 4]; | |
| __asm | |
| { | |
| vaddss xmm0, xmm0, dword ptr [r11] | |
| vaddss xmm0, xmm0, dword ptr [rdx+r9] | |
| vaddss xmm0, xmm0, dword ptr [r13+rbx*4+0] | |
| } | |
| __asm { vmulss xmm0, xmm0, xmm1 } | |
| _R8 = v901 + v202; | |
| _RBX = v892; | |
| __asm | |
| { | |
| vaddss xmm4, xmm0, dword ptr [rbp-388h] | |
| vmovss dword ptr [rbp-388h], xmm4 | |
| vmovss dword ptr [rbx+r8*4], xmm0 | |
| } | |
| if ( v880 > v202 + 1 ) | |
| { | |
| __asm { vmovss xmm0, dword ptr [r11] } | |
| v218 = v202 + 2; | |
| __asm { vaddss xmm0, xmm0, dword ptr [r8] } | |
| __asm | |
| { | |
| vaddss xmm0, xmm0, dword ptr [rbx] | |
| vaddss xmm0, xmm0, dword ptr [rdx+r8] | |
| } | |
| __asm { vaddss xmm0, xmm0, dword ptr [r13+rbx*4+0] } | |
| _RBX = v882 + v901; | |
| _R8 = v892; | |
| __asm | |
| { | |
| vmulss xmm0, xmm0, xmm1 | |
| vaddss xmm5, xmm0, xmm4 | |
| vmovss dword ptr [r8+rbx*4], xmm0 | |
| vmovss dword ptr [rbp-388h], xmm5 | |
| } | |
| if ( v880 > v218 ) | |
| { | |
| _R8 = &_RAX[_R9 + 8]; | |
| __asm { vmovss xmm0, dword ptr [r8] } | |
| __asm | |
| { | |
| vaddss xmm0, xmm0, dword ptr [r11] | |
| vaddss xmm0, xmm0, dword ptr [rax+r9+0Ch] | |
| vaddss xmm0, xmm0, dword ptr [rdx+rbx] | |
| vaddss xmm0, xmm0, dword ptr [r13+r8*4+0] | |
| } | |
| _R8 = v892; | |
| _RDI = v901 + v218; | |
| __asm | |
| { | |
| vmulss xmm0, xmm0, xmm1 | |
| vaddss xmm4, xmm0, xmm5 | |
| vmovss dword ptr [r8+rdi*4], xmm0 | |
| vmovss dword ptr [rbp-388h], xmm4 | |
| } | |
| } | |
| } | |
| goto LABEL_53; | |
| } | |
| v188 = v866; | |
| v189 = v867; | |
| } | |
| _R11 = v188 + v889 + 1; | |
| __asm | |
| { | |
| vmovups xmm4, xmmword ptr [r13+r11*4-4] | |
| vaddps xmm0, xmm4, xmmword ptr [r13+r11*4+0] | |
| vaddps xmm0, xmm0, xmmword ptr [r13+r11*4+4] | |
| vaddps xmm0, xmm0, xmmword ptr [r13+r8*4+0] | |
| } | |
| __asm { vmovss xmm4, dword ptr [rbp-388h] } | |
| _RDI = v901 + v188 + 1; | |
| _RBX = v892; | |
| __asm | |
| { | |
| vaddps xmm0, xmm0, xmmword ptr [r13+r8*4+0] | |
| vmulps xmm0, xmm0, xmm3 | |
| vaddss xmm4, xmm4, xmm0 | |
| vshufps xmm5, xmm0, xmm0, 55h ; 'U' | |
| vmovups xmmword ptr [rbx+rdi*4], xmm0 | |
| } | |
| v202 = v189 + (v187 & 0xFFFFFFFC); | |
| __asm | |
| { | |
| vaddss xmm4, xmm4, xmm5 | |
| vunpckhps xmm5, xmm0, xmm0 | |
| vshufps xmm0, xmm0, xmm0, 0FFh | |
| vaddss xmm4, xmm4, xmm5 | |
| vaddss xmm4, xmm4, xmm0 | |
| vmovss dword ptr [rbp-388h], xmm4 | |
| } | |
| if ( (v187 & 3) != 0 ) | |
| goto LABEL_50; | |
| } | |
| LABEL_53: | |
| _RAX += v162; | |
| v161 += v162; | |
| v160 += v162; | |
| v901 += v887; | |
| _RCX += v885; | |
| if ( v888 == v895 ) | |
| { | |
| v147 = v162; | |
| v149 = v865; | |
| v150 = v864; | |
| __asm { vzeroupper } | |
| goto LABEL_55; | |
| } | |
| } | |
| } | |
| LODWORD(v900) = 0; | |
| LABEL_55: | |
| ScopedTimer::~ScopedTimer((ScopedTimer *)v949); | |
| std::string::basic_string<std::allocator<char>>(v948, "blur_avx"); | |
| ScopedTimer::ScopedTimer(v949, v948, &v907); | |
| std::string::_M_dispose(v948); | |
| v890 = (char *)v934[0]; | |
| memcpy(v934[0], _R13, v147); | |
| v235 = 4LL * (int)v933; | |
| LODWORD(v901) = v933; | |
| v869 = (int)v933; | |
| memcpy(&v890[v235 * v149], v150, v147); | |
| if ( v888 <= 1 ) | |
| { | |
| LODWORD(v901) = 0; | |
| goto LABEL_77; | |
| } | |
| v236 = v901; | |
| if ( (_DWORD)v901 == 1 && (_RDX = _R13 + 4, _RAX = v890 + 4, (_DWORD)v899 == 1) ) | |
| { | |
| do | |
| { | |
| __asm { vmovss xmm0, dword ptr [rdx] } | |
| ++v236; | |
| _RDX += 4; | |
| _RAX += 4; | |
| __asm | |
| { | |
| vmovss dword ptr [rax-4], xmm0 | |
| vmovss xmm0, dword ptr [rdx-4] | |
| vmovss dword ptr [rax-4], xmm0 | |
| } | |
| } | |
| while ( v888 != v236 ); | |
| } | |
| else | |
| { | |
| v854 = 1; | |
| _RSI = v898; | |
| _RDX = &_R13[v147]; | |
| _RAX = &v890[v235]; | |
| do | |
| { | |
| __asm { vmovss xmm0, dword ptr [rdx] } | |
| ++v854; | |
| __asm | |
| { | |
| vmovss dword ptr [rax], xmm0 | |
| vmovss xmm0, dword ptr [rdx+rsi*4-4] | |
| } | |
| _RDX += v147; | |
| __asm { vmovss dword ptr [rax+rsi*4-4], xmm0 } | |
| _RAX += v235; | |
| } | |
| while ( v888 != v854 ); | |
| } | |
| v896 = 0; | |
| __asm { vmovss xmm3, dword ptr cs:qword_6480 } | |
| LODWORD(v901) = 0; | |
| v240 = v898; | |
| v886 = 1; | |
| v883 = v899 - 1; | |
| __asm | |
| { | |
| vbroadcastss ymm2, dword ptr cs:qword_6480 | |
| vshufps xmm4, xmm3, xmm3, 0 | |
| } | |
| __asm { vmovq xmm5, cs:qword_6480 } | |
| v243 = 8 * ((unsigned int)(v899 - 9) >> 3) + 17LL; | |
| v893 = v869; | |
| do | |
| { | |
| v244 = v240; | |
| _RAX = &_R13[4 * v240]; | |
| ++v886; | |
| v240 += v898; | |
| _RSI = &v890[4 * v893]; | |
| if ( (int)v899 <= 8 ) | |
| { | |
| v260 = 1; | |
| } | |
| else | |
| { | |
| for ( _RDI = 9; ; _RDI += 8 ) | |
| { | |
| __asm | |
| { | |
| vmovups ymm7, ymmword ptr [rax+rdi*4-20h] | |
| vaddps ymm0, ymm7, ymmword ptr [rax+rdi*4-24h] | |
| } | |
| __asm | |
| { | |
| vaddps ymm0, ymm0, ymmword ptr [rax+rdi*4-1Ch] | |
| vaddps ymm0, ymm0, ymmword ptr [rcx+rdi*4-20h] | |
| vaddps ymm0, ymm0, ymmword ptr [rdx+rdi*4-20h] | |
| vmulps ymm0, ymm0, ymm2 | |
| vmovaps xmm1, xmm0 | |
| vmovups ymmword ptr [rsi+rdi*4-20h], ymm0 | |
| vextractf128 xmm0, ymm0, 1 | |
| vaddps xmm0, xmm1, xmm0 | |
| vmovshdup xmm1, xmm0 | |
| vaddps xmm0, xmm0, xmm1 | |
| vmovhlps xmm1, xmm1, xmm0 | |
| vaddss xmm0, xmm0, xmm1 | |
| vaddss xmm6, xmm0, dword ptr [rbp-380h] | |
| vmovss dword ptr [rbp-380h], xmm6 | |
| } | |
| if ( _RDI + 8 == v243 ) | |
| break; | |
| } | |
| v260 = _RDI; | |
| } | |
| if ( v260 < v883 ) | |
| { | |
| _RDI = v260; | |
| v262 = v899 - v260; | |
| if ( (_DWORD)v899 - v260 != 2 ) | |
| { | |
| v878 = v260 + v244; | |
| v879 = v260 + v893; | |
| _R8 = &v890[4 * v879]; | |
| v876 = v260 + v896; | |
| v873 = v240 + v260; | |
| if ( (unsigned __int64)(_R8 - &_R13[4 * v873 + 4]) > 8 && (unsigned __int64)(_R8 - &_R13[4 * v876 + 4]) > 8 ) | |
| { | |
| _R14 = &_R13[4 * v878]; | |
| if ( (unsigned __int64)(_R8 - _R14) > 0x10 ) | |
| { | |
| v265 = v262 - 1; | |
| if ( (unsigned int)(v262 - 2) > 2 ) | |
| { | |
| __asm { vmovups xmm0, xmmword ptr [r14] } | |
| __asm | |
| { | |
| vaddps xmm0, xmm0, xmmword ptr [r13+r9-4] | |
| vaddps xmm0, xmm0, xmmword ptr [r13+r9+4] | |
| vaddps xmm0, xmm0, xmmword ptr [r13+r12-4] | |
| vaddps xmm0, xmm0, xmmword ptr [r13+r14-4] | |
| vmovss xmm1, dword ptr [rbp-380h] | |
| vmulps xmm0, xmm0, xmm4 | |
| vaddss xmm1, xmm1, xmm0 | |
| vshufps xmm6, xmm0, xmm0, 55h ; 'U' | |
| vmovups xmmword ptr [r8], xmm0 | |
| vaddss xmm1, xmm1, xmm6 | |
| vunpckhps xmm6, xmm0, xmm0 | |
| vshufps xmm0, xmm0, xmm0, 0FFh | |
| vaddss xmm1, xmm1, xmm6 | |
| vaddss xmm6, xmm1, xmm0 | |
| vmovss dword ptr [rbp-380h], xmm6 | |
| } | |
| if ( (v265 & 3) == 0 ) | |
| goto LABEL_75; | |
| v278 = v265 & 0xFFFFFFFC; | |
| v279 = v262 - v278; | |
| v260 += v278; | |
| v280 = v279 - 1; | |
| if ( v279 != 2 ) | |
| { | |
| LABEL_72: | |
| _R8 = 4 * (v278 + v878); | |
| __asm { vmovq xmm1, qword ptr [r13+r10*4+0] } | |
| __asm { vmovq xmm0, qword ptr [r13+r8-4] } | |
| __asm | |
| { | |
| vaddps xmm0, xmm0, xmm1 | |
| vmovq xmm1, qword ptr [r13+r8+4] | |
| } | |
| _R8 = v278 + v876; | |
| __asm | |
| { | |
| vaddps xmm0, xmm0, xmm1 | |
| vmovq xmm1, qword ptr [r13+r8*4+0] | |
| } | |
| _R8 = v278 + v873; | |
| _RDI = v278 + v879; | |
| _R10 = v890; | |
| __asm | |
| { | |
| vaddps xmm0, xmm0, xmm1 | |
| vmovq xmm1, qword ptr [r13+r8*4+0] | |
| vaddps xmm0, xmm0, xmm1 | |
| vmovss xmm1, dword ptr [rbp-380h] | |
| vmulps xmm0, xmm0, xmm5 | |
| vaddss xmm1, xmm1, xmm0 | |
| vmovlps qword ptr [r10+rdi*4], xmm0 | |
| vmovshdup xmm0, xmm0 | |
| vaddss xmm7, xmm0, xmm1 | |
| vmovss dword ptr [rbp-380h], xmm7 | |
| } | |
| if ( (v280 & 1) == 0 ) | |
| goto LABEL_75; | |
| v260 += v280 & 0xFFFFFFFE; | |
| } | |
| _R8 = 4LL * v260; | |
| __asm | |
| { | |
| vmovss xmm0, dword ptr [rax+r8-4] | |
| vaddss xmm0, xmm0, dword ptr [rax+rdi*4] | |
| vaddss xmm0, xmm0, dword ptr [rax+r8+4] | |
| vaddss xmm0, xmm0, dword ptr [rcx+rdi*4] | |
| vaddss xmm0, xmm0, dword ptr [rdx+rdi*4] | |
| vmulss xmm0, xmm0, xmm3 | |
| vaddss xmm6, xmm0, dword ptr [rbp-380h] | |
| vmovss dword ptr [rbp-380h], xmm6 | |
| vmovss dword ptr [rsi+rdi*4], xmm0 | |
| } | |
| goto LABEL_75; | |
| } | |
| v280 = v262 - 1; | |
| v278 = 0; | |
| goto LABEL_72; | |
| } | |
| } | |
| } | |
| _R8 = 4LL * v260; | |
| _R10 = &_RAX[_R8]; | |
| _R9 = (__int64)&_RAX[_R8 + 4]; | |
| __asm | |
| { | |
| vmovss xmm0, dword ptr [rax+r8-4] | |
| vaddss xmm0, xmm0, dword ptr [r10] | |
| vaddss xmm0, xmm0, dword ptr [r9] | |
| vaddss xmm0, xmm0, dword ptr [rcx+rdi*4] | |
| vaddss xmm0, xmm0, dword ptr [rdx+rdi*4] | |
| vmulss xmm0, xmm0, xmm3 | |
| vaddss xmm7, xmm0, dword ptr [rbp-380h] | |
| vmovss dword ptr [rbp-380h], xmm7 | |
| vmovss dword ptr [rsi+rdi*4], xmm0 | |
| } | |
| if ( v883 > v260 + 1 ) | |
| { | |
| __asm | |
| { | |
| vmovss xmm0, dword ptr [r10] | |
| vaddss xmm0, xmm0, dword ptr [r9] | |
| vaddss xmm0, xmm0, dword ptr [rdi] | |
| vaddss xmm0, xmm0, dword ptr [rcx+r8+4] | |
| vaddss xmm0, xmm0, dword ptr [rdx+r8+4] | |
| } | |
| __asm | |
| { | |
| vmulss xmm0, xmm0, xmm3 | |
| vaddss xmm6, xmm0, xmm7 | |
| vmovss dword ptr [rsi+r8+4], xmm0 | |
| vmovss dword ptr [rbp-380h], xmm6 | |
| } | |
| if ( v260 + 2 < v883 ) | |
| { | |
| _R10 = (__int64)&_RAX[_R8 + 12]; | |
| __asm | |
| { | |
| vmovss xmm0, dword ptr [r9] | |
| vaddss xmm0, xmm0, dword ptr [rdi] | |
| vaddss xmm0, xmm0, dword ptr [r10] | |
| vaddss xmm0, xmm0, dword ptr [rcx+r8+8] | |
| vaddss xmm0, xmm0, dword ptr [rdx+r8+8] | |
| } | |
| __asm | |
| { | |
| vmulss xmm0, xmm0, xmm3 | |
| vaddss xmm6, xmm0, xmm6 | |
| vmovss dword ptr [rsi+r8+8], xmm0 | |
| vmovss dword ptr [rbp-380h], xmm6 | |
| } | |
| if ( v883 > v260 + 3 ) | |
| { | |
| __asm { vmovss xmm0, dword ptr [rax+r8+8] } | |
| __asm | |
| { | |
| vaddss xmm0, xmm0, dword ptr [r10] | |
| vaddss xmm0, xmm0, dword ptr [rdi] | |
| vaddss xmm0, xmm0, dword ptr [rcx+r8+0Ch] | |
| vaddss xmm0, xmm0, dword ptr [rdx+r8+0Ch] | |
| vmulss xmm0, xmm0, xmm3 | |
| vaddss xmm7, xmm0, xmm6 | |
| vmovss dword ptr [rsi+r8+0Ch], xmm0 | |
| vmovss dword ptr [rbp-380h], xmm7 | |
| } | |
| if ( v883 > v260 + 4 ) | |
| { | |
| __asm | |
| { | |
| vmovss xmm0, dword ptr [r10] | |
| vaddss xmm0, xmm0, dword ptr [rdi] | |
| vaddss xmm0, xmm0, dword ptr [r9] | |
| vaddss xmm0, xmm0, dword ptr [rcx+r8+10h] | |
| vaddss xmm0, xmm0, dword ptr [rdx+r8+10h] | |
| } | |
| __asm | |
| { | |
| vmulss xmm0, xmm0, xmm3 | |
| vaddss xmm6, xmm0, xmm7 | |
| vmovss dword ptr [rsi+r8+10h], xmm0 | |
| vmovss dword ptr [rbp-380h], xmm6 | |
| } | |
| if ( v883 > v260 + 5 ) | |
| { | |
| __asm | |
| { | |
| vmovss xmm0, dword ptr [rax+r8+10h] | |
| vaddss xmm0, xmm0, dword ptr [r9] | |
| vaddss xmm0, xmm0, dword ptr [rdi] | |
| vaddss xmm0, xmm0, dword ptr [rcx+r8+14h] | |
| vaddss xmm0, xmm0, dword ptr [rdx+r8+14h] | |
| } | |
| __asm | |
| { | |
| vmulss xmm0, xmm0, xmm3 | |
| vaddss xmm7, xmm0, xmm6 | |
| vmovss dword ptr [rsi+r8+14h], xmm0 | |
| vmovss dword ptr [rbp-380h], xmm7 | |
| } | |
| if ( v883 > v260 + 6 ) | |
| { | |
| __asm | |
| { | |
| vmovss xmm0, dword ptr [rax+r8+14h] | |
| vaddss xmm0, xmm0, dword ptr [rdi] | |
| vaddss xmm0, xmm0, dword ptr [rax+r8+1Ch] | |
| vaddss xmm0, xmm0, dword ptr [rcx+r8+18h] | |
| vaddss xmm0, xmm0, dword ptr [rdx+r8+18h] | |
| vmulss xmm0, xmm0, xmm3 | |
| vaddss xmm6, xmm0, xmm7 | |
| vmovss dword ptr [rsi+r8+18h], xmm0 | |
| vmovss dword ptr [rbp-380h], xmm6 | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| LABEL_75: | |
| v893 += v869; | |
| v896 += v898; | |
| } | |
| while ( v888 != v886 ); | |
| __asm { vzeroupper } | |
| LABEL_77: | |
| ScopedTimer::~ScopedTimer((ScopedTimer *)v949); | |
| *(double *)&_XMM0 = std::__ostream_insert<char,std::char_traits<char>>( | |
| &std::cout, | |
| "Blur scalar: checksum=", | |
| 22, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v305); | |
| __asm { vcvtss2sd xmm0, xmm0, dword ptr [rbp-388h] } | |
| v307 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0); | |
| std::__ostream_insert<char,std::char_traits<char>>( | |
| v307, | |
| " time=", | |
| 7, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v308); | |
| __asm { vmovsd xmm0, qword ptr [rbp-350h] } | |
| v310 = std::ostream::_M_insert<double>(v307, *(double *)&_XMM0); | |
| std::operator<<<std::char_traits<char>>(v310, " ms\n"); | |
| *(double *)&_XMM0 = std::__ostream_insert<char,std::char_traits<char>>( | |
| &std::cout, | |
| "Blur AVX : checksum=", | |
| 22, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v311); | |
| __asm { vcvtss2sd xmm0, xmm0, dword ptr [rbp-380h] } | |
| v313 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0); | |
| std::__ostream_insert<char,std::char_traits<char>>( | |
| v313, | |
| " time=", | |
| 7, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v314); | |
| __asm { vmovsd xmm0, qword ptr [rbp-348h] } | |
| v316 = std::ostream::_M_insert<double>(v313, *(double *)&_XMM0); | |
| std::operator<<<std::char_traits<char>>(v316, " ms\n"); | |
| std::__ostream_insert<char,std::char_traits<char>>( | |
| &std::cout, | |
| "Checksum delta (AVX - scalar): ", | |
| 31, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v317); | |
| __asm | |
| { | |
| vmovss xmm3, dword ptr [rbp-380h] | |
| vsubss xmm0, xmm3, dword ptr [rbp-388h] | |
| } | |
| __asm { vcvtss2sd xmm0, xmm0, xmm0 } | |
| v321 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0); | |
| std::operator<<<std::char_traits<char>>(v321, "\n"); | |
| std::operator<<<std::char_traits<char>>(&std::cout, "--------------------------------------------------------\n\n"); | |
| ComplexSoA::ComplexSoA((ComplexSoA *)&v935, 0x40000u); | |
| ComplexSoA::ComplexSoA((ComplexSoA *)v938, 0x40000u); | |
| ComplexSoA::ComplexSoA((ComplexSoA *)v940, 0); | |
| ComplexSoA::ComplexSoA((ComplexSoA *)v942, 0); | |
| ComplexSoA::ComplexSoA((ComplexSoA *)v944, 0); | |
| ComplexSoA::ComplexSoA((ComplexSoA *)v946, 0); | |
| fill_complex((ComplexSoA *)&v935, 0x1234ABCDu); | |
| fill_complex((ComplexSoA *)v938, 0x9876FEDC); | |
| std::vector<float>::vector(&v922, 16, v322); | |
| _R12 = v922; | |
| __asm { vmovss xmm2, cs:dword_6008 } | |
| _RBX = 0; | |
| __asm { vxorps xmm1, xmm1, xmm1 } | |
| while ( 1 ) | |
| { | |
| __asm | |
| { | |
| vsubss xmm0, xmm1, cs:dword_6018 | |
| vmulss xmm0, xmm0, cs:dword_601C | |
| vmovss xmm3, cs:dword_6020 | |
| vxorps xmm1, xmm0, cs:xmmword_6490 | |
| vfnmadd132ss xmm2, xmm3, xmm3 | |
| vmulss xmm0, xmm1, xmm0; x | |
| vmovss dword ptr [rbp-380h], xmm2 | |
| } | |
| *(float *)&_XMM0 = expf(*(float *)&_XMM0); | |
| __asm | |
| { | |
| vmulss xmm0, xmm0, dword ptr [rbp-380h] | |
| vmovss dword ptr [r12+rbx*4], xmm0 | |
| } | |
| if ( ++_RBX == 16 ) | |
| break; | |
| __asm | |
| { | |
| vxorps xmm3, xmm3, xmm3 | |
| vcvtsi2ss xmm1, xmm3, rbx | |
| vaddss xmm0, xmm1, cs:dword_6020 | |
| vmulss xmm0, xmm0, cs:dword_6024 | |
| vmulss xmm0, xmm0, cs:dword_6028; x | |
| vmovss dword ptr [rbp-380h], xmm1 | |
| } | |
| *(float *)&_XMM0 = cosf(*(float *)&_XMM0); | |
| __asm | |
| { | |
| vmovss xmm1, dword ptr [rbp-380h] | |
| vmovaps xmm2, xmm0 | |
| } | |
| } | |
| v891 = _R12; | |
| std::operator<<<std::char_traits<char>>(&std::cout, "=== Workload 3: Complex multiply + FIR convolution ===\n"); | |
| v908 = 0; | |
| v909 = 0; | |
| std::string::basic_string<std::allocator<char>>(v948, "complex_mul_scalar"); | |
| ScopedTimer::ScopedTimer(v949, v948, &v908); | |
| std::string::_M_dispose(v948); | |
| _RBX = v935; | |
| v874 = v936; | |
| v340 = v936 - v935; | |
| v900 = v936 - v935; | |
| v341 = (v936 - v935) >> 2; | |
| std::vector<float>::resize(v940, v341); | |
| std::vector<float>::resize(v941, v340 >> 2); | |
| if ( v340 ) | |
| { | |
| v900 = v340; | |
| _RCX = v937; | |
| _RSI = v938[0]; | |
| _RDI = v939; | |
| _RAX = v940[0]; | |
| _RDX = v941[0]; | |
| if ( (unsigned __int64)v340 > 0xC | |
| && (unsigned __int64)(v941[0] - (v939 + 4)) > 0x18 | |
| && (unsigned __int64)(v940[0] - (v939 + 4)) > 0x18 | |
| && (unsigned __int64)(v940[0] - (v938[0] + 4LL)) > 0x18 | |
| && (unsigned __int64)(v941[0] - (_RBX + 4)) > 0x18 | |
| && (unsigned __int64)(v940[0] - (_RBX + 4)) > 0x18 | |
| && (unsigned __int64)(v940[0] - (v937 + 4)) > 0x18 | |
| && (unsigned __int64)(v941[0] - (v937 + 4)) > 0x18 | |
| && (unsigned __int64)(v941[0] - (v938[0] + 4LL)) > 0x18 | |
| && (unsigned __int64)(v941[0] - (v940[0] + 4LL)) > 0x18 ) | |
| { | |
| if ( v900 <= 0x1C ) | |
| { | |
| LODWORD(v898) = 0; | |
| _R8 = 0; | |
| goto LABEL_89; | |
| } | |
| _R8 = 0; | |
| LODWORD(v898) = 0; | |
| __asm { vbroadcastss ymm3, cs:dword_602C } | |
| __asm { vbroadcastss ymm2, cs:dword_6020 } | |
| do | |
| { | |
| __asm | |
| { | |
| vmovups ymm4, ymmword ptr [rcx+r8] | |
| vmulps ymm1, ymm4, ymmword ptr [rdi+r8] | |
| vmulps ymm0, ymm4, ymmword ptr [rsi+r8] | |
| vmovups ymm5, ymmword ptr [rbx+r8] | |
| vfmsub231ps ymm1, ymm5, ymmword ptr [rsi+r8] | |
| vfmadd231ps ymm0, ymm5, ymmword ptr [rdi+r8] | |
| vmovups ymmword ptr [rax+r8], ymm1 | |
| vmovups ymmword ptr [rdx+r8], ymm0 | |
| vmulps ymm0, ymm0, ymm3 | |
| } | |
| _R8 += 32; | |
| __asm | |
| { | |
| vfmadd132ps ymm1, ymm0, ymm2 | |
| vmovss xmm0, dword ptr [rbp-398h] | |
| vaddss xmm0, xmm0, xmm1 | |
| vshufps xmm5, xmm1, xmm1, 55h ; 'U' | |
| vshufps xmm4, xmm1, xmm1, 0FFh | |
| vaddss xmm0, xmm0, xmm5 | |
| vunpckhps xmm5, xmm1, xmm1 | |
| vextractf128 xmm1, ymm1, 1 | |
| vaddss xmm0, xmm0, xmm5 | |
| vaddss xmm0, xmm0, xmm4 | |
| vshufps xmm4, xmm1, xmm1, 55h ; 'U' | |
| vaddss xmm0, xmm0, xmm1 | |
| vaddss xmm0, xmm0, xmm4 | |
| vunpckhps xmm4, xmm1, xmm1 | |
| vshufps xmm1, xmm1, xmm1, 0FFh | |
| vaddss xmm0, xmm0, xmm4 | |
| vaddss xmm4, xmm0, xmm1 | |
| vmovss dword ptr [rbp-398h], xmm4 | |
| } | |
| } | |
| while ( _R8 != 32 * (v341 >> 3) ); | |
| _R8 = v341 & 0xFFFFFFFFFFFFFFF8LL; | |
| if ( (v341 & 7) != 0 ) | |
| { | |
| __asm { vzeroupper } | |
| LABEL_89: | |
| v370 = v341 - _R8; | |
| if ( v341 - _R8 - 1 <= 2 ) | |
| goto LABEL_229; | |
| __asm | |
| { | |
| vmovups xmm3, xmmword ptr [rcx+r8*4] | |
| vmulps xmm0, xmm3, xmmword ptr [rdi+r8*4] | |
| } | |
| _R10 = 4 * _R8; | |
| __asm | |
| { | |
| vbroadcastss xmm2, cs:dword_602C | |
| vmovups xmm3, xmmword ptr [rbx+r8*4] | |
| vfmsub231ps xmm0, xmm3, xmmword ptr [rsi+r8*4] | |
| vmovups xmm3, xmmword ptr [rcx+r8*4] | |
| vmulps xmm1, xmm3, xmmword ptr [rsi+r8*4] | |
| vmovups xmm3, xmmword ptr [rbx+r8*4] | |
| vfmadd231ps xmm1, xmm3, xmmword ptr [rdi+r8*4] | |
| vmovups xmmword ptr [rax+r10], xmm0 | |
| vmovups xmmword ptr [rdx+r10], xmm1 | |
| vmulps xmm1, xmm1, xmm2 | |
| } | |
| __asm { vbroadcastss xmm2, cs:dword_6020 } | |
| _R8 += v370 & 0xFFFFFFFFFFFFFFFCLL; | |
| __asm | |
| { | |
| vfmadd132ps xmm0, xmm1, xmm2 | |
| vmovss xmm1, dword ptr [rbp-398h] | |
| vaddss xmm1, xmm1, xmm0 | |
| vshufps xmm2, xmm0, xmm0, 55h ; 'U' | |
| vaddss xmm1, xmm1, xmm2 | |
| vunpckhps xmm2, xmm0, xmm0 | |
| vshufps xmm0, xmm0, xmm0, 0FFh | |
| vaddss xmm1, xmm1, xmm2 | |
| vaddss xmm3, xmm1, xmm0 | |
| vmovss dword ptr [rbp-398h], xmm3 | |
| } | |
| if ( (v370 & 3) != 0 ) | |
| { | |
| LABEL_229: | |
| __asm | |
| { | |
| vmovss xmm4, dword ptr [rcx+r8*4] | |
| vmovss xmm0, dword ptr [rdi+r8*4] | |
| } | |
| _R9 = 4 * _R8; | |
| __asm | |
| { | |
| vmovss xmm3, dword ptr [rbx+r8*4] | |
| vmovss xmm2, dword ptr [rsi+r8*4] | |
| vmulss xmm1, xmm0, xmm4 | |
| vfmsub231ss xmm1, xmm2, xmm3 | |
| vmulss xmm2, xmm2, xmm4 | |
| vmovss dword ptr [rax+r8*4], xmm1 | |
| vfmadd132ss xmm0, xmm2, xmm3 | |
| vmovss xmm3, cs:dword_602C | |
| vmovss dword ptr [rdx+r8*4], xmm0 | |
| vmulss xmm0, xmm0, xmm3 | |
| vfmadd132ss xmm1, xmm0, cs:dword_6020 | |
| vaddss xmm6, xmm1, dword ptr [rbp-398h] | |
| vmovss dword ptr [rbp-398h], xmm6 | |
| } | |
| if ( _R8 + 1 < v341 ) | |
| { | |
| __asm | |
| { | |
| vmovss xmm2, dword ptr [rcx+r9+4] | |
| vmovss xmm5, dword ptr [rsi+r9+4] | |
| } | |
| __asm | |
| { | |
| vmovss xmm4, dword ptr [rdi+r9+4] | |
| vmovss xmm0, dword ptr [rbx+r9+4] | |
| vmulss xmm1, xmm2, xmm4 | |
| vmulss xmm2, xmm2, xmm5 | |
| vfmsub231ss xmm1, xmm0, xmm5 | |
| vfmadd132ss xmm0, xmm2, xmm4 | |
| vmovss dword ptr [rax+r9+4], xmm1 | |
| vmovss dword ptr [rdx+r9+4], xmm0 | |
| vmulss xmm0, xmm0, xmm3 | |
| vfmadd132ss xmm1, xmm0, cs:dword_6020 | |
| vaddss xmm6, xmm1, xmm6 | |
| vmovss dword ptr [rbp-398h], xmm6 | |
| } | |
| if ( _R8 + 2 < v341 ) | |
| { | |
| __asm | |
| { | |
| vmovss xmm2, dword ptr [rcx+r9+8] | |
| vmovss xmm5, dword ptr [rsi+r9+8] | |
| vmovss xmm4, dword ptr [rdi+r9+8] | |
| vmovss xmm0, dword ptr [rbx+r9+8] | |
| vmulss xmm1, xmm2, xmm4 | |
| vmulss xmm2, xmm2, xmm5 | |
| vfmsub231ss xmm1, xmm0, xmm5 | |
| vfmadd132ss xmm0, xmm2, xmm4 | |
| vmovss dword ptr [rax+r9+8], xmm1 | |
| vmovss dword ptr [rdx+r9+8], xmm0 | |
| vmulss xmm0, xmm0, xmm3 | |
| vfmadd132ss xmm1, xmm0, cs:dword_6020 | |
| vaddss xmm3, xmm1, xmm6 | |
| vmovss dword ptr [rbp-398h], xmm3 | |
| } | |
| } | |
| } | |
| } | |
| } | |
| else | |
| { | |
| __asm { vzeroupper } | |
| } | |
| } | |
| else | |
| { | |
| LODWORD(v898) = 0; | |
| __asm { vmovss xmm5, cs:dword_602C } | |
| _R8 = 0; | |
| do | |
| { | |
| __asm | |
| { | |
| vmovss xmm3, dword ptr [rdi+r8*4] | |
| vmovss xmm2, dword ptr [rcx+r8*4] | |
| vmovss xmm4, dword ptr [rsi+r8*4] | |
| vmovss xmm0, dword ptr [rbx+r8*4] | |
| vmulss xmm1, xmm2, xmm3 | |
| vmulss xmm2, xmm2, xmm4 | |
| vfmsub231ss xmm1, xmm0, xmm4 | |
| vfmadd132ss xmm0, xmm2, xmm3 | |
| vmovss dword ptr [rax+r8*4], xmm1 | |
| vmovss dword ptr [rdx+r8*4], xmm0 | |
| vmulss xmm0, xmm0, xmm5 | |
| } | |
| ++_R8; | |
| __asm | |
| { | |
| vfmadd132ss xmm1, xmm0, cs:dword_6020 | |
| vaddss xmm3, xmm1, dword ptr [rbp-398h] | |
| vmovss dword ptr [rbp-398h], xmm3 | |
| } | |
| } | |
| while ( _R8 < v341 ); | |
| } | |
| } | |
| else | |
| { | |
| LODWORD(v898) = 0; | |
| } | |
| ScopedTimer::~ScopedTimer((ScopedTimer *)v949); | |
| std::string::basic_string<std::allocator<char>>(v948, "complex_mul_avx"); | |
| ScopedTimer::ScopedTimer(v949, v948, &v909); | |
| std::string::_M_dispose(v948); | |
| std::vector<float>::resize(v942, v341); | |
| std::vector<float>::resize(v943, v341); | |
| if ( v900 <= 0x1C ) | |
| { | |
| LODWORD(v901) = 0; | |
| _RAX = 0; | |
| } | |
| else | |
| { | |
| LODWORD(v901) = 0; | |
| _R9 = v937; | |
| _RAX = 8; | |
| _RSI = v942[0]; | |
| _RCX = v943[0]; | |
| __asm { vbroadcastss ymm3, cs:dword_602C } | |
| __asm { vbroadcastss ymm2, cs:dword_6020 } | |
| while ( 1 ) | |
| { | |
| __asm | |
| { | |
| vmovups ymm5, ymmword ptr [r9+rax*4-20h] | |
| vmulps ymm0, ymm5, ymmword ptr [rdi+rax*4-20h] | |
| } | |
| __asm | |
| { | |
| vmovups ymm4, ymmword ptr [rbx+rax*4-20h] | |
| vfmsub231ps ymm0, ymm4, ymmword ptr [r8+rax*4-20h] | |
| vmulps ymm1, ymm5, ymmword ptr [r8+rax*4-20h] | |
| vfmadd231ps ymm1, ymm4, ymmword ptr [rdi+rax*4-20h] | |
| vmovups ymmword ptr [rsi+rax*4-20h], ymm0 | |
| vmulps ymm0, ymm0, ymm2 | |
| vmovups ymmword ptr [rcx+rax*4-20h], ymm1 | |
| vfmadd132ps ymm1, ymm0, ymm3 | |
| vmovaps xmm0, xmm1 | |
| vextractf128 xmm1, ymm1, 1 | |
| vaddps xmm0, xmm0, xmm1 | |
| vmovshdup xmm1, xmm0 | |
| vaddps xmm0, xmm0, xmm1 | |
| vmovhlps xmm1, xmm1, xmm0 | |
| vaddss xmm0, xmm0, xmm1 | |
| vaddss xmm5, xmm0, dword ptr [rbp-380h] | |
| vmovss dword ptr [rbp-380h], xmm5 | |
| } | |
| if ( v341 < _RAX + 8 ) | |
| break; | |
| _RAX += 8LL; | |
| } | |
| } | |
| if ( _RAX < v341 ) | |
| { | |
| _R15 = v937; | |
| _RDX = 4 * _RAX; | |
| _R8 = v938[0]; | |
| _RSI = v939; | |
| v899 = v943[0]; | |
| _RDI = v942[0]; | |
| v881 = v341 - _RAX; | |
| v870 = v341 - _RAX - 1; | |
| if ( v870 <= 2 ) | |
| goto LABEL_187; | |
| _RDX = 4 * _RAX; | |
| v443 = 4 * _RAX + 4; | |
| v894 = v942[0] + 4 * _RAX; | |
| v897 = v899 + 4 * _RAX; | |
| v444 = v894 - (_RBX + v443); | |
| v445 = v444 < 0x18; | |
| v446 = v444 == 24; | |
| _R12 = v894; | |
| v448 = !v445 | |
| && !v446 | |
| && (unsigned __int64)(v897 - (_RBX + v443)) > 0x18 | |
| && (unsigned __int64)(v894 - (v937 + v443)) > 0x18; | |
| v449 = v897 - (v937 + v443); | |
| v445 = v449 < 0x18; | |
| v446 = v449 == 24; | |
| _R14 = v897; | |
| if ( (unsigned __int64)(v897 - (v939 + v443)) <= 0x18 | |
| || (unsigned __int64)(v897 - (v938[0] + v443)) <= 0x18 | |
| || v445 | |
| || v446 | |
| || !v448 | |
| || (unsigned __int64)(v894 - (v938[0] + v443)) <= 0x18 | |
| || (unsigned __int64)(v894 - (v939 + v443)) <= 0x18 ) | |
| { | |
| goto LABEL_187; | |
| } | |
| if ( (unsigned __int64)(v897 - (v942[0] + v443)) > 0x18 ) | |
| { | |
| if ( v870 <= 6 ) | |
| { | |
| _RCX = _RAX; | |
| LABEL_107: | |
| __asm | |
| { | |
| vmovups xmm4, xmmword ptr [r15+rcx*4] | |
| vmovups xmm1, xmmword ptr [rsi+rcx*4] | |
| } | |
| _RDX = 4 * _RCX; | |
| __asm | |
| { | |
| vmovups xmm3, xmmword ptr [rbx+rcx*4] | |
| vmovups xmm2, xmmword ptr [r8+rcx*4] | |
| vmulps xmm0, xmm1, xmm4 | |
| } | |
| _RCX = v899; | |
| __asm | |
| { | |
| vfmsub231ps xmm0, xmm2, xmm3 | |
| vmulps xmm2, xmm2, xmm4 | |
| vmovups xmmword ptr [rdi+rdx], xmm0 | |
| vfmadd132ps xmm1, xmm2, xmm3 | |
| vbroadcastss xmm2, cs:dword_602C | |
| vmovups xmmword ptr [rcx+rdx], xmm1 | |
| vmulps xmm1, xmm1, xmm2 | |
| } | |
| __asm { vbroadcastss xmm2, cs:dword_6020 } | |
| _RAX += v881 & 0xFFFFFFFFFFFFFFFCLL; | |
| __asm | |
| { | |
| vfmadd132ps xmm0, xmm1, xmm2 | |
| vmovss xmm1, dword ptr [rbp-380h] | |
| vaddss xmm1, xmm1, xmm0 | |
| vshufps xmm2, xmm0, xmm0, 55h ; 'U' | |
| vaddss xmm1, xmm1, xmm2 | |
| vunpckhps xmm2, xmm0, xmm0 | |
| vshufps xmm0, xmm0, xmm0, 0FFh | |
| vaddss xmm1, xmm1, xmm2 | |
| vaddss xmm3, xmm1, xmm0 | |
| vmovss dword ptr [rbp-380h], xmm3 | |
| } | |
| if ( (v881 & 3) != 0 ) | |
| { | |
| LABEL_108: | |
| __asm | |
| { | |
| vmovss xmm4, dword ptr [r15+rax*4] | |
| vmovss xmm0, dword ptr [rsi+rax*4] | |
| } | |
| _RDX = 4 * _RAX; | |
| __asm | |
| { | |
| vmovss xmm3, dword ptr [rbx+rax*4] | |
| vmovss xmm2, dword ptr [r8+rax*4] | |
| vmulss xmm1, xmm0, xmm4 | |
| } | |
| _R11 = v899; | |
| __asm | |
| { | |
| vfmsub231ss xmm1, xmm2, xmm3 | |
| vmulss xmm2, xmm2, xmm4 | |
| vmovss dword ptr [rdi+rax*4], xmm1 | |
| vfmadd132ss xmm0, xmm2, xmm3 | |
| vmovss xmm3, cs:dword_602C | |
| vmovss dword ptr [r11+rax*4], xmm0 | |
| vmulss xmm0, xmm0, xmm3 | |
| vfmadd132ss xmm1, xmm0, cs:dword_6020 | |
| vaddss xmm6, xmm1, dword ptr [rbp-380h] | |
| vmovss dword ptr [rbp-380h], xmm6 | |
| } | |
| if ( _RAX + 1 < v341 ) | |
| { | |
| __asm | |
| { | |
| vmovss xmm4, dword ptr [rsi+rdx+4] | |
| vmovss xmm0, dword ptr [rbx+rdx+4] | |
| } | |
| __asm | |
| { | |
| vmovss xmm2, dword ptr [r15+rdx+4] | |
| vmovss xmm5, dword ptr [r8+rdx+4] | |
| vmulss xmm1, xmm2, xmm4 | |
| vmulss xmm2, xmm2, xmm5 | |
| vfmsub231ss xmm1, xmm0, xmm5 | |
| vfmadd132ss xmm0, xmm2, xmm4 | |
| vmovss dword ptr [rdi+rdx+4], xmm1 | |
| vmovss dword ptr [r11+rdx+4], xmm0 | |
| vmulss xmm0, xmm0, xmm3 | |
| vfmadd132ss xmm1, xmm0, cs:dword_6020 | |
| vaddss xmm7, xmm1, xmm6 | |
| vmovss dword ptr [rbp-380h], xmm7 | |
| } | |
| if ( _RAX + 2 < v341 ) | |
| { | |
| __asm | |
| { | |
| vmovss xmm5, dword ptr [r15+rdx+8] | |
| vmovss xmm0, dword ptr [rsi+rdx+8] | |
| vmovss xmm4, dword ptr [rbx+rdx+8] | |
| vmovss xmm2, dword ptr [r8+rdx+8] | |
| vmulss xmm1, xmm0, xmm5 | |
| vfmsub231ss xmm1, xmm2, xmm4 | |
| vmulss xmm2, xmm2, xmm5 | |
| vmovss dword ptr [rdi+rdx+8], xmm1 | |
| vfmadd132ss xmm0, xmm2, xmm4 | |
| vmovss dword ptr [r11+rdx+8], xmm0 | |
| vmulss xmm0, xmm0, xmm3 | |
| vfmadd132ss xmm1, xmm0, cs:dword_6020 | |
| vaddss xmm3, xmm1, xmm7 | |
| vmovss dword ptr [rbp-380h], xmm3 | |
| } | |
| } | |
| } | |
| } | |
| } | |
| else | |
| { | |
| __asm | |
| { | |
| vmovups ymm4, ymmword ptr [r15+rdx] | |
| vmovups ymm1, ymmword ptr [rsi+rdx] | |
| vmovups ymm3, ymmword ptr [rbx+rdx] | |
| vmovups ymm2, ymmword ptr [r8+rdx] | |
| vmulps ymm0, ymm1, ymm4 | |
| } | |
| v455 = v341 - _RAX; | |
| v456 = v881 & 0xFFFFFFFFFFFFFFF8LL; | |
| _RCX = _RAX + (v881 & 0xFFFFFFFFFFFFFFF8LL); | |
| __asm { vfmsub231ps ymm0, ymm2, ymm3 } | |
| _RAX = _RCX; | |
| __asm | |
| { | |
| vmulps ymm2, ymm2, ymm4 | |
| vmovups ymmword ptr [r12], ymm0 | |
| vfmadd132ps ymm1, ymm2, ymm3 | |
| vbroadcastss ymm2, cs:dword_602C | |
| vmovups ymmword ptr [r14], ymm1 | |
| vmulps ymm1, ymm1, ymm2 | |
| vbroadcastss ymm2, cs:dword_6020 | |
| vfmadd132ps ymm0, ymm1, ymm2 | |
| vmovss xmm2, dword ptr [rbp-380h] | |
| vaddss xmm2, xmm2, xmm0 | |
| vshufps xmm3, xmm0, xmm0, 55h ; 'U' | |
| vshufps xmm1, xmm0, xmm0, 0FFh | |
| vaddss xmm3, xmm3, xmm2 | |
| vunpckhps xmm2, xmm0, xmm0 | |
| vextractf128 xmm0, ymm0, 1 | |
| vaddss xmm2, xmm2, xmm3 | |
| vaddss xmm1, xmm1, xmm2 | |
| vshufps xmm2, xmm0, xmm0, 55h ; 'U' | |
| vaddss xmm1, xmm0, xmm1 | |
| vaddss xmm2, xmm2, xmm1 | |
| vunpckhps xmm1, xmm0, xmm0 | |
| vshufps xmm0, xmm0, xmm0, 0FFh | |
| vaddss xmm1, xmm1, xmm2 | |
| vaddss xmm3, xmm1, xmm0 | |
| vmovss dword ptr [rbp-380h], xmm3 | |
| } | |
| if ( (v881 & 7) != 0 ) | |
| { | |
| v881 -= v456; | |
| if ( v455 - v456 - 1 > 2 ) | |
| goto LABEL_107; | |
| goto LABEL_108; | |
| } | |
| } | |
| } | |
| else | |
| { | |
| LABEL_187: | |
| __asm | |
| { | |
| vmovss xmm2, dword ptr [r15+rax*4] | |
| vmovss xmm4, dword ptr [r8+rax*4] | |
| } | |
| __asm | |
| { | |
| vmovss xmm3, dword ptr [rsi+rax*4] | |
| vmovss xmm0, dword ptr [rbx+rax*4] | |
| } | |
| _R11 = v899; | |
| __asm | |
| { | |
| vmulss xmm1, xmm2, xmm3 | |
| vmulss xmm2, xmm2, xmm4 | |
| vfmsub231ss xmm1, xmm0, xmm4 | |
| vfmadd132ss xmm0, xmm2, xmm3 | |
| vmovss xmm2, cs:dword_602C | |
| vmovss dword ptr [rdi+rax*4], xmm1 | |
| vmovss dword ptr [r11+rax*4], xmm0 | |
| vmulss xmm0, xmm0, xmm2 | |
| vfmadd132ss xmm1, xmm0, cs:dword_6020 | |
| vaddss xmm6, xmm1, dword ptr [rbp-380h] | |
| vmovss dword ptr [rbp-380h], xmm6 | |
| } | |
| if ( _RAX + 1 < v341 ) | |
| { | |
| __asm | |
| { | |
| vmovss xmm4, dword ptr [rsi+rdx+4] | |
| vmovss xmm0, dword ptr [rbx+rdx+4] | |
| } | |
| __asm | |
| { | |
| vmovss xmm3, dword ptr [r15+rdx+4] | |
| vmovss xmm5, dword ptr [r8+rdx+4] | |
| vmulss xmm1, xmm3, xmm4 | |
| vmulss xmm3, xmm3, xmm5 | |
| vfmsub231ss xmm1, xmm0, xmm5 | |
| vfmadd132ss xmm0, xmm3, xmm4 | |
| vmovss dword ptr [rdi+rdx+4], xmm1 | |
| vmovss dword ptr [r11+rdx+4], xmm0 | |
| vmulss xmm0, xmm0, xmm2 | |
| vfmadd132ss xmm1, xmm0, cs:dword_6020 | |
| vaddss xmm6, xmm1, xmm6 | |
| vmovss dword ptr [rbp-380h], xmm6 | |
| } | |
| if ( _RAX + 2 < v341 ) | |
| { | |
| __asm | |
| { | |
| vmovss xmm0, dword ptr [rsi+rdx+8] | |
| vmovss xmm5, dword ptr [r15+rdx+8] | |
| } | |
| __asm | |
| { | |
| vmovss xmm4, dword ptr [rbx+rdx+8] | |
| vmovss xmm3, dword ptr [r8+rdx+8] | |
| vmulss xmm1, xmm0, xmm5 | |
| vfmsub231ss xmm1, xmm3, xmm4 | |
| vmulss xmm3, xmm3, xmm5 | |
| vmovss dword ptr [rdi+rdx+8], xmm1 | |
| vfmadd132ss xmm0, xmm3, xmm4 | |
| vmovss dword ptr [r11+rdx+8], xmm0 | |
| vmulss xmm0, xmm0, xmm2 | |
| vfmadd132ss xmm1, xmm0, cs:dword_6020 | |
| vaddss xmm6, xmm1, xmm6 | |
| vmovss dword ptr [rbp-380h], xmm6 | |
| } | |
| if ( _RAX + 3 < v341 ) | |
| { | |
| __asm | |
| { | |
| vmovss xmm4, dword ptr [rsi+rdx+0Ch] | |
| vmovss xmm0, dword ptr [rbx+rdx+0Ch] | |
| } | |
| __asm | |
| { | |
| vmovss xmm3, dword ptr [r15+rdx+0Ch] | |
| vmovss xmm5, dword ptr [r8+rdx+0Ch] | |
| vmulss xmm1, xmm3, xmm4 | |
| vmulss xmm3, xmm3, xmm5 | |
| vfmsub231ss xmm1, xmm0, xmm5 | |
| vfmadd132ss xmm0, xmm3, xmm4 | |
| vmovss dword ptr [rdi+rdx+0Ch], xmm1 | |
| vmovss dword ptr [r11+rdx+0Ch], xmm0 | |
| vmulss xmm0, xmm0, xmm2 | |
| vfmadd132ss xmm1, xmm0, cs:dword_6020 | |
| vaddss xmm7, xmm1, xmm6 | |
| vmovss dword ptr [rbp-380h], xmm7 | |
| } | |
| if ( _RAX + 4 < v341 ) | |
| { | |
| __asm | |
| { | |
| vmovss xmm4, dword ptr [rsi+rdx+10h] | |
| vmovss xmm0, dword ptr [rbx+rdx+10h] | |
| } | |
| __asm | |
| { | |
| vmovss xmm3, dword ptr [r15+rdx+10h] | |
| vmovss xmm5, dword ptr [r8+rdx+10h] | |
| vmulss xmm1, xmm3, xmm4 | |
| vmulss xmm3, xmm3, xmm5 | |
| vfmsub231ss xmm1, xmm0, xmm5 | |
| vfmadd132ss xmm0, xmm3, xmm4 | |
| vmovss dword ptr [rdi+rdx+10h], xmm1 | |
| vmovss dword ptr [r11+rdx+10h], xmm0 | |
| vmulss xmm0, xmm0, xmm2 | |
| vfmadd132ss xmm1, xmm0, cs:dword_6020 | |
| vaddss xmm7, xmm1, xmm7 | |
| vmovss dword ptr [rbp-380h], xmm7 | |
| } | |
| if ( _RAX + 5 < v341 ) | |
| { | |
| __asm | |
| { | |
| vmovss xmm4, dword ptr [rsi+rdx+14h] | |
| vmovss xmm0, dword ptr [rbx+rdx+14h] | |
| } | |
| __asm | |
| { | |
| vmovss xmm3, dword ptr [r15+rdx+14h] | |
| vmovss xmm5, dword ptr [r8+rdx+14h] | |
| vmulss xmm1, xmm3, xmm4 | |
| vmulss xmm3, xmm3, xmm5 | |
| vfmsub231ss xmm1, xmm0, xmm5 | |
| vfmadd132ss xmm0, xmm3, xmm4 | |
| vmovss dword ptr [rdi+rdx+14h], xmm1 | |
| vmovss dword ptr [r11+rdx+14h], xmm0 | |
| vmulss xmm0, xmm0, xmm2 | |
| vfmadd132ss xmm1, xmm0, cs:dword_6020 | |
| vaddss xmm6, xmm1, xmm7 | |
| vmovss dword ptr [rbp-380h], xmm6 | |
| } | |
| if ( _RAX + 6 < v341 ) | |
| { | |
| __asm | |
| { | |
| vmovss xmm4, dword ptr [rsi+rdx+18h] | |
| vmovss xmm0, dword ptr [rbx+rdx+18h] | |
| } | |
| __asm | |
| { | |
| vmovss xmm3, dword ptr [r15+rdx+18h] | |
| vmovss xmm5, dword ptr [r8+rdx+18h] | |
| vmulss xmm1, xmm3, xmm4 | |
| vmulss xmm3, xmm3, xmm5 | |
| vfmsub231ss xmm1, xmm0, xmm5 | |
| vfmadd132ss xmm0, xmm3, xmm4 | |
| vmovss dword ptr [rdi+rdx+18h], xmm1 | |
| vmovss dword ptr [r11+rdx+18h], xmm0 | |
| vmulss xmm0, xmm0, xmm2 | |
| vfmadd132ss xmm1, xmm0, cs:dword_6020 | |
| vaddss xmm7, xmm1, xmm6 | |
| vmovss dword ptr [rbp-380h], xmm7 | |
| } | |
| if ( _RAX + 7 < v341 ) | |
| { | |
| __asm | |
| { | |
| vmovss xmm5, dword ptr [r15+rdx+1Ch] | |
| vmovss xmm0, dword ptr [rsi+rdx+1Ch] | |
| vmovss xmm4, dword ptr [rbx+rdx+1Ch] | |
| vmovss xmm3, dword ptr [r8+rdx+1Ch] | |
| vmulss xmm1, xmm0, xmm5 | |
| vfmsub231ss xmm1, xmm3, xmm4 | |
| vmulss xmm3, xmm3, xmm5 | |
| vmovss dword ptr [rdi+rdx+1Ch], xmm1 | |
| vfmadd132ss xmm0, xmm3, xmm4 | |
| vmovss dword ptr [r11+rdx+1Ch], xmm0 | |
| vmulss xmm0, xmm0, xmm2 | |
| vfmadd132ss xmm1, xmm0, cs:dword_6020 | |
| vaddss xmm3, xmm1, xmm7 | |
| vmovss dword ptr [rbp-380h], xmm3 | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| __asm { vzeroupper } | |
| ScopedTimer::~ScopedTimer((ScopedTimer *)v949); | |
| v910 = 0; | |
| v911 = 0; | |
| std::string::basic_string<std::allocator<char>>(v948, "complex_fir_scalar"); | |
| ScopedTimer::ScopedTimer(v949, v948, &v910); | |
| std::string::_M_dispose(v948); | |
| _R12 = v891; | |
| v520 = v923 - v891; | |
| v521 = (v923 - v891) >> 2; | |
| std::vector<float>::resize(v944, v341); | |
| std::vector<float>::resize(v945, v341); | |
| if ( v900 ) | |
| { | |
| _R8 = v944[0]; | |
| LODWORD(v900) = 0; | |
| _RCX = 0; | |
| _RDI = v945[0]; | |
| __asm | |
| { | |
| vmovss xmm4, cs:dword_6030 | |
| vmovss xmm3, cs:dword_6034 | |
| } | |
| do | |
| { | |
| if ( v520 ) | |
| { | |
| __asm { vxorps xmm1, xmm1, xmm1 } | |
| v526 = 4 * _RCX; | |
| _RAX = 0; | |
| __asm { vmovaps xmm2, xmm1 } | |
| do | |
| { | |
| __asm { vmovss xmm0, dword ptr [r12+rax*4] } | |
| ++_RAX; | |
| __asm | |
| { | |
| vfmadd231ss xmm2, xmm0, dword ptr [rbx+rdx] | |
| vfmadd231ss xmm1, xmm0, dword ptr [rsi+rdx] | |
| } | |
| v526 -= 4; | |
| } | |
| while ( _RCX >= _RAX && _RAX < v521 ); | |
| __asm | |
| { | |
| vmulss xmm0, xmm1, xmm4 | |
| vfmadd231ss xmm0, xmm2, xmm3 | |
| } | |
| } | |
| else | |
| { | |
| __asm | |
| { | |
| vxorps xmm0, xmm0, xmm0 | |
| vmovaps xmm1, xmm0 | |
| vmovaps xmm2, xmm0 | |
| } | |
| } | |
| __asm | |
| { | |
| vaddss xmm5, xmm0, dword ptr [rbp-388h] | |
| vmovss dword ptr [r8+rcx*4], xmm2 | |
| vmovss dword ptr [rdi+rcx*4], xmm1 | |
| } | |
| ++_RCX; | |
| __asm { vmovss dword ptr [rbp-388h], xmm5 } | |
| } | |
| while ( _RCX < v341 ); | |
| } | |
| else | |
| { | |
| LODWORD(v900) = 0; | |
| } | |
| ScopedTimer::~ScopedTimer((ScopedTimer *)v949); | |
| std::string::basic_string<std::allocator<char>>(v948, "complex_fir_avx"); | |
| ScopedTimer::ScopedTimer(v949, v948, &v911); | |
| std::string::_M_dispose(v948); | |
| std::vector<float>::resize(v946, v341); | |
| std::vector<float>::resize(v947, v341); | |
| if ( v874 == _RBX ) | |
| { | |
| LODWORD(v899) = 0; | |
| } | |
| else | |
| { | |
| _RSI = v937; | |
| _R9 = v946[0]; | |
| LODWORD(v899) = 0; | |
| v533 = 1; | |
| _R8 = v947[0]; | |
| _R12 = v891; | |
| _RCX = 0; | |
| __asm | |
| { | |
| vmovss xmm4, cs:dword_6030 | |
| vmovss xmm3, cs:dword_6034 | |
| } | |
| do | |
| { | |
| if ( v520 <= 0x1C || v533 <= 7 ) | |
| { | |
| __asm { vxorps xmm2, xmm2, xmm2 } | |
| _RAX = 0; | |
| __asm { vmovaps ymm1, ymm2 } | |
| } | |
| else | |
| { | |
| __asm { vxorps xmm2, xmm2, xmm2 } | |
| _RDX = v533; | |
| _RAX = 8; | |
| __asm { vmovaps ymm1, ymm2 } | |
| while ( 1 ) | |
| { | |
| if ( v341 >= _RDX ) | |
| { | |
| __asm | |
| { | |
| vmovups ymm6, ymmword ptr [rbx+rdx*4-20h] | |
| vmovups ymm7, ymmword ptr [rsi+rdx*4-20h] | |
| vfmadd231ps ymm1, ymm6, ymmword ptr [r12+rax*4-20h] | |
| vfmadd231ps ymm2, ymm7, ymmword ptr [r12+rax*4-20h] | |
| } | |
| } | |
| if ( v521 < _RAX + 8 ) | |
| break; | |
| _RDX -= 8LL; | |
| if ( v533 < _RAX + 8 ) | |
| break; | |
| _RAX += 8LL; | |
| } | |
| } | |
| __asm | |
| { | |
| vmovaps xmm0, xmm1 | |
| vextractf128 xmm1, ymm1, 1 | |
| vaddps xmm0, xmm0, xmm1 | |
| vmovshdup xmm1, xmm0 | |
| vaddps xmm0, xmm0, xmm1 | |
| vmovhlps xmm1, xmm1, xmm0 | |
| vaddss xmm0, xmm0, xmm1 | |
| vmovaps xmm1, xmm2 | |
| vextractf128 xmm2, ymm2, 1 | |
| vaddps xmm1, xmm1, xmm2 | |
| vmovshdup xmm2, xmm1 | |
| vaddps xmm1, xmm1, xmm2 | |
| vmovhlps xmm2, xmm2, xmm1 | |
| vaddss xmm1, xmm1, xmm2 | |
| } | |
| if ( _RCX >= _RAX && _RAX < v521 ) | |
| { | |
| v557 = 4 * (_RCX - _RAX); | |
| do | |
| { | |
| __asm { vmovss xmm2, dword ptr [r12+rax*4] } | |
| ++_RAX; | |
| __asm | |
| { | |
| vfmadd231ss xmm0, xmm2, dword ptr [rbx+rdx] | |
| vfmadd231ss xmm1, xmm2, dword ptr [rsi+rdx] | |
| } | |
| v557 -= 4; | |
| } | |
| while ( _RCX >= _RAX && _RAX < v521 ); | |
| } | |
| __asm { vmovss dword ptr [r9+rcx*4], xmm0 } | |
| ++v533; | |
| __asm | |
| { | |
| vmovss dword ptr [r8+rcx*4], xmm1 | |
| vmulss xmm1, xmm1, xmm4 | |
| } | |
| ++_RCX; | |
| __asm | |
| { | |
| vfmadd132ss xmm0, xmm1, xmm3 | |
| vaddss xmm6, xmm0, dword ptr [rbp-390h] | |
| vmovss dword ptr [rbp-390h], xmm6 | |
| } | |
| } | |
| while ( _RCX < v341 ); | |
| __asm { vzeroupper } | |
| } | |
| ScopedTimer::~ScopedTimer((ScopedTimer *)v949); | |
| *(double *)&_XMM0 = std::__ostream_insert<char,std::char_traits<char>>( | |
| &std::cout, | |
| "Complex mul scalar: checksum=", | |
| 29, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v559); | |
| __asm { vcvtss2sd xmm0, xmm0, dword ptr [rbp-398h] } | |
| v561 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0); | |
| std::__ostream_insert<char,std::char_traits<char>>( | |
| v561, | |
| " time=", | |
| 7, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v562); | |
| __asm { vmovsd xmm0, qword ptr [rbp-340h] } | |
| v564 = std::ostream::_M_insert<double>(v561, *(double *)&_XMM0); | |
| std::operator<<<std::char_traits<char>>(v564, " ms\n"); | |
| *(double *)&_XMM0 = std::__ostream_insert<char,std::char_traits<char>>( | |
| &std::cout, | |
| "Complex mul AVX : checksum=", | |
| 29, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v565); | |
| __asm { vcvtss2sd xmm0, xmm0, dword ptr [rbp-380h] } | |
| v567 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0); | |
| std::__ostream_insert<char,std::char_traits<char>>( | |
| v567, | |
| " time=", | |
| 7, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v568); | |
| __asm { vmovsd xmm0, qword ptr [rbp-338h] } | |
| v570 = std::ostream::_M_insert<double>(v567, *(double *)&_XMM0); | |
| std::operator<<<std::char_traits<char>>(v570, " ms\n"); | |
| *(double *)&_XMM0 = std::__ostream_insert<char,std::char_traits<char>>( | |
| &std::cout, | |
| "FIR scalar : checksum=", | |
| 29, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v571); | |
| __asm { vcvtss2sd xmm0, xmm0, dword ptr [rbp-388h] } | |
| v573 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0); | |
| std::__ostream_insert<char,std::char_traits<char>>( | |
| v573, | |
| " time=", | |
| 7, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v574); | |
| __asm { vmovsd xmm0, qword ptr [rbp-330h] } | |
| v576 = std::ostream::_M_insert<double>(v573, *(double *)&_XMM0); | |
| std::operator<<<std::char_traits<char>>(v576, " ms\n"); | |
| *(double *)&_XMM0 = std::__ostream_insert<char,std::char_traits<char>>( | |
| &std::cout, | |
| "FIR AVX : checksum=", | |
| 29, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v577); | |
| __asm { vcvtss2sd xmm0, xmm0, dword ptr [rbp-390h] } | |
| v579 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0); | |
| std::__ostream_insert<char,std::char_traits<char>>( | |
| v579, | |
| " time=", | |
| 7, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v580); | |
| __asm { vmovsd xmm0, qword ptr [rbp-328h] } | |
| v582 = std::ostream::_M_insert<double>(v579, *(double *)&_XMM0); | |
| std::operator<<<std::char_traits<char>>(v582, " ms\n"); | |
| std::__ostream_insert<char,std::char_traits<char>>( | |
| &std::cout, | |
| "Delta cmul checksum (AVX - scalar): ", | |
| 36, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v583); | |
| __asm | |
| { | |
| vmovss xmm3, dword ptr [rbp-380h] | |
| vsubss xmm0, xmm3, dword ptr [rbp-398h] | |
| } | |
| __asm { vcvtss2sd xmm0, xmm0, xmm0 } | |
| v587 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0); | |
| std::operator<<<std::char_traits<char>>(v587, "\n"); | |
| std::__ostream_insert<char,std::char_traits<char>>( | |
| &std::cout, | |
| "Delta FIR checksum (AVX - scalar): ", | |
| 36, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v588); | |
| __asm | |
| { | |
| vmovss xmm3, dword ptr [rbp-390h] | |
| vsubss xmm0, xmm3, dword ptr [rbp-388h] | |
| } | |
| __asm { vcvtss2sd xmm0, xmm0, xmm0 } | |
| v592 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0); | |
| std::operator<<<std::char_traits<char>>(v592, "\n"); | |
| std::operator<<<std::char_traits<char>>(&std::cout, "--------------------------------------------------------\n\n"); | |
| std::operator<<<std::char_traits<char>>(&std::cout, "=== Workload 4: Soft clip / limiter on FIR output ===\n"); | |
| v912 = 0; | |
| v913 = 0; | |
| std::vector<float>::vector(&v924, v944); | |
| std::vector<float>::vector(&v926, &v924); | |
| std::string::basic_string<std::allocator<char>>(v948, "soft_clip_scalar"); | |
| ScopedTimer::ScopedTimer(v949, v948, &v912); | |
| std::string::_M_dispose(v948); | |
| v594 = v924; | |
| v595 = v925 - v924; | |
| v596 = (v925 - v924) >> 2; | |
| if ( v925 == v924 ) | |
| { | |
| LODWORD(v900) = 0; | |
| } | |
| else | |
| { | |
| v597 = 1; | |
| if ( v595 ) | |
| v597 = (v925 - v924) >> 2; | |
| v598 = v597; | |
| if ( v595 <= 0x1C ) | |
| { | |
| LODWORD(v900) = 0; | |
| v623 = 0; | |
| goto LABEL_144; | |
| } | |
| _RAX = v924; | |
| __asm { vpcmpeqd ymm4, ymm4, ymm4 } | |
| LODWORD(v900) = 0; | |
| __asm { vbroadcastss ymm3, cs:dword_6010 } | |
| __asm { vbroadcastss ymm2, cs:dword_600C } | |
| v603 = v924 + 32 * (v598 >> 3); | |
| do | |
| { | |
| __asm { vmovups ymm0, ymmword ptr [rax] } | |
| _RAX += 32; | |
| __asm | |
| { | |
| vcmpltps ymm5, ymm2, ymm0 | |
| vcmpltps ymm1, ymm0, ymm3 | |
| vpor ymm1, ymm5, ymm1 | |
| vpxor ymm1, ymm1, ymm4 | |
| vblendvps ymm0, ymm3, ymm0, ymm1 | |
| vblendvps ymm0, ymm0, ymm2, ymm5 | |
| vmovss xmm5, dword ptr [rbp-388h] | |
| vshufps xmm6, xmm0, xmm0, 55h ; 'U' | |
| vshufps xmm1, xmm0, xmm0, 0FFh | |
| vmovups ymmword ptr [rax-20h], ymm0 | |
| vaddss xmm5, xmm5, xmm0 | |
| vaddss xmm6, xmm6, xmm5 | |
| vunpckhps xmm5, xmm0, xmm0 | |
| vextractf128 xmm0, ymm0, 1 | |
| vaddss xmm5, xmm5, xmm6 | |
| vaddss xmm1, xmm1, xmm5 | |
| vshufps xmm5, xmm0, xmm0, 55h ; 'U' | |
| vaddss xmm1, xmm0, xmm1 | |
| vaddss xmm5, xmm5, xmm1 | |
| vunpckhps xmm1, xmm0, xmm0 | |
| vshufps xmm0, xmm0, xmm0, 0FFh | |
| vaddss xmm1, xmm1, xmm5 | |
| vaddss xmm5, xmm1, xmm0 | |
| vmovss dword ptr [rbp-388h], xmm5 | |
| } | |
| } | |
| while ( _RAX != v603 ); | |
| v623 = v598 & 0xFFFFFFFFFFFFFFF8LL; | |
| if ( (v598 & 7) != 0 ) | |
| { | |
| __asm { vzeroupper } | |
| LABEL_144: | |
| v624 = v598 - v623; | |
| v625 = v598 - v623 - 1; | |
| v626 = v625 <= 2; | |
| if ( v625 <= 2 ) | |
| goto LABEL_230; | |
| _RDX = v594 + 4 * v623; | |
| __asm | |
| { | |
| vpcmpeqd xmm5, xmm5, xmm5 | |
| vbroadcastss xmm3, cs:dword_600C | |
| vbroadcastss xmm0, cs:dword_6010 | |
| vmovups xmm2, xmmword ptr [rdx] | |
| vcmpltps xmm4, xmm3, xmm2 | |
| vcmpltps xmm1, xmm2, xmm0 | |
| vpor xmm1, xmm4, xmm1 | |
| vpxor xmm1, xmm1, xmm5 | |
| vblendvps xmm0, xmm0, xmm2, xmm1 | |
| vmovss xmm1, dword ptr [rbp-388h] | |
| vblendvps xmm0, xmm0, xmm3, xmm4 | |
| vaddss xmm1, xmm1, xmm0 | |
| vshufps xmm2, xmm0, xmm0, 55h ; 'U' | |
| vmovups xmmword ptr [rdx], xmm0 | |
| } | |
| v623 += v624 & 0xFFFFFFFFFFFFFFFCLL; | |
| __asm | |
| { | |
| vaddss xmm2, xmm2, xmm1 | |
| vunpckhps xmm1, xmm0, xmm0 | |
| vshufps xmm0, xmm0, xmm0, 0FFh | |
| vaddss xmm1, xmm1, xmm2 | |
| vaddss xmm3, xmm1, xmm0 | |
| vmovss dword ptr [rbp-388h], xmm3 | |
| } | |
| v626 = (v624 & 3) == 0; | |
| if ( (v624 & 3) != 0 ) | |
| { | |
| LABEL_230: | |
| v642 = 4 * v623; | |
| _RDX = v594 + 4 * v623; | |
| __asm | |
| { | |
| vmovss xmm0, dword ptr [rdx] | |
| vcomiss xmm0, cs:dword_600C | |
| } | |
| if ( v626 ) | |
| { | |
| __asm | |
| { | |
| vmovss xmm1, cs:dword_6010 | |
| vmaxss xmm0, xmm1, xmm0 | |
| } | |
| } | |
| else | |
| { | |
| __asm { vmovss xmm0, cs:dword_600C } | |
| } | |
| __asm | |
| { | |
| vaddss xmm3, xmm0, dword ptr [rbp-388h] | |
| vmovss dword ptr [rdx], xmm0 | |
| } | |
| __asm { vmovss dword ptr [rbp-388h], xmm3 } | |
| if ( v623 + 1 < v596 ) | |
| { | |
| _RDX = v594 + v642 + 4; | |
| __asm | |
| { | |
| vmovss xmm0, dword ptr [rdx] | |
| vcomiss xmm0, cs:dword_600C | |
| } | |
| if ( v623 + 1 > v596 ) | |
| { | |
| __asm { vmovss xmm0, cs:dword_600C } | |
| } | |
| else | |
| { | |
| __asm | |
| { | |
| vmovss xmm1, cs:dword_6010 | |
| vmaxss xmm0, xmm1, xmm0 | |
| } | |
| } | |
| __asm { vaddss xmm3, xmm0, dword ptr [rbp-388h] } | |
| v647 = v623 + 2; | |
| __asm | |
| { | |
| vmovss dword ptr [rdx], xmm0 | |
| vmovss dword ptr [rbp-388h], xmm3 | |
| } | |
| if ( v647 < v596 ) | |
| { | |
| _RAX = v594 + v642 + 8; | |
| __asm | |
| { | |
| vmovss xmm0, dword ptr [rax] | |
| vcomiss xmm0, cs:dword_600C | |
| } | |
| if ( v647 <= v596 ) | |
| { | |
| __asm | |
| { | |
| vmovss xmm1, cs:dword_6010 | |
| vmaxss xmm0, xmm1, xmm0 | |
| } | |
| } | |
| else | |
| { | |
| __asm { vmovss xmm0, cs:dword_600C } | |
| } | |
| __asm | |
| { | |
| vaddss xmm3, xmm0, dword ptr [rbp-388h] | |
| vmovss dword ptr [rax], xmm0 | |
| vmovss dword ptr [rbp-388h], xmm3 | |
| } | |
| } | |
| } | |
| } | |
| } | |
| else | |
| { | |
| __asm { vzeroupper } | |
| } | |
| } | |
| ScopedTimer::~ScopedTimer((ScopedTimer *)v949); | |
| std::string::basic_string<std::allocator<char>>(v948, "soft_clip_avx"); | |
| ScopedTimer::ScopedTimer(v949, v948, &v913); | |
| std::string::_M_dispose(v948); | |
| _RCX = v926; | |
| v652 = (v927 - v926) >> 2; | |
| if ( (unsigned __int64)(v927 - v926) <= 0x1C ) | |
| { | |
| LODWORD(v901) = 0; | |
| _RAX = 0; | |
| } | |
| else | |
| { | |
| LODWORD(v901) = 0; | |
| _RAX = 8; | |
| __asm | |
| { | |
| vbroadcastss ymm3, cs:dword_6010 | |
| vbroadcastss ymm2, cs:dword_600C | |
| } | |
| while ( 1 ) | |
| { | |
| __asm { vmovups ymm4, ymmword ptr [rcx+rax*4-20h] } | |
| __asm | |
| { | |
| vminps ymm1, ymm4, ymm2 | |
| vmaxps ymm1, ymm1, ymm3 | |
| vmovaps xmm0, xmm1 | |
| vmovups ymmword ptr [rcx+rax*4-20h], ymm1 | |
| vextractf128 xmm1, ymm1, 1 | |
| vaddps xmm0, xmm0, xmm1 | |
| vmovshdup xmm1, xmm0 | |
| vaddps xmm0, xmm0, xmm1 | |
| vmovhlps xmm1, xmm1, xmm0 | |
| vaddss xmm0, xmm0, xmm1 | |
| vaddss xmm7, xmm0, dword ptr [rbp-380h] | |
| vmovss dword ptr [rbp-380h], xmm7 | |
| } | |
| if ( v652 < _RAX + 8 ) | |
| break; | |
| _RAX += 8LL; | |
| } | |
| } | |
| if ( _RAX < v652 ) | |
| { | |
| v665 = v652 - _RAX; | |
| if ( v652 - _RAX - 1 <= 6 ) | |
| { | |
| v685 = _RAX; | |
| v681 = 0; | |
| goto LABEL_162; | |
| } | |
| _RDI = _RCX + 4 * _RAX; | |
| __asm | |
| { | |
| vpcmpeqd ymm5, ymm5, ymm5 | |
| vbroadcastss ymm3, cs:dword_600C | |
| vbroadcastss ymm0, cs:dword_6010 | |
| vmovups ymm2, ymmword ptr [rdi] | |
| vcmpltps ymm4, ymm3, ymm2 | |
| vcmpltps ymm1, ymm2, ymm0 | |
| vpor ymm1, ymm4, ymm1 | |
| vpxor ymm1, ymm1, ymm5 | |
| vblendvps ymm0, ymm0, ymm2, ymm1 | |
| vmovss xmm2, dword ptr [rbp-380h] | |
| vblendvps ymm0, ymm0, ymm3, ymm4 | |
| vaddss xmm2, xmm2, xmm0 | |
| vshufps xmm3, xmm0, xmm0, 55h ; 'U' | |
| vshufps xmm1, xmm0, xmm0, 0FFh | |
| vmovups ymmword ptr [rdi], ymm0 | |
| } | |
| v681 = v665 & 0xFFFFFFFFFFFFFFF8LL; | |
| __asm | |
| { | |
| vaddss xmm3, xmm3, xmm2 | |
| vunpckhps xmm2, xmm0, xmm0 | |
| vextractf128 xmm0, ymm0, 1 | |
| } | |
| v685 = _RAX + (v665 & 0xFFFFFFFFFFFFFFF8LL); | |
| __asm | |
| { | |
| vaddss xmm2, xmm2, xmm3 | |
| vaddss xmm1, xmm1, xmm2 | |
| vshufps xmm2, xmm0, xmm0, 55h ; 'U' | |
| vaddss xmm1, xmm0, xmm1 | |
| vaddss xmm2, xmm2, xmm1 | |
| vunpckhps xmm1, xmm0, xmm0 | |
| vshufps xmm0, xmm0, xmm0, 0FFh | |
| vaddss xmm1, xmm1, xmm2 | |
| vaddss xmm3, xmm1, xmm0 | |
| vmovss dword ptr [rbp-380h], xmm3 | |
| } | |
| if ( (v665 & 7) != 0 ) | |
| { | |
| LABEL_162: | |
| v691 = v665 - v681; | |
| v692 = v691 - 1 <= 2; | |
| if ( v691 - 1 <= 2 ) | |
| goto LABEL_231; | |
| __asm { vbroadcastss xmm3, cs:dword_600C } | |
| __asm | |
| { | |
| vpcmpeqd xmm5, xmm5, xmm5 | |
| vbroadcastss xmm0, cs:dword_6010 | |
| } | |
| _RDI = _RCX + 4 * (v681 + _RAX); | |
| __asm { vmovups xmm2, xmmword ptr [rdi] } | |
| v685 += v691 & 0xFFFFFFFFFFFFFFFCLL; | |
| v698 = v691 & 3; | |
| v692 = v698 == 0; | |
| __asm | |
| { | |
| vcmpltps xmm4, xmm3, xmm2 | |
| vcmpltps xmm1, xmm2, xmm0 | |
| vpor xmm1, xmm4, xmm1 | |
| vpxor xmm1, xmm1, xmm5 | |
| vblendvps xmm0, xmm0, xmm2, xmm1 | |
| vmovss xmm1, dword ptr [rbp-380h] | |
| vblendvps xmm0, xmm0, xmm3, xmm4 | |
| vaddss xmm1, xmm1, xmm0 | |
| vshufps xmm2, xmm0, xmm0, 55h ; 'U' | |
| vmovups xmmword ptr [rdi], xmm0 | |
| vaddss xmm2, xmm2, xmm1 | |
| vunpckhps xmm1, xmm0, xmm0 | |
| vshufps xmm0, xmm0, xmm0, 0FFh | |
| vaddss xmm1, xmm1, xmm2 | |
| vaddss xmm3, xmm1, xmm0 | |
| vmovss dword ptr [rbp-380h], xmm3 | |
| } | |
| if ( v698 ) | |
| { | |
| LABEL_231: | |
| v709 = 4 * v685; | |
| _RAX = _RCX + 4 * v685; | |
| __asm | |
| { | |
| vmovss xmm0, dword ptr [rax] | |
| vcomiss xmm0, cs:dword_600C | |
| } | |
| if ( v692 ) | |
| { | |
| __asm | |
| { | |
| vmovss xmm1, cs:dword_6010 | |
| vmaxss xmm0, xmm1, xmm0 | |
| } | |
| } | |
| else | |
| { | |
| __asm { vmovss xmm0, cs:dword_600C } | |
| } | |
| __asm | |
| { | |
| vaddss xmm3, xmm0, dword ptr [rbp-380h] | |
| vmovss dword ptr [rax], xmm0 | |
| } | |
| __asm { vmovss dword ptr [rbp-380h], xmm3 } | |
| if ( v685 + 1 < v652 ) | |
| { | |
| _RAX = _RCX + v709 + 4; | |
| __asm | |
| { | |
| vmovss xmm0, dword ptr [rax] | |
| vcomiss xmm0, cs:dword_600C | |
| } | |
| if ( v685 + 1 > v652 ) | |
| { | |
| __asm { vmovss xmm0, cs:dword_600C } | |
| } | |
| else | |
| { | |
| __asm | |
| { | |
| vmovss xmm1, cs:dword_6010 | |
| vmaxss xmm0, xmm1, xmm0 | |
| } | |
| } | |
| __asm | |
| { | |
| vaddss xmm3, xmm0, dword ptr [rbp-380h] | |
| vmovss dword ptr [rax], xmm0 | |
| } | |
| __asm { vmovss dword ptr [rbp-380h], xmm3 } | |
| if ( v685 + 2 < v652 ) | |
| { | |
| _RAX = _RCX + v709 + 8; | |
| __asm | |
| { | |
| vmovss xmm0, dword ptr [rax] | |
| vcomiss xmm0, cs:dword_600C | |
| } | |
| if ( v685 + 2 <= v652 ) | |
| { | |
| __asm | |
| { | |
| vmovss xmm1, cs:dword_6010 | |
| vmaxss xmm0, xmm1, xmm0 | |
| } | |
| } | |
| else | |
| { | |
| __asm { vmovss xmm0, cs:dword_600C } | |
| } | |
| __asm | |
| { | |
| vaddss xmm3, xmm0, dword ptr [rbp-380h] | |
| vmovss dword ptr [rax], xmm0 | |
| vmovss dword ptr [rbp-380h], xmm3 | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| __asm { vzeroupper } | |
| ScopedTimer::~ScopedTimer((ScopedTimer *)v949); | |
| *(double *)&_XMM0 = std::__ostream_insert<char,std::char_traits<char>>( | |
| &std::cout, | |
| "Soft clip scalar: checksum=", | |
| 27, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v716); | |
| __asm { vcvtss2sd xmm0, xmm0, dword ptr [rbp-388h] } | |
| v718 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0); | |
| std::__ostream_insert<char,std::char_traits<char>>( | |
| v718, | |
| " time=", | |
| 7, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v719); | |
| __asm { vmovsd xmm0, qword ptr [rbp-320h] } | |
| v721 = std::ostream::_M_insert<double>(v718, *(double *)&_XMM0); | |
| std::operator<<<std::char_traits<char>>(v721, " ms\n"); | |
| *(double *)&_XMM0 = std::__ostream_insert<char,std::char_traits<char>>( | |
| &std::cout, | |
| "Soft clip AVX : checksum=", | |
| 27, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v722); | |
| __asm { vcvtss2sd xmm0, xmm0, dword ptr [rbp-380h] } | |
| v724 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0); | |
| std::__ostream_insert<char,std::char_traits<char>>( | |
| v724, | |
| " time=", | |
| 7, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v725); | |
| __asm { vmovsd xmm0, qword ptr [rbp-318h] } | |
| v727 = std::ostream::_M_insert<double>(v724, *(double *)&_XMM0); | |
| std::operator<<<std::char_traits<char>>(v727, " ms\n"); | |
| std::__ostream_insert<char,std::char_traits<char>>( | |
| &std::cout, | |
| "Delta clip checksum (AVX - scalar): ", | |
| 36, | |
| *(double *)&_XMM0, | |
| *(double *)&_XMM1, | |
| *(double *)&_XMM2, | |
| *(double *)&_XMM3, | |
| v728); | |
| __asm | |
| { | |
| vmovss xmm3, dword ptr [rbp-380h] | |
| vsubss xmm0, xmm3, dword ptr [rbp-388h] | |
| } | |
| __asm { vcvtss2sd xmm0, xmm0, xmm0 } | |
| v732 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0); | |
| std::operator<<<std::char_traits<char>>(v732, "\n"); | |
| std::operator<<<std::char_traits<char>>(&std::cout, "\nDone.\n"); | |
| std::vector<float>::~vector(&v926); | |
| std::vector<float>::~vector(&v924); | |
| std::vector<float>::~vector(&v922); | |
| std::vector<float>::~vector(v947); | |
| std::vector<float>::~vector(v946); | |
| std::vector<float>::~vector(v945); | |
| std::vector<float>::~vector(v944); | |
| std::vector<float>::~vector(v943); | |
| std::vector<float>::~vector(v942); | |
| std::vector<float>::~vector(v941); | |
| std::vector<float>::~vector(v940); | |
| std::vector<float>::~vector(&v939); | |
| std::vector<float>::~vector(v938); | |
| std::vector<float>::~vector(&v937); | |
| std::vector<float>::~vector(&v935); | |
| std::vector<float>::~vector(v934); | |
| std::vector<float>::~vector(v932); | |
| std::vector<float>::~vector(&v929); | |
| std::vector<float>::~vector(v921); | |
| std::vector<float>::~vector(&v919); | |
| std::vector<float>::~vector(v918); | |
| std::vector<float>::~vector(&v916); | |
| std::vector<float>::~vector(&v914); | |
| return 0; | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Decompiler-synthesized global; the name appears to encode address 0x400000.
// In the visible part of main() its address is only ever passed as the single
// argument of the `...allocator<float> const&) (.constprop.0)` calls.
// NOTE(review): 0x400000 also shows up as a loop bound further down in main(),
// so this may be an immediate count misread as a pointer rather than a real
// variable -- confirm against the disassembly.
| extern unsigned long long g_400000; | |
// Decompiler-synthesized global; the name appears to encode address 0x408040.
// The declared type `void` is not valid C++ and is kept exactly as emitted.
// main() uses it as the receiver of the `char_traits<char>>&, char const*)`
// call that is handed the "=== Workload 1 ..." banner string.
// NOTE(review): presumably std::cout -- confirm against the binary's symbols.
| extern void g_408040; | |
| int main() | |
| { | |
| void* v0; // [bp-0x490] | |
| unsigned long v1; // [bp-0x488] | |
| char *v2; // [bp-0x460] | |
| char *v3; // [bp-0x458] | |
| unsigned int v4; // [bp-0x450] | |
| unsigned int v5; // [bp-0x44c] | |
| char *v6; // [bp-0x448] | |
| char *v7; // [bp-0x440] | |
| unsigned int v8; // [bp-0x438], Other Possible Types: unsigned long | |
| void* v9; // [sp-0x430], Other Possible Types: unsigned long long | |
| unsigned int v10; // [bp-0x430] | |
| unsigned long v11; // [sp-0x428], Other Possible Types: unsigned long long | |
| unsigned long long v12; // [sp-0x420] | |
| unsigned int v13; // [sp-0x418], Other Possible Types: unsigned long long | |
| unsigned int v14; // [bp-0x418], Other Possible Types: unsigned long | |
| unsigned long v15; // [sp-0x410], Other Possible Types: unsigned long long | |
| uint128_t *v16; // [bp-0x408], Other Possible Types: unsigned int, unsigned long long | |
| char *v17; // [sp-0x400], Other Possible Types: unsigned long | |
| char *v18; // [bp-0x3f8] | |
| char *v19; // [bp-0x3f0], Other Possible Types: unsigned long long | |
| unsigned int v20; // [bp-0x3f0] | |
| char *v21; // [bp-0x3e8], Other Possible Types: char, unsigned int | |
| char *v22; // [bp-0x3e0], Other Possible Types: unsigned int | |
| unsigned long long v23; // [bp-0x3e0] | |
| char *v24; // [bp-0x3d8] | |
| unsigned long v25; // [bp-0x3d0] | |
| unsigned int v26; // [bp-0x3c8] | |
| void* v27; // [bp-0x3c0], Other Possible Types: unsigned int *, unsigned long | |
| uint256_t *v28; // [sp-0x3b8], Other Possible Types: void*, unsigned long | |
| void* v29; // [bp-0x3b0], Other Possible Types: uint256_t *, unsigned int | |
| unsigned int v30; // [bp-0x3a8], Other Possible Types: unsigned long | |
| void* v31; // [bp-0x3a0] | |
| unsigned int v32; // [bp-0x3a0] | |
| unsigned long long v33; // [bp-0x398], Other Possible Types: unsigned int | |
| unsigned int v34; // [bp-0x390] | |
| int v35; // [sp-0x390], Other Possible Types: unsigned long long | |
| unsigned int v36; // [bp-0x390] | |
| unsigned int v37; // [bp-0x390] | |
| void* v38; // [bp-0x380] | |
| void* v39; // [bp-0x378] | |
| void* v40; // [bp-0x370] | |
| void* v41; // [bp-0x368] | |
| void* v42; // [bp-0x360] | |
| void* v43; // [bp-0x358] | |
| void* v44; // [bp-0x350] | |
| void* v45; // [bp-0x348] | |
| void* v46; // [bp-0x340] | |
| void* v47; // [bp-0x338] | |
| void* v48; // [bp-0x330] | |
| void* v49; // [bp-0x328] | |
| unsigned int *v50; // [bp-0x320], Other Possible Types: char | |
| char v51; // [bp-0x318] | |
| void* v52; // [bp-0x300], Other Possible Types: char | |
| char v53; // [bp-0x2f8] | |
| char v54; // [bp-0x2e0] | |
| unsigned int *v55; // [bp-0x2c0], Other Possible Types: char | |
| char v56; // [bp-0x2b8] | |
| char v57; // [bp-0x2a0] | |
| unsigned int *v58; // [bp-0x280], Other Possible Types: char | |
| char v59; // [bp-0x278] | |
| void* v60; // [bp-0x260], Other Possible Types: char | |
| char v61; // [bp-0x258] | |
| unsigned int *v62; // [bp-0x240], Other Possible Types: char | |
| char v63; // [bp-0x238] | |
| unsigned long long v64; // [bp-0x220] | |
| unsigned int v65; // [bp-0x21c] | |
| void* v66; // [bp-0x218], Other Possible Types: char | |
| char v67; // [bp-0x210] | |
| unsigned long long v68; // [bp-0x200] | |
| char v69; // [bp-0x1f8] | |
| unsigned long long v70; // [bp-0x1e0] | |
| char v71; // [bp-0x1d8] | |
| void* v72; // [bp-0x1c0], Other Possible Types: char | |
| char v73; // [bp-0x1b8] | |
| char v74; // [bp-0x1a8] | |
| char v75; // [bp-0x190] | |
| char v76; // [bp-0x178] | |
| char v77; // [bp-0x160] | |
| char v78; // [bp-0x148] | |
| char v79; // [bp-0x130] | |
| char v80; // [bp-0x118] | |
| char v81; // [bp-0x100] | |
| char v82; // [bp-0xe8] | |
| char v83; // [bp-0xd0] | |
| char v84; // [bp-0xb8] | |
| char v85; // [bp-0xa0] | |
| char v86; // [bp-0x80] | |
| unsigned int *v88; // rbx | |
| unsigned long long v89; // rcx | |
| unsigned int *v90; // rdx | |
| unsigned int *v91; // rdx | |
| int v92; // xmm0 | |
| int v93; // xmm1 | |
| void* v94; // rdx | |
| unsigned long long v95; // rcx | |
| int v96; // ymm3, Other Possible Types: uint256_t | |
| unsigned int *v97; // r12 | |
| unsigned long long v98; // rcx | |
| unsigned int *v99; // rdx | |
| unsigned int *v100; // rdx | |
| void* v101; // rax | |
| unsigned long v102; // rdx | |
| uint256_t v104; // ymm0 | |
| int v105; // ymm2 | |
| uint256_t v106; // ymm2 | |
| int v107; // ymm4 | |
| uint256_t v108; // ymm4 | |
| int v109; // ymm5 | |
| uint256_t v110; // ymm5 | |
| int v111; // ymm6 | |
| uint256_t v112; // ymm6 | |
| int v113; // ymm7 | |
| uint256_t v114; // ymm7 | |
| uint256_t v116; // ymm0 | |
| void* v117; // rax | |
| uint256_t v118; // ymm0 | |
| void* v119; // rax | |
| uint256_t v120; // ymm0 | |
| uint256_t v121; // ymm0 | |
| uint256_t v122; // ymm0 | |
| uint256_t v123; // ymm0 | |
| uint256_t v124; // ymm0 | |
| uint256_t v125; // ymm0 | |
| unsigned int v126[8]; // rax | |
| unsigned long long v127; // rdx | |
| int v128; // ymm1 | |
| uint256_t v129; // ymm0 | |
| unsigned int v130[8]; // rax | |
| uint256_t v131; // ymm0 | |
| unsigned int v132[8]; // rax | |
| uint256_t v133; // ymm0 | |
| uint256_t v134; // ymm0 | |
| uint256_t v135; // ymm0 | |
| uint256_t v136; // ymm0 | |
| uint256_t v137; // ymm0 | |
| uint256_t v138; // ymm0 | |
| uint256_t v139; // ymm0 | |
| uint256_t v140; // ymm1 | |
| uint256_t v141; // ymm5 | |
| uint256_t v142; // ymm6 | |
| uint256_t v143; // ymm7 | |
| uint256_t v144; // ymm4 | |
| void* v145; // rax | |
| uint256_t v146; // ymm2 | |
| uint256_t v149; // ymm0 | |
| uint256_t v150; // ymm1 | |
| uint256_t v151; // ymm1 | |
| unsigned long long v152; // r12 | |
| uint256_t v153; // ymm2 | |
| void* v154; // rax | |
| int v155; // ymm3 | |
| int v156; // ymm4 | |
| uint256_t v157; // ymm0 | |
| uint256_t v158; // ymm4 | |
| uint256_t v159; // ymm4 | |
| uint256_t v160; // ymm0 | |
| uint256_t v161; // ymm4 | |
| uint256_t v162; // ymm0 | |
| uint256_t v163; // ymm4 | |
| uint256_t v164; // ymm3 | |
| uint256_t v165; // ymm0 | |
| uint256_t v166; // ymm1 | |
| uint256_t v167; // ymm0 | |
| uint256_t v168; // ymm1 | |
| uint256_t v169; // ymm0 | |
| uint256_t v170; // ymm2 | |
| uint256_t v171; // ymm1 | |
| uint256_t v172; // ymm0 | |
| uint256_t v173; // ymm1 | |
| uint256_t v174; // ymm1 | |
| uint256_t v175; // ymm0 | |
| uint256_t v176; // ymm1 | |
| uint256_t v177; // ymm0 | |
| uint256_t v178; // ymm1 | |
| uint256_t v179; // ymm3 | |
| void* v180; // r15 | |
| unsigned int v181; // rcx | |
| uint256_t v182; // ymm3 | |
| unsigned long v183; // xmm0lq | |
| unsigned long v184; // xmm0hq | |
| void* v185; // rax | |
| uint256_t v187; // ymm3 | |
| unsigned long v188; // xmm0lq | |
| unsigned long v189; // xmm0hq | |
| void* v190; // rax | |
| uint256_t v191; // ymm0 | |
| uint256_t v193; // ymm3 | |
| unsigned long v194; // xmm0lq | |
| unsigned long v195; // xmm0hq | |
| void* v196; // rax | |
| uint256_t v198; // ymm3 | |
| unsigned long v199; // xmm0lq | |
| unsigned long v200; // xmm0hq | |
| void* v201; // rax | |
| uint256_t v202; // ymm0 | |
| void* v204; // r13 | |
| unsigned long long v205; // rcx | |
| int v206; // ymm1 | |
| void* v207; // rdx | |
| void* v208; // rdx | |
| uint256_t v209; // ymm0 | |
| unsigned long v210; // rsi | |
| unsigned long long v211; // r14 | |
| unsigned long v212; // rbx | |
| void* v213; // r12 | |
| unsigned long v214; // rsi | |
| unsigned int v215; // ecx | |
| unsigned long v216; // rax | |
| int v217; // ymm0, Other Possible Types: uint256_t | |
| unsigned int *v218; // rcx | |
| unsigned long long v219; // r9 | |
| unsigned int v220; // esi | |
| unsigned int *v221; // rax | |
| unsigned long v223; // r8 | |
| uint256_t v224; // ymm1 | |
| unsigned int v225; // r11d | |
| unsigned int v226; // r11d | |
| void* v227; // rax | |
| int v228; // ymm2 | |
| unsigned int *v229; // rsi | |
| void* v230; // rdx | |
| unsigned long long v231; // r12 | |
| unsigned long v232; // r14 | |
| int v233; // ymm4, Other Possible Types: uint256_t | |
| uint256_t v234; // ymm5 | |
| int v235; // ymm6, Other Possible Types: uint256_t | |
| unsigned long v236; // r8 | |
| unsigned long v237; // r9 | |
| void* v238; // rdi | |
| int v239; // ymm0 | |
| uint256_t v240; // ymm4 | |
| uint256_t v241; // ymm6 | |
| uint256_t v242; // ymm5 | |
| uint256_t v243; // ymm4 | |
| uint256_t v244; // ymm0 | |
| uint256_t v245; // ymm4 | |
| uint256_t v246; // ymm4 | |
| uint256_t v247; // ymm5 | |
| uint256_t v248; // ymm4 | |
| uint256_t v249; // ymm4 | |
| uint256_t v250; // ymm5 | |
| unsigned int v251; // r9d | |
| unsigned long v252; // rdi | |
| unsigned int v253; // r10d | |
| unsigned int v254; // edi | |
| unsigned long long v255; // r8 | |
| unsigned long v256; // r11 | |
| uint256_t v257; // ymm0 | |
| uint256_t v258; // ymm0 | |
| uint256_t v259; // ymm0 | |
| uint256_t v260; // ymm0 | |
| uint256_t v261; // ymm0 | |
| uint256_t v262; // ymm4 | |
| uint256_t v263; // ymm5 | |
| uint256_t v264; // ymm4 | |
| uint256_t v265; // ymm4 | |
| unsigned long v266; // r8 | |
| unsigned long v267; // r9 | |
| unsigned int *v268; // rbx | |
| uint256_t v269; // ymm0 | |
| unsigned int *v270; // r11 | |
| uint256_t v271; // ymm0 | |
| uint256_t v272; // ymm0 | |
| uint256_t v273; // ymm0 | |
| unsigned int v274; // edi | |
| uint256_t v275; // ymm0 | |
| uint256_t v276; // ymm0 | |
| uint256_t v277; // ymm0 | |
| uint256_t v278; // ymm0 | |
| unsigned long v279; // rdi | |
| uint256_t v280; // ymm0 | |
| uint256_t v281; // ymm0 | |
| uint256_t v282; // ymm0 | |
| uint256_t v283; // ymm0 | |
| unsigned long long v284; // rdi | |
| uint256_t v285; // ymm0 | |
| uint256_t v286; // ymm0 | |
| uint256_t v287; // ymm0 | |
| uint256_t v288; // ymm0 | |
| unsigned long v289; // rsi | |
| unsigned long long v290; // r15 | |
| unsigned int v291; // ecx | |
| unsigned long v292; // rax | |
| int v293; // ymm0, Other Possible Types: uint256_t | |
| unsigned int v294; // ecx | |
| unsigned int *v295; // rax | |
| uint256_t v296; // ymm3 | |
| unsigned long v297; // rbx | |
| int v298; // ymm2 | |
| uint256_t v299; // ymm4 | |
| uint256_t v300; // ymm5 | |
| uint256_t v301; // ymm7 | |
| unsigned long v302; // rbx | |
| unsigned long long v303; // rdx | |
| unsigned int *v304; // rax | |
| unsigned long v305; // rbx | |
| unsigned int *v306; // rcx | |
| unsigned int *v307; // rdx | |
| unsigned int *v308; // rsi | |
| unsigned long long v309; // rdi | |
| unsigned long v310; // r8 | |
| int v311; // ymm0 | |
| uint256_t v312; // ymm1 | |
| uint256_t v313; // ymm0 | |
| uint256_t v314; // ymm0 | |
| uint256_t v315; // ymm1 | |
| uint256_t v316; // ymm0 | |
| unsigned long long v317; // r11 | |
| unsigned long long v318; // rdi | |
| unsigned int v319; // r10d | |
| unsigned long long v320; // r9 | |
| uint128_t *v321; // r8 | |
| unsigned int v322; // edi | |
| uint256_t v323; // ymm0 | |
| uint256_t v324; // ymm0 | |
| uint256_t v325; // ymm0 | |
| uint256_t v326; // ymm0 | |
| uint256_t v327; // ymm0 | |
| uint256_t v328; // ymm0 | |
| uint256_t v329; // ymm1 | |
| uint256_t v330; // ymm6 | |
| uint256_t v331; // ymm1 | |
| uint256_t v332; // ymm6 | |
| void* v333; // rdi | |
| unsigned int v334; // r10d | |
| unsigned int v335; // r9d | |
| int v336; // ymm1, Other Possible Types: uint256_t | |
| unsigned long long v337; // r10 | |
| unsigned long long v338; // r8 | |
| uint256_t v339; // ymm1 | |
| uint256_t v340; // ymm0 | |
| uint256_t v341; // ymm1 | |
| uint256_t v342; // ymm0 | |
| uint256_t v343; // ymm1 | |
| uint256_t v344; // ymm0 | |
| uint256_t v345; // ymm0 | |
| unsigned long long v346; // rdi | |
| uint256_t v347; // ymm0 | |
| uint256_t v348; // ymm0 | |
| uint256_t v349; // ymm0 | |
| uint256_t v350; // ymm0 | |
| unsigned int *v351; // r8 | |
| unsigned long v352; // r9 | |
| uint256_t v353; // ymm0 | |
| uint256_t v354; // ymm0 | |
| uint256_t v355; // ymm0 | |
| uint256_t v356; // ymm0 | |
| unsigned long v357; // rdi | |
| uint256_t v358; // ymm0 | |
| uint256_t v359; // ymm0 | |
| uint256_t v360; // ymm0 | |
| uint256_t v361; // ymm0 | |
| unsigned long v362; // r10 | |
| uint256_t v363; // ymm0 | |
| uint256_t v364; // ymm0 | |
| uint256_t v365; // ymm0 | |
| uint256_t v366; // ymm0 | |
| unsigned long v367; // rdi | |
| uint256_t v368; // ymm0 | |
| uint256_t v369; // ymm0 | |
| uint256_t v370; // ymm0 | |
| uint256_t v371; // ymm0 | |
| unsigned long v372; // r9 | |
| uint256_t v373; // ymm0 | |
| uint256_t v374; // ymm0 | |
| uint256_t v375; // ymm0 | |
| uint256_t v376; // ymm0 | |
| unsigned long v377; // rdi | |
| uint256_t v378; // ymm0 | |
| uint256_t v379; // ymm0 | |
| uint256_t v380; // ymm0 | |
| uint256_t v381; // ymm0 | |
| uint256_t v382; // ymm0 | |
| uint256_t v383; // ymm0 | |
| uint256_t v384; // ymm0 | |
| uint256_t v385; // ymm0 | |
| unsigned long v386; // rcx | |
| uint256_t v387; // ymm0 | |
| unsigned long v388; // xmm0hq | |
| void* v389; // rax | |
| uint256_t v391; // ymm0 | |
| unsigned long v392; // xmm0hq | |
| void* v393; // rax | |
| uint256_t v395; // ymm0 | |
| uint256_t v396; // ymm0 | |
| void* v398; // rbx | |
| uint256_t v399; // ymm1 | |
| uint256_t v400; // ymm0 | |
| uint256_t v401; // ymm0 | |
| int v402; // ymm2, Other Possible Types: uint256_t | |
| uint256_t v403; // ymm0 | |
| uint256_t v404; // ymm0 | |
| uint256_t v405; // ymm0 | |
| uint256_t v406; // ymm0 | |
| void* v407; // rbx | |
| unsigned long long v408; // r15 | |
| unsigned long long v409; // r13 | |
| void* v410; // rax | |
| void* v411; // rdx | |
| unsigned long v412; // r8 | |
| unsigned long v413; // r9 | |
| void* v414; // r8 | |
| int v415; // ymm2 | |
| int v416; // ymm4 | |
| int v417; // ymm5 | |
| uint256_t v418; // ymm0 | |
| uint256_t v419; // ymm5 | |
| uint256_t v420; // ymm4 | |
| uint256_t v421; // ymm0 | |
| uint256_t v422; // ymm5 | |
| uint256_t v423; // ymm1 | |
| uint256_t v424; // ymm0 | |
| uint256_t v425; // ymm0 | |
| uint256_t v426; // ymm4 | |
| uint256_t v427; // ymm0 | |
| uint256_t v428; // ymm0 | |
| uint256_t v429; // ymm4 | |
| uint256_t v430; // ymm0 | |
| uint256_t v431; // ymm4 | |
| unsigned long long v432; // r8 | |
| unsigned long long v433; // r9 | |
| void* v434; // r10 | |
| uint256_t v435; // ymm0 | |
| unsigned long long v436; // r9 | |
| uint256_t v437; // ymm0 | |
| uint128_t v438; // xmm3 | |
| uint256_t v439; // ymm0 | |
| uint256_t v440; // ymm0 | |
| void* v441; // r8 | |
| uint256_t v442; // ymm0 | |
| unsigned long long v443; // rax | |
| unsigned long long v444; // rcx | |
| int v445; // ymm5 | |
| unsigned long v446; // rdx | |
| uint256_t v447; // ymm0 | |
| uint256_t v448; // ymm1 | |
| uint256_t v449; // ymm0 | |
| uint256_t v450; // ymm1 | |
| void* v451; // r15 | |
| void* v452; // rdx | |
| uint128_t *v453; // r8 | |
| void* v454; // rdi | |
| unsigned long v455; // rcx | |
| unsigned long long v456; // r11 | |
| unsigned long long v457; // rdx | |
| unsigned long long v458; // rcx | |
| int v459; // ymm2 | |
| uint256_t v460; // ymm2 | |
| uint256_t v461; // ymm1 | |
| uint256_t v462; // ymm2 | |
| uint256_t v463; // ymm0 | |
| uint256_t v464; // ymm2 | |
| uint256_t v465; // ymm1 | |
| uint256_t v466; // ymm2 | |
| void* v467; // rdx | |
| uint256_t v468; // ymm2 | |
| uint256_t v469; // ymm0 | |
| unsigned long long v470; // rdx | |
| uint256_t v471; // ymm0 | |
| uint128_t v472; // xmm3 | |
| uint256_t v473; // ymm0 | |
| uint256_t v474; // ymm0 | |
| uint256_t v475; // ymm0 | |
| uint256_t v476; // ymm0 | |
| uint256_t v477; // ymm0 | |
| uint256_t v478; // ymm0 | |
| uint256_t v479; // ymm0 | |
| uint256_t v480; // ymm0 | |
| uint256_t v481; // ymm0 | |
| uint256_t v482; // ymm0 | |
| uint256_t v483; // ymm0 | |
| uint256_t v484; // ymm2 | |
| uint256_t v485; // ymm4 | |
| uint256_t v486; // ymm5 | |
| int v487; // ymm6, Other Possible Types: uint256_t | |
| unsigned long long v488; // r15 | |
| unsigned long long v489; // r14 | |
| void* v490; // rcx | |
| unsigned int *v491; // rdi | |
| void* v492; // rax | |
| unsigned long long v493; // rdi | |
| unsigned int *v494; // r8 | |
| void* v495; // rcx | |
| uint256_t v496; // ymm2 | |
| unsigned long long v497; // rax | |
| unsigned long long v498; // rdx | |
| unsigned long long v499; // rdx | |
| unsigned long v500; // r10 | |
| int v501; // ymm1 | |
| uint256_t v503; // ymm1 | |
| uint256_t v504; // ymm0 | |
| uint256_t v505; // ymm1 | |
| uint256_t v506; // ymm1 | |
| uint256_t v507; // ymm2 | |
| uint256_t v508; // ymm1 | |
| uint256_t v509; // ymm2 | |
| uint256_t v510; // ymm0 | |
| unsigned long v511; // xmm0hq | |
| void* v512; // rax | |
| uint256_t v514; // ymm0 | |
| unsigned long v515; // xmm0hq | |
| void* v516; // rax | |
| uint256_t v518; // ymm0 | |
| unsigned long v519; // xmm0hq | |
| void* v520; // rax | |
| uint256_t v522; // ymm0 | |
| unsigned long v523; // xmm0hq | |
| void* v524; // rax | |
| uint256_t v526; // ymm0 | |
| uint256_t v527; // ymm0 | |
| uint256_t v529; // ymm0 | |
| int v530; // ymm0, Other Possible Types: uint256_t | |
| void* v532; // rsi | |
| unsigned long long v533; // rcx | |
| unsigned long long v534; // rdi | |
| unsigned long v535; // rdx | |
| int v536; // ymm3 | |
| int v537; // ymm2 | |
| void* v538; // rax | |
| int v539; // ymm0 | |
| void* v540; // rax | |
| int v541; // ymm5 | |
| int v542; // ymm1 | |
| uint256_t v543; // ymm0 | |
| uint256_t v544; // ymm6 | |
| uint256_t v545; // ymm1 | |
| uint256_t v546; // ymm5 | |
| uint256_t v547; // ymm5 | |
| uint256_t v548; // ymm0 | |
| uint256_t v549; // ymm5 | |
| uint256_t v550; // ymm1 | |
| uint256_t v551; // ymm5 | |
| uint256_t v552; // ymm1 | |
| uint256_t v553; // ymm5 | |
| uint256_t v554; // ymm1 | |
| uint256_t v555; // ymm0 | |
| uint256_t v556; // ymm5 | |
| unsigned long long v557; // rcx | |
| unsigned long long v558; // rax | |
| uint128_t *v559; // rdx | |
| uint128_t v560; // xmm3 | |
| uint256_t v561; // ymm0 | |
| int v562; // xmm2 | |
| uint128_t v563; // xmm4 | |
| uint256_t v564; // ymm0 | |
| uint256_t v565; // ymm0 | |
| unsigned long v566; // rax | |
| uint256_t v567; // ymm0 | |
| uint256_t v568; // ymm0 | |
| uint256_t v569; // ymm0 | |
| unsigned int *v570; // rcx | |
| unsigned long long v571; // rax | |
| unsigned long long v572; // rsi | |
| unsigned long long v573; // rax | |
| int v574; // ymm3 | |
| int v575; // ymm2 | |
| unsigned long v576; // rdx | |
| int v577; // ymm1 | |
| uint256_t v578; // ymm1 | |
| uint256_t v579; // ymm0 | |
| uint256_t v580; // ymm1 | |
| unsigned long long v581; // rdx | |
| uint256_t *v582; // rdi | |
| int v583; // ymm3 | |
| int v584; // ymm0 | |
| int v585; // ymm2 | |
| uint256_t v586; // ymm0 | |
| unsigned long long v587; // rdi | |
| uint256_t v588; // ymm0 | |
| unsigned long long v589; // r8 | |
| unsigned long long v590; // rdx | |
| uint128_t v591; // xmm3 | |
| uint256_t v592; // ymm0 | |
| uint128_t *v593; // rdi | |
| uint128_t v594; // xmm2 | |
| uint128_t v595; // xmm4 | |
| uint256_t v596; // ymm0 | |
| uint256_t v597; // ymm0 | |
| unsigned int *v598; // rdx | |
| uint256_t v599; // ymm0 | |
| uint256_t v600; // ymm0 | |
| uint256_t v601; // ymm0 | |
| uint256_t v602; // ymm0 | |
| unsigned long v603; // xmm0hq | |
| void* v604; // rax | |
| uint256_t v606; // ymm0 | |
| unsigned long v607; // xmm0hq | |
| void* v608; // rax | |
| uint256_t v610; // ymm0 | |
| v50.allocator<float> const&) (.constprop.0)(&g_400000); | |
| v7 = &v52; | |
| v52.allocator<float> const&) (.constprop.0)(&g_400000); | |
| v6 = &v54; | |
| v54.allocator<float> const&) (.constprop.0)(&g_400000); | |
| v3 = &v55; | |
| v55.allocator<float> const&) (.constprop.0)(&g_400000); | |
| v2 = &v57; | |
| v57.allocator<float> const&) (.constprop.0)(&g_400000); | |
| v88 = v50; | |
| v89 = *((long long *)&v51); | |
| if (v89 != v88) | |
| { | |
| v90 = v88; | |
| do | |
| { | |
| v96 = v96 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455; | |
| v91 = v90 + 1; | |
| v91[1] = (unsigned int)(MulV((v92 & 18446744073709551615 | ((uint128_t)(v96 >> 64) & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455, v93 & 340282366920938463463374607431768211455)); | |
| v90 = v91; | |
| } while (v89 != v90); | |
| } | |
| v94 = v52; | |
| v95 = *((long long *)&v53); | |
| if (v95 != v94) | |
| { | |
| do | |
| { | |
| v96 = v96 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455; | |
| v94 += 4; | |
| *((unsigned int *)&v94[4]) = (unsigned int)(MulV((v92 & 18446744073709551615 | ((uint128_t)(v96 >> 64) & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455, v93 & 340282366920938463463374607431768211455)); | |
| } while (v94 != v95); | |
| } | |
| v97 = v55; | |
| v98 = *((long long *)&v56); | |
| if (v98 != v97) | |
| { | |
| v99 = v97; | |
| do | |
| { | |
| v96 = v96 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455; | |
| v100 = v99 + 1; | |
| v100[1] = (unsigned int)(MulV((v92 & 18446744073709551615 | ((uint128_t)(v96 >> 64) & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455, v93 & 340282366920938463463374607431768211455)); | |
| v99 = v100; | |
| } while (v98 != v99); | |
| } | |
| v2.allocator<float>> const&) (.isra.0)(v3); | |
| g_408040.char_traits<char>>&, char const*) (.isra.0)("=== Workload 1: SAXPY + cosine similarity ===\n"); | |
| v38 = 0; | |
| v39 = 0; | |
| v6.allocator<float>> const&) (.isra.0)(v7); | |
| v18 = &v85; | |
| v85.allocator<char> const&) (.constprop.0)("saxpy_scalar"); | |
| v24 = &v86; | |
| v86.ScopedTimer(&v85, &v38); | |
| v85._M_dispose(); | |
| v101 = *((long long *)&v54); | |
| v102 = 0; | |
| if (v101 - (v88 + 1) > 24) | |
| { | |
| do | |
| { | |
| *((void*)((char *)v101 + v102)) = v92; | |
| v102 += 32; | |
| } while (v102 != 0x1000000); | |
| v104 = v92 & 340282366920938463463374607431768211455; | |
| v106 = v105 & 340282366920938463463374607431768211455; | |
| v108 = v107 & 340282366920938463463374607431768211455; | |
| v110 = v109 & 340282366920938463463374607431768211455; | |
| v112 = v111 & 340282366920938463463374607431768211455; | |
| v114 = v113 & 340282366920938463463374607431768211455; | |
| } | |
| else | |
| { | |
| do | |
| { | |
| *((unsigned int *)((char *)v101 + 4 * v102)) = (unsigned int)v92; | |
| v102 += 1; | |
| } while (v102 != 0x400000); | |
| } | |
| v116 = v104 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455; | |
| v117 = v101; | |
| do | |
| { | |
| v118 = (v116 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v116 & 340282366920938463463374607431768211455, *((int *)v117))) & 340282366920938463463374607431768211455; | |
| v119 = v117 + 32; | |
| v120 = (v118 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v118, (int)v119[28])) & 340282366920938463463374607431768211455; | |
| v121 = (v120 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v120, (int)v119[24])) & 340282366920938463463374607431768211455; | |
| v122 = (v121 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v121, (int)v119[20])) & 340282366920938463463374607431768211455; | |
| v123 = (v122 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v122, (int)v119[16])) & 340282366920938463463374607431768211455; | |
| v124 = (v123 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v123, (int)v119[12])) & 340282366920938463463374607431768211455; | |
| v125 = (v124 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v124, (int)v119[8])) & 340282366920938463463374607431768211455; | |
| v116 = (v125 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v125, (int)v119[4])) & 340282366920938463463374607431768211455; | |
| v117 = v119; | |
| } while (v117 != v101 + 0x1000000); | |
| v6.allocator<float>> const&) (.isra.0)(v7); | |
| v18.allocator<char> const&) (.constprop.0)("saxpy_avx"); | |
| v24.ScopedTimer(v18, &v39); | |
| v18._M_dispose(); | |
| v126 = *((long long *)&v54); | |
| v127 = 8; | |
| v128 = [D] unsupported_<class 'pyvex.expr.Qop'>(); | |
| do | |
| { | |
| *((void*)&v126[1 + v127]) = v92; | |
| v127 += 8; | |
| } while (v127 != 4194312); | |
| v129 = v92 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455; | |
| v130 = v126; | |
| do | |
| { | |
| v131 = (v129 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v129 & 340282366920938463463374607431768211455, v130[0])) & 340282366920938463463374607431768211455; | |
| v132 = v130 + 1; | |
| v133 = (v131 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v131, v132[7])) & 340282366920938463463374607431768211455; | |
| v134 = (v133 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v133, v132[6])) & 340282366920938463463374607431768211455; | |
| v135 = (v134 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v134, v132[5])) & 340282366920938463463374607431768211455; | |
| v136 = (v135 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v135, v132[4])) & 340282366920938463463374607431768211455; | |
| v137 = (v136 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v136, v132[3])) & 340282366920938463463374607431768211455; | |
| v138 = (v137 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v137, v132[2])) & 340282366920938463463374607431768211455; | |
| v129 = (v138 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v138, v132[1])) & 340282366920938463463374607431768211455; | |
| v130 = v132; | |
| } while (v126 + 0x80000 != v130); | |
| v139 = v129 & 340282366920938463463374607431768211455; | |
| v140 = v128 & 340282366920938463463374607431768211455; | |
| v141 = v110 & 340282366920938463463374607431768211455; | |
| v142 = v112 & 340282366920938463463374607431768211455; | |
| v143 = v114 & 340282366920938463463374607431768211455; | |
| v40 = 0; | |
| v41 = 0; | |
| v18.allocator<char> const&) (.constprop.0)("cosine_scalar"); | |
| v24.ScopedTimer(v18, &v40); | |
| v18._M_dispose(); | |
| v144 = v108 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455; | |
| v145 = 0; | |
| v146 = (v106 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned long long)(v144 >> 64) CONCAT (unsigned long long)v144)) & 340282366920938463463374607431768211455; | |
| do | |
| { | |
| v143 = v143 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455; | |
| v140 = ((v140 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | *((int *)((char *)v88 + 0x4 * v145))) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | (v143 >> 64 & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455; | |
| v139 = ((v139 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | *((int *)((char *)v97 + 0x4 * v145))) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | (v143 >> 64 & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455; | |
| v145 += 1; | |
| v146 = (v146 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | [D] unsupported_<class 'pyvex.expr.Qop'>()) & 340282366920938463463374607431768211455; | |
| v144 = (v144 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | [D] unsupported_<class 'pyvex.expr.Qop'>()) & 340282366920938463463374607431768211455; | |
| } while (v145 != 0x400000); | |
| v149 = (v139 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v146, (uint128_t)v144)) & 340282366920938463463374607431768211455; | |
| v150 = v140 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455; | |
| if (((CmpF((unsigned long long)v150, (unsigned long long)v149) & 69 | (char)((CmpF((unsigned long long)v150, (unsigned long long)v149) & 69) >> 6)) & 1) == 1) | |
| v149 = (v149 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | UnaryOp Sqrt) & 340282366920938463463374607431768211455; | |
| else | |
| sqrt((unsigned long long)v149); | |
| v151 = v150 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455; | |
| if (((char)((CmpF((unsigned long long)v149, (unsigned long long)v151) & 69) >> 2) & 1) || (v152 = 0, ((char)(CmpF((unsigned long long)v149, (unsigned long long)v151)) & 64))) | |
| v152 = (unsigned long long)v96 | (unsigned long long)(v96 >> 64) * 0; | |
| v18.allocator<char> const&) (.constprop.0)("cosine_avx"); | |
| v24.ScopedTimer(v18, &v41); | |
| v18._M_dispose(); | |
| v153 = v146 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455; | |
| v154 = 0; | |
| v155 = v153; | |
| v156 = v153; | |
| while (true) | |
| { | |
| v154 += 8; | |
| if (v154 == 4194312) | |
| break; | |
| v151 = *((int256_t *)(-32 + (char *)v88 + 0x4 * v154)); | |
| v149 = *((int256_t *)(*((long long *)&v57) + v154 * 4 - 32)); | |
| } | |
| v157 = (v149 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v156 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455; | |
| v158 = (v156 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v156 / 0x100000000000000000000000000000000 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455; | |
| v159 = (v158 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v157 + v158) & 340282366920938463463374607431768211455; | |
| v160 = (v157 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v159 >> 96) CONCAT (unsigned int)((uint128_t)v159 >> 96) CONCAT (unsigned int)((unsigned long long)v159 >> 32) CONCAT (unsigned int)((unsigned long long)v159 >> 32))) & 340282366920938463463374607431768211455; | |
| v161 = (v159 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v159 + v160) & 340282366920938463463374607431768211455; | |
| v162 = (v160 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned long long)(v160 >> 64) CONCAT (unsigned long long)(v161 >> 64))) & 340282366920938463463374607431768211455; | |
| v163 = (v161 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v161, (uint128_t)v162)) & 340282366920938463463374607431768211455; | |
| v164 = (v155 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v155 / 0x100000000000000000000000000000000 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455; | |
| v165 = ((v162 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v155 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((v162 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v155 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455) + v164) & 340282366920938463463374607431768211455; | |
| v166 = (v151 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v165 >> 96) CONCAT (unsigned int)((uint128_t)v165 >> 96) CONCAT (unsigned int)((unsigned long long)v165 >> 32) CONCAT (unsigned int)((unsigned long long)v165 >> 32))) & 340282366920938463463374607431768211455; | |
| v167 = (v165 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v165 + v166) & 340282366920938463463374607431768211455; | |
| v168 = (v166 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned long long)(v166 >> 64) CONCAT (unsigned long long)(v167 >> 64))) & 340282366920938463463374607431768211455; | |
| v169 = (v167 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v167, (uint128_t)v168)) & 340282366920938463463374607431768211455; | |
| v170 = v153 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455; | |
| v171 = ((v168 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v153 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((v168 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v153 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455) + v170) & 340282366920938463463374607431768211455; | |
| v172 = ((v169 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v169 & 18446744073709551615) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | ((v169 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v169 & 18446744073709551615) >> 64 & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455; | |
| v173 = ...; | |
| v174 = ((v173 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v173 & 18446744073709551615) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | ((v173 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v173 & 18446744073709551615) >> 64 & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455; | |
| v175 = (v172 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v172, (uint128_t)v174)) & 340282366920938463463374607431768211455; | |
| v176 = v174 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455; | |
| if (((CmpF((unsigned long long)v176, (unsigned long long)v175) & 69 | (char)((CmpF((unsigned long long)v176, (unsigned long long)v175) & 69) >> 6)) & 1) == 1) | |
| { | |
| v177 = (v175 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | UnaryOp Sqrt) & 340282366920938463463374607431768211455; | |
| v178 = v176 & 340282366920938463463374607431768211455; | |
| v179 = v164 & 340282366920938463463374607431768211455; | |
| v233 = v163 & 340282366920938463463374607431768211455; | |
| v234 = v141 & 340282366920938463463374607431768211455; | |
| v235 = v142 & 340282366920938463463374607431768211455; | |
| v301 = v143 & 340282366920938463463374607431768211455; | |
| } | |
| else | |
| { | |
| *((uint128_t *)&v35) = v163; | |
| v177 = v175 & 340282366920938463463374607431768211455; | |
| v178 = v176 & 340282366920938463463374607431768211455; | |
| v179 = v164 & 340282366920938463463374607431768211455; | |
| v234 = v141 & 340282366920938463463374607431768211455; | |
| v235 = v142 & 340282366920938463463374607431768211455; | |
| v301 = v143 & 340282366920938463463374607431768211455; | |
| sqrt((unsigned long long)v177); | |
| v233 = (v163 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)v35) & 340282366920938463463374607431768211455; | |
| } | |
| v336 = v178 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455; | |
| if (((char)((CmpF((unsigned long long)v177, (unsigned long long)v336) & 69) >> 2) & 1) || (v180 = 0, ((char)(CmpF((unsigned long long)v177, (unsigned long long)v336)) & 64))) | |
| { | |
| v233 = ((v233 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v233 & 18446744073709551615) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | ((v233 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v233 & 18446744073709551615) >> 64 & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455; | |
| v177 = (v177 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | DivV((uint128_t)v233, (uint128_t)v177 & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455; | |
| v179 = (v96 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | (v177 >> 64 & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455; | |
| v180 = (unsigned long long)v179; | |
| } | |
| g_408040.char_traits<char>>("SAXPY scalar: checksum=", 0x18, v181); | |
| v182 = (v179 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v116 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455; | |
| v183 = v182; | |
| v184 = v182 >> 64; | |
| v185 = g_408040._M_insert<double>(v183 | v184 * 0); | |
| v185.char_traits<char>>(" time=", 0x7, v181); | |
| v185._M_insert<double>(v38).char_traits<char>>&, char const*) (.isra.0)(" ms\n"); | |
| g_408040.char_traits<char>>("SAXPY AVX : checksum=", 0x18, v181); | |
| v187 = (v182 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v129 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455; | |
| v188 = v187; | |
| v189 = v187 >> 64; | |
| v190 = g_408040._M_insert<double>(v188 | v189 * 0); | |
| v190.char_traits<char>>(" time=", 0x7, v181); | |
| v191 = ...; | |
| v190._M_insert<double>((unsigned long long)v191).char_traits<char>>&, char const*) (.isra.0)(" ms\n"); | |
| g_408040.char_traits<char>>("Cosine scalar: value=", 0x15, v181); | |
| v193 = (v187 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v152) & 340282366920938463463374607431768211455; | |
| v194 = v193; | |
| v195 = v193 >> 64; | |
| v196 = g_408040._M_insert<double>(v194 | v195 * 0); | |
| v196.char_traits<char>>(" time=", 0x7, v181); | |
| v196._M_insert<double>(v40).char_traits<char>>&, char const*) (.isra.0)(" ms\n"); | |
| g_408040.char_traits<char>>("Cosine AVX : value=", 0x15, v181); | |
| v198 = (v193 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)v180) & 340282366920938463463374607431768211455; | |
| v199 = v198; | |
| v200 = v198 >> 64; | |
| v201 = g_408040._M_insert<double>(v199 | v200 * 0); | |
| v201.char_traits<char>>(" time=", 0x7, v181); | |
| v202 = ...; | |
| v201._M_insert<double>((unsigned long long)v202).char_traits<char>>&, char const*) (.isra.0)(" ms\n"); | |
| g_408040.char_traits<char>>&, char const*) (.isra.0)("--------------------------------------------------------\n\n"); | |
| v64 = 4638564681600; | |
| v66.allocator<float> const&) (.constprop.0)(0x1fa400); | |
| v68 = 4638564681600; | |
| v69.allocator<float> const&) (.constprop.0)(0x1fa400); | |
| v70 = 4638564681600; | |
| v71.allocator<float> const&) (.constprop.0)(0x1fa400); | |
| v204 = v66; | |
| v205 = *((long long *)&v67); | |
| if (v204 != v205) | |
| { | |
| v336 = v206 & 340282366920938463463374607431768211455; | |
| v207 = v204; | |
| do | |
| { | |
| v198 = v198 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455; | |
| v208 = v207 + 4; | |
| v209 = (v92 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | (v198 >> 64 & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455; | |
| *((unsigned int *)&v208[4]) = MulV((uint128_t)v209, (uint128_t)v336); | |
| v207 = v208; | |
| } while (v207 != v205); | |
| } | |
| g_408040.char_traits<char>>&, char const*) (.isra.0)("=== Workload 2: 2D 5-point blur on 1080p image ===\n"); | |
| v42 = 0; | |
| v43 = 0; | |
| v18.allocator<char> const&) (.constprop.0)("blur_scalar"); | |
| v24.ScopedTimer(v18, &v42); | |
| v18._M_dispose(); | |
| v210 = (int)v64; | |
| v211 = v210 * 4; | |
| v30 = v210; | |
| v28 = *((long long *)&v69); | |
| memcpy(*((long long *)&v69), v204, v211); | |
| v212 = v65 - 1; | |
| v213 = v204 + v212 * v211; | |
| v214 = (int)v68; | |
| v23 = v214 * 4; | |
| memcpy(v212 * v23 + v28, v213, v211); | |
| if (v26 > 1) | |
| { | |
| if ((unsigned int)v214 == 1 && !((v215 = 1, v216 = v28 + 4, v32 != 1))) | |
| { | |
| do | |
| { | |
| v215 += 1; | |
| v216 += 4; | |
| *((unsigned int *)(v216 - 4)) = (unsigned int)v92; | |
| *((unsigned int *)(v216 - 4)) = (unsigned int)v92; | |
| } while (v26 != v215); | |
| v9 = v204 + v211; | |
| v218 = v23 + v28; | |
| } | |
| else | |
| { | |
| v219 = v23; | |
| v220 = 1; | |
| v9 = v204 + v211; | |
| v218 = v28 + v219; | |
| v221 = v218; | |
| do | |
| { | |
| v220 += 1; | |
| *(v221) = (unsigned int)v92; | |
| v221[1 + v30] = (unsigned int)v92; | |
| v221 = (char *)v221 + v219; | |
| } while (v26 != v220); | |
| } | |
| v223 = v30; | |
| v224 = v206 & 340282366920938463463374607431768211455; | |
| v1 = v212; | |
| v225 = v32 - 2; | |
| v13 = v225; | |
| v13 |= v14 & 0xffffffff00000000; | |
| v226 = v225 & 0xfffffff8; | |
| v12 = (v225 >> 3) * 32; | |
| v227 = v9; | |
| v228 = [D] unsupported_<class 'pyvex.expr.Qop'>(); | |
| v19 = 1 - v223; | |
| v5 = v226 + 1; | |
| v229 = v227 + v211; | |
| v4 = v226; | |
| v230 = v204; | |
| v0 = v213; | |
| v231 = v211; | |
| v232 = v223; | |
| do | |
| { | |
| v27 = v232; | |
| v232 += v30; | |
| if (v32 <= 2) | |
| continue; | |
| if (v21 > 2 && !((v236 = v218 + 4, v237 = v227 + 4, !((char)(char)(v236 - v237 <= 32 ^ 1) & (char)(char)(v236 - (v230 + 8) <= 24 ^ 1)) || v236 - (v229 + 8) <= 24))) | |
| { | |
| if (v21 > 6) | |
| { | |
| v17 = v230; | |
| v238 = 0; | |
| do | |
| { | |
| v239 = (*((int256_t *)(v237 + (char *)v238)) + *((int256_t *)(v227 + v238)) + *((int256_t *)(v227 + 8 + v238)) + *((int256_t *)(v230 + 4 + v238)) + *((int256_t *)(v229 + 1 + v238))) * v228; | |
| v240 = (v107 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v107 & 340282366920938463463374607431768211455, (uint128_t)v239)) & 340282366920938463463374607431768211455; | |
| v241 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)v239 >> 32) CONCAT (unsigned int)((unsigned long long)v239 >> 32) CONCAT (unsigned int)((unsigned long long)v239 >> 32) CONCAT (unsigned int)((unsigned long long)v239 >> 32))) & 340282366920938463463374607431768211455; | |
| v242 = (*((int256_t *)(v237 + (char *)v238)) & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v239 >> 96) CONCAT (unsigned int)((uint128_t)v239 >> 96) CONCAT (unsigned int)((uint128_t)v239 >> 96) CONCAT (unsigned int)((uint128_t)v239 >> 96))) & 340282366920938463463374607431768211455; | |
| *((void*)(v236 + (char *)v238)) = v239; | |
| v238 += 32; | |
| v243 = (v240 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v240, (uint128_t)v241)) & 340282366920938463463374607431768211455; | |
| v235 = (v241 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v239 >> 96) CONCAT (unsigned int)((uint128_t)v239 >> 96) CONCAT (unsigned int)((uint128_t)v239 >> 64) CONCAT (unsigned int)((uint128_t)v239 >> 64))) & 340282366920938463463374607431768211455; | |
| v244 = (v239 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v239 / 0x100000000000000000000000000000000 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455; | |
| v245 = (v243 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v243, (uint128_t)v235)) & 340282366920938463463374607431768211455; | |
| v246 = (v245 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v245, (uint128_t)v242)) & 340282366920938463463374607431768211455; | |
| v247 = (v242 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)v244 >> 32) CONCAT (unsigned int)((unsigned long long)v244 >> 32) CONCAT (unsigned int)((unsigned long long)v244 >> 32) CONCAT (unsigned int)((unsigned long long)v244 >> 32))) & 340282366920938463463374607431768211455; | |
| v248 = (v246 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v246, (uint128_t)v244)) & 340282366920938463463374607431768211455; | |
| v249 = (v248 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v248, (uint128_t)v247)) & 340282366920938463463374607431768211455; | |
| v250 = (v247 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v244 >> 96) CONCAT (unsigned int)((uint128_t)v244 >> 96) CONCAT (unsigned int)((uint128_t)v244 >> 64) CONCAT (unsigned int)((uint128_t)v244 >> 64))) & 340282366920938463463374607431768211455; | |
| v234 = (v250 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV(AddV((uint128_t)v249, (uint128_t)v250) & 340282366920938463463374607431768211455, ((unsigned int)((uint128_t)v244 >> 96) CONCAT (unsigned int)((uint128_t)v244 >> 96) CONCAT (unsigned int)((uint128_t)v244 >> 96) CONCAT (unsigned int)((uint128_t)v244 >> 96)) & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455; | |
| } while (v12 != v238); | |
| v230 = v17; | |
| if (!((char)v13 & 7)) | |
| continue; | |
| v251 = v8; | |
| if (v10 - 3 > 2) | |
| { | |
| v252 = v4; | |
| v253 = v5; | |
| } | |
| else | |
| { | |
| v254 = v5; | |
| LABEL_4021cc: | |
| v266 = v254; | |
| v267 = v266 * 4; | |
| v268 = v227 + v267; | |
| v17 = v266 + 1; | |
| v15 = v267 + 4; | |
| v269 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *(v268))) & 340282366920938463463374607431768211455; | |
| v270 = v15 + v227; | |
| v27 = v268; | |
| v271 = (v269 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v269, *(v270))) & 340282366920938463463374607431768211455; | |
| v272 = (v271 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v271, *((int *)((char *)v230 + v267)))) & 340282366920938463463374607431768211455; | |
| v273 = (v272 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v272, *((int *)((char *)v204 + 4 * v232 + 4 * v266)))) & 340282366920938463463374607431768211455; | |
| v217 = (v273 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v273, (uint128_t)v224)) & 340282366920938463463374607431768211455; | |
| v233 = (v233 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v217, v33)) & 340282366920938463463374607431768211455; | |
| *((unsigned int *)((char *)v28 + 4 * v266 + 4 * v35)) = v217; | |
| if (v16 > v254 + 1) | |
| { | |
| v274 = v254 + 2; | |
| v11 = v267 + 8; | |
| v275 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *(v27))) & 340282366920938463463374607431768211455; | |
| v276 = (v275 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v275, *((int *)(v11 + (char *)v227)))) & 340282366920938463463374607431768211455; | |
| v277 = (v276 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v276, *((int *)((char *)v230 + v15)))) & 340282366920938463463374607431768211455; | |
| v278 = (v277 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v277, *((int *)((char *)v204 + 4 * v232 + 4 * v17)))) & 340282366920938463463374607431768211455; | |
| v217 = (v278 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v278, (uint128_t)v224)) & 340282366920938463463374607431768211455; | |
| v234 = (v234 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v217, (uint128_t)v233)) & 340282366920938463463374607431768211455; | |
| *((unsigned int *)((char *)v28 + 4 * v35 + 4 * v17)) = v217; | |
| if (v16 > v274) | |
| { | |
| v279 = v274; | |
| v280 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *(v270))) & 340282366920938463463374607431768211455; | |
| v281 = (v280 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v280, *((int *)(12 + (char *)v227 + v267)))) & 340282366920938463463374607431768211455; | |
| v282 = (v281 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v281, *((int *)((char *)v230 + v11)))) & 340282366920938463463374607431768211455; | |
| v283 = (v282 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v282, *((int *)((char *)v204 + 4 * v232 + 4 * v279)))) & 340282366920938463463374607431768211455; | |
| v217 = (v283 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v283, (uint128_t)v224)) & 340282366920938463463374607431768211455; | |
| v233 = (v233 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v217, (uint128_t)v234)) & 340282366920938463463374607431768211455; | |
| *((unsigned int *)((char *)v28 + 4 * v279 + 4 * v35)) = v217; | |
| goto LABEL_402330; | |
| } | |
| } | |
| } | |
| } | |
| else | |
| { | |
| v251 = v14; | |
| v252 = 0; | |
| v253 = 1; | |
| } | |
| v255 = v27 + v252; | |
| v256 = v255 + 1; | |
| v257 = (v217 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (*((int128_t *)(-4 + (char *)v204 + 4 * v256)) & 340282366920938463463374607431768211455) + *((int128_t *)((char *)v204 + 4 * v256))) & 340282366920938463463374607431768211455; | |
| v258 = (v257 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint128_t)v257 + *((int128_t *)(4 + (char *)v204 + 4 * v256))) & 340282366920938463463374607431768211455; | |
| v259 = (v258 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint128_t)v258 + *((int128_t *)((char *)v204 + 4 * v255 + 4 * v19))) & 340282366920938463463374607431768211455; | |
| v260 = (v259 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint128_t)v259 + *((int128_t *)(4 + (char *)v204 + 4 * v232 + 4 * v252))) & 340282366920938463463374607431768211455; | |
| v261 = (v260 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v260 * ((v198 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)v224 CONCAT (unsigned int)v224 CONCAT (unsigned int)v224 CONCAT (unsigned int)v224)) & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455; | |
| v262 = (v107 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v107 & 340282366920938463463374607431768211455, (uint128_t)v261)) & 340282366920938463463374607431768211455; | |
| v263 = (v234 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)v261 >> 32) CONCAT (unsigned int)((unsigned long long)v261 >> 32) CONCAT (unsigned int)((unsigned long long)v261 >> 32) CONCAT (unsigned int)((unsigned long long)v261 >> 32))) & 340282366920938463463374607431768211455; | |
| *((uint128_t *)(4 + (char *)v28 + 4 * v35 + 4 * v252)) = v261; | |
| v254 = (v251 & 0xfffffffc) + v253; | |
| v264 = (v262 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v262, (uint128_t)v263)) & 340282366920938463463374607431768211455; | |
| v234 = (v263 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v261 >> 96) CONCAT (unsigned int)((uint128_t)v261 >> 96) CONCAT (unsigned int)((uint128_t)v261 >> 64) CONCAT (unsigned int)((uint128_t)v261 >> 64))) & 340282366920938463463374607431768211455; | |
| v217 = (v261 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v261 >> 96) CONCAT (unsigned int)((uint128_t)v261 >> 96) CONCAT (unsigned int)((uint128_t)v261 >> 96) CONCAT (unsigned int)((uint128_t)v261 >> 96))) & 340282366920938463463374607431768211455; | |
| v265 = (v264 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v264, (uint128_t)v234)) & 340282366920938463463374607431768211455; | |
| v233 = (v265 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v265, (uint128_t)v217)) & 340282366920938463463374607431768211455; | |
| if (!(v251 & 3)) | |
| continue; | |
| goto LABEL_4021cc; | |
| } | |
| else | |
| { | |
| v284 = 1; | |
| do | |
| { | |
| v285 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *((int *)(-4 + (char *)v227 + 4 * v284)))) & 340282366920938463463374607431768211455; | |
| v286 = (v285 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v285, *((int *)(4 + (char *)v227 + 4 * v284)))) & 340282366920938463463374607431768211455; | |
| v287 = (v286 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v286, *((int *)((char *)v230 + 4 * v284)))) & 340282366920938463463374607431768211455; | |
| v288 = (v287 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v287, v229[v284])) & 340282366920938463463374607431768211455; | |
| v217 = (v288 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v288, (uint128_t)v224)) & 340282366920938463463374607431768211455; | |
| v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v217, v33)) & 340282366920938463463374607431768211455; | |
| v218[v284] = v217; | |
| v284 += 1; | |
| } while (v284 != v32 - 1); | |
| } | |
| LABEL_402330: | |
| v227 += v231; | |
| v230 += v231; | |
| v229 = (char *)v229 + v231; | |
| v218 = (char *)v218 + v23; | |
| } while (v26 != v29); | |
| v211 = v231; | |
| v212 = v1; | |
| v213 = v0; | |
| v202 = v217 & 340282366920938463463374607431768211455; | |
| v336 = v224 & 340282366920938463463374607431768211455; | |
| v233 &= 340282366920938463463374607431768211455; | |
| v234 &= 340282366920938463463374607431768211455; | |
| v235 &= 340282366920938463463374607431768211455; | |
| v301 &= 340282366920938463463374607431768211455; | |
| } | |
| v18.allocator<char> const&) (.constprop.0)("blur_avx"); | |
| v24.ScopedTimer(v18, &v43); | |
| v18._M_dispose(); | |
| v27 = *((long long *)&v71); | |
| memcpy(*((long long *)&v71), v204, v211); | |
| v289 = (int)v70; | |
| v290 = v289 * 4; | |
| v8 = v289; | |
| memcpy(v27 + v290 * v212, v213, v211); | |
| if (v26 > 1) | |
| { | |
| v291 = v36; | |
| if (v291 == 1 && !((v292 = v27 + 4, v32 != 1))) | |
| { | |
| do | |
| { | |
| v291 += 1; | |
| v292 += 4; | |
| *((unsigned int *)(v292 - 4)) = (unsigned int)v92; | |
| *((unsigned int *)(v292 - 4)) = (unsigned int)v92; | |
| } while (v26 != v291); | |
| v9 = v204 + v211; | |
| } | |
| else | |
| { | |
| v294 = 1; | |
| v9 = v204 + v211; | |
| v295 = v27 + v290; | |
| do | |
| { | |
| v294 += 1; | |
| *(v295) = (unsigned int)v92; | |
| v295[1 + v30] = (unsigned int)v92; | |
| v295 = (char *)v295 + v290; | |
| } while (v26 != v294); | |
| } | |
| v29 = 0; | |
| v296 = v96 & 340282366920938463463374607431768211455; | |
| v297 = v30; | |
| v298 = [D] unsupported_<class 'pyvex.expr.Qop'>(); | |
| v299 = (v233 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)v296 CONCAT (unsigned int)v296 CONCAT (unsigned int)v296 CONCAT (unsigned int)v296)) & 340282366920938463463374607431768211455; | |
| v300 = (v234 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | 4489188110467124429) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 & 340282366920938463463374607431768211455; | |
| v28 = v8; | |
| do | |
| { | |
| v302 = v297; | |
| v303 = v302 * 4; | |
| v304 = v204 + v303; | |
| v305 = v302 + v30; | |
| v306 = v204 + v29 * 4; | |
| v307 = v303 + v9; | |
| v308 = v27 + v28 * 4; | |
| if (v32 > 8) | |
| { | |
| v309 = 9; | |
| while (true) | |
| { | |
| v301 = *((int256_t *)&v304[8 + v309]); | |
| v310 = v309 + 8; | |
| v311 = (v301 + *((int256_t *)&v304[9 + v309]) + *((int256_t *)&v304[7 + v309]) + *((int256_t *)&v306[8 + v309]) + *((int256_t *)&v307[8 + v309])) * v298; | |
| v312 = (v336 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v311 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455; | |
| *((void*)&v308[8 + v309]) = v311; | |
| v313 = (v311 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v311 / 0x100000000000000000000000000000000 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455; | |
| v314 = (v313 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v312 + v313) & 340282366920938463463374607431768211455; | |
| v315 = (v312 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v314 >> 96) CONCAT (unsigned int)((uint128_t)v314 >> 96) CONCAT (unsigned int)((unsigned long long)v314 >> 32) CONCAT (unsigned int)((unsigned long long)v314 >> 32))) & 340282366920938463463374607431768211455; | |
| v316 = (v314 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v314 + v315) & 340282366920938463463374607431768211455; | |
| v336 = (v315 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned long long)(v315 >> 64) CONCAT (unsigned long long)(v316 >> 64))) & 340282366920938463463374607431768211455; | |
| v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV(AddV((uint128_t)v316, (uint128_t)v336) & 340282366920938463463374607431768211455, v37)) & 340282366920938463463374607431768211455; | |
| if (v310 == (v32 - 9 >> 3) * 8 + 17) | |
| break; | |
| v309 = v310; | |
| } | |
| v317 = v309 & 4294967295; | |
| } | |
| else | |
| { | |
| v317 = 1; | |
| } | |
| if ((unsigned int)v317 >= v20) | |
| continue; | |
| v318 = v317 & 4294967295; | |
| v319 = v32 - v317; | |
| if (v319 != 2) | |
| { | |
| v13 = v302 + v318; | |
| v320 = v13 * 4; | |
| v15 = v318 + v28; | |
| v321 = v27 + v15 * 4; | |
| v16 = v204 + v320; | |
| v12 = v318 + v29; | |
| v11 = v305 + v318; | |
| v25 = v11 * 4 + 4; | |
| v17 = v12 * 4 + 4; | |
| if (!((v321 - (v204 + v25) <= 8 ^ 1) & v21) || v321 - v16 <= 16) | |
| goto LABEL_404430; | |
| v322 = v319 - 1; | |
| if (v319 - 2 > 2) | |
| { | |
| v323 = (v293 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | *(v16)) & 340282366920938463463374607431768211455; | |
| v324 = (v323 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint128_t)v323 + *((int128_t *)(-4 + (char *)v204 + v320))) & 340282366920938463463374607431768211455; | |
| v325 = (v324 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint128_t)v324 + *((int128_t *)(4 + (char *)v204 + v320))) & 340282366920938463463374607431768211455; | |
| v326 = (v325 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint128_t)v325 + *((int128_t *)(-4 + (char *)v204 + v17))) & 340282366920938463463374607431768211455; | |
| v327 = (v326 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint128_t)v326 + *((int128_t *)(-4 + (char *)v204 + v25))) & 340282366920938463463374607431768211455; | |
| v328 = (v327 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v327 * v299) & 340282366920938463463374607431768211455; | |
| v329 = (v206 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v206 & 340282366920938463463374607431768211455, (uint128_t)v328)) & 340282366920938463463374607431768211455; | |
| v330 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)v328 >> 32) CONCAT (unsigned int)((unsigned long long)v328 >> 32) CONCAT (unsigned int)((unsigned long long)v328 >> 32) CONCAT (unsigned int)((unsigned long long)v328 >> 32))) & 340282366920938463463374607431768211455; | |
| *(v321) = v328; | |
| v331 = (v329 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v329, (uint128_t)v330)) & 340282366920938463463374607431768211455; | |
| v332 = (v330 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v328 >> 96) CONCAT (unsigned int)((uint128_t)v328 >> 96) CONCAT (unsigned int)((uint128_t)v328 >> 64) CONCAT (unsigned int)((uint128_t)v328 >> 64))) & 340282366920938463463374607431768211455; | |
| v293 = (v328 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v328 >> 96) CONCAT (unsigned int)((uint128_t)v328 >> 96) CONCAT (unsigned int)((uint128_t)v328 >> 96) CONCAT (unsigned int)((uint128_t)v328 >> 96))) & 340282366920938463463374607431768211455; | |
| v336 = (v331 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v331, (uint128_t)v332)) & 340282366920938463463374607431768211455; | |
| v235 = (v332 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v336, (uint128_t)v293)) & 340282366920938463463374607431768211455; | |
| if (!((char)v322 & 3)) | |
| continue; | |
| v333 = v322 & 0xfffffffc; | |
| v334 = (unsigned int)(v319 - v333); | |
| v317 += v333; | |
| v335 = v334 - 1; | |
| if (v334 == 2) | |
| goto LABEL_4027ea; | |
| } | |
| else | |
| { | |
| v335 = v322; | |
| v333 = 0; | |
| } | |
| v337 = v13 + v333; | |
| v338 = v337 * 4; | |
| v339 = (v336 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | *((long long *)((char *)v204 + 4 * v337))) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 & 340282366920938463463374607431768211455; | |
| v340 = (v293 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | *((long long *)(-4 + (char *)v204 + v338))) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 & 340282366920938463463374607431768211455; | |
| v341 = (v339 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | *((long long *)(4 + (char *)v204 + v338))) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 & 340282366920938463463374607431768211455; | |
| v342 = ((v340 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v340 + v339) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((v340 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v340 + v339) & 340282366920938463463374607431768211455) + v341) & 340282366920938463463374607431768211455; | |
| v343 = (v341 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | *((long long *)((char *)v204 + 4 * v12 + 0x4 * v333))) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 & 340282366920938463463374607431768211455; | |
| v344 = ((v342 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v342 + v343) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((uint128_t)v342 + (uint128_t)v343 & 340282366920938463463374607431768211455) + (((uint128_t)v343 & 0xffffffffffffffff0000000000000000 | *((long long *)((char *)v204 + 4 * v11 + 0x4 * v333))) & 18446744073709551615 & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455; | |
| v345 = (v344 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v344 * v300) & 340282366920938463463374607431768211455; | |
| v336 = (v206 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v206 & 340282366920938463463374607431768211455, (uint128_t)v345)) & 340282366920938463463374607431768211455; | |
| *((unsigned long long *)((char *)v27 + 4 * v15 + 0x4 * v333)) = v345; | |
| v293 = (v345 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v345 >> 96) CONCAT (unsigned int)((uint128_t)v345 >> 96) CONCAT (unsigned int)((unsigned long long)v345 >> 32) CONCAT (unsigned int)((unsigned long long)v345 >> 32))) & 340282366920938463463374607431768211455; | |
| v301 = (v301 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v293, (uint128_t)v336)) & 340282366920938463463374607431768211455; | |
| if (!((char)v335 & 1)) | |
| continue; | |
| v317 += v335 & 4294967294; | |
| LABEL_4027ea: | |
| v346 = v317 & 4294967295; | |
| v347 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, v304[v346])) & 340282366920938463463374607431768211455; | |
| v348 = (v347 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v347, v304[1 + v346])) & 340282366920938463463374607431768211455; | |
| v349 = (v348 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v348, v306[v346])) & 340282366920938463463374607431768211455; | |
| v350 = (v349 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v349, v307[v346])) & 340282366920938463463374607431768211455; | |
| v293 = (v350 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v350, (uint128_t)v296)) & 340282366920938463463374607431768211455; | |
| v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v293, v37)) & 340282366920938463463374607431768211455; | |
| v308[v346] = v293; | |
| continue; | |
| } | |
| else | |
| { | |
| LABEL_404430: | |
| v351 = v318 * 4; | |
| v352 = v304 + v351 + 4; | |
| v353 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *((int *)(v304 + v351)))) & 340282366920938463463374607431768211455; | |
| v354 = (v353 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v353, *((int *)v352))) & 340282366920938463463374607431768211455; | |
| v355 = (v354 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v354, v306[v318])) & 340282366920938463463374607431768211455; | |
| v356 = (v355 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v355, v307[v318])) & 340282366920938463463374607431768211455; | |
| v293 = (v356 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v356, (uint128_t)v296)) & 340282366920938463463374607431768211455; | |
| v301 = (v301 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v293, v37)) & 340282366920938463463374607431768211455; | |
| v308[v318] = v293; | |
| if (v20 > (unsigned int)v317 + 1) | |
| { | |
| v357 = v304 + v351 + 8; | |
| v358 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *((int *)v352))) & 340282366920938463463374607431768211455; | |
| v359 = (v358 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v358, *((int *)v357))) & 340282366920938463463374607431768211455; | |
| v360 = (v359 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v359, *((int *)(v306 + v351 + 4)))) & 340282366920938463463374607431768211455; | |
| v361 = (v360 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v360, *((int *)(v307 + v351 + 4)))) & 340282366920938463463374607431768211455; | |
| v293 = (v361 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v361, (uint128_t)v296)) & 340282366920938463463374607431768211455; | |
| v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v293, (uint128_t)v301)) & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(v308 + v351 + 4)) = v293; | |
| if ((unsigned int)v317 + 2 < v20) | |
| { | |
| v362 = v304 + v351 + 12; | |
| v363 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *((int *)v357))) & 340282366920938463463374607431768211455; | |
| v364 = (v363 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v363, *((int *)v362))) & 340282366920938463463374607431768211455; | |
| v365 = (v364 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v364, *((int *)(v306 + v351 + 8)))) & 340282366920938463463374607431768211455; | |
| v366 = (v365 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v365, *((int *)(v307 + v351 + 8)))) & 340282366920938463463374607431768211455; | |
| v293 = (v366 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v366, (uint128_t)v296)) & 340282366920938463463374607431768211455; | |
| v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v293, (uint128_t)v235)) & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(v308 + v351 + 8)) = v293; | |
| if (v20 > (unsigned int)v317 + 3) | |
| { | |
| v367 = v304 + v351 + 16; | |
| v368 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *((int *)v362))) & 340282366920938463463374607431768211455; | |
| v369 = (v368 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v368, *((int *)v367))) & 340282366920938463463374607431768211455; | |
| v370 = (v369 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v369, *((int *)(v306 + v351 + 12)))) & 340282366920938463463374607431768211455; | |
| v371 = (v370 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v370, *((int *)(v307 + v351 + 12)))) & 340282366920938463463374607431768211455; | |
| v293 = (v371 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v371, (uint128_t)v296)) & 340282366920938463463374607431768211455; | |
| v301 = (v301 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v293, (uint128_t)v235)) & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(v308 + v351 + 12)) = v293; | |
| if (v20 > (unsigned int)v317 + 4) | |
| { | |
| v372 = v304 + v351 + 20; | |
| v373 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *((int *)v367))) & 340282366920938463463374607431768211455; | |
| v374 = (v373 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v373, *((int *)v372))) & 340282366920938463463374607431768211455; | |
| v375 = (v374 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v374, *((int *)(v306 + v351 + 16)))) & 340282366920938463463374607431768211455; | |
| v376 = (v375 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v375, *((int *)(v307 + v351 + 16)))) & 340282366920938463463374607431768211455; | |
| v293 = (v376 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v376, (uint128_t)v296)) & 340282366920938463463374607431768211455; | |
| v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v293, (uint128_t)v301)) & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(v308 + v351 + 16)) = v293; | |
| if (v20 > (unsigned int)v317 + 5) | |
| { | |
| v377 = v304 + v351 + 24; | |
| v378 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *((int *)v372))) & 340282366920938463463374607431768211455; | |
| v379 = (v378 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v378, *((int *)v377))) & 340282366920938463463374607431768211455; | |
| v380 = (v379 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v379, *((int *)(v306 + v351 + 20)))) & 340282366920938463463374607431768211455; | |
| v381 = (v380 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v380, *((int *)(v307 + v351 + 20)))) & 340282366920938463463374607431768211455; | |
| v293 = (v381 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v381, (uint128_t)v296)) & 340282366920938463463374607431768211455; | |
| v301 = (v301 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v293, (uint128_t)v235)) & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(v308 + v351 + 20)) = v293; | |
| if (v20 > (unsigned int)v317 + 6) | |
| { | |
| v382 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *((int *)v377))) & 340282366920938463463374607431768211455; | |
| v383 = (v382 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v382, *((int *)(v304 + v351 + 28)))) & 340282366920938463463374607431768211455; | |
| v384 = (v383 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v383, *((int *)(v306 + v351 + 24)))) & 340282366920938463463374607431768211455; | |
| v385 = (v384 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v384, *((int *)(v307 + v351 + 24)))) & 340282366920938463463374607431768211455; | |
| v293 = (v385 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v385, (uint128_t)v296)) & 340282366920938463463374607431768211455; | |
| v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v293, (uint128_t)v301)) & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(v308 + v351 + 24)) = v293; | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| v28 += v8; | |
| v29 += v30; | |
| v386 = v22; | |
| v297 = v305; | |
| } while (v26 != (unsigned int)v386); | |
| v202 = v293 & 340282366920938463463374607431768211455; | |
| v336 &= 340282366920938463463374607431768211455; | |
| v233 = v299 & 340282366920938463463374607431768211455; | |
| v234 = v300 & 340282366920938463463374607431768211455; | |
| v235 &= 340282366920938463463374607431768211455; | |
| } | |
| g_408040.char_traits<char>>("Blur scalar: checksum=", 0x16, v386); | |
| v387 = v202 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v33; | |
| v388 = v387 >> 64; | |
| v389 = g_408040._M_insert<double>((unsigned long long)v387 | v388 * 0); | |
| v389.char_traits<char>>(" time=", 0x7, v181); | |
| v389._M_insert<double>(v42).char_traits<char>>&, char const*) (.isra.0)(" ms\n"); | |
| g_408040.char_traits<char>>("Blur AVX : checksum=", 0x16, v181); | |
| v391 = ((v387 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | v388 * 0x10000000000000000) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | (uint256_t)v42) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v37; | |
| v392 = v391 >> 64; | |
| v393 = g_408040._M_insert<double>((unsigned long long)v391 | v392 * 0); | |
| v393.char_traits<char>>(" time=", 0x7, v181); | |
| v393._M_insert<double>(v43).char_traits<char>>&, char const*) (.isra.0)(" ms\n"); | |
| g_408040.char_traits<char>>("Checksum delta (AVX - scalar): ", 0x1f, v181); | |
| v395 = (((v391 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | v392 * 0x10000000000000000) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | (uint256_t)v43) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(SubV(v96 & 340282366920938463463374607431768211455, v33))) & 340282366920938463463374607431768211455; | |
| v396 = ((v395 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v395 & 18446744073709551615) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | ((v395 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v395 & 18446744073709551615) >> 64 & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455; | |
| g_408040._M_insert<double>((unsigned long long)v396).char_traits<char>>&, char const*) (.isra.0)("\n"); | |
| g_408040.char_traits<char>>&, char const*) (.isra.0)("--------------------------------------------------------\n\n"); | |
| v72.ComplexSoA(0x40000); | |
| v75.ComplexSoA(0x40000); | |
| v21 = &v77; | |
| v77.ComplexSoA(0); | |
| v19 = &v79; | |
| v79.ComplexSoA(0); | |
| v22 = &v81; | |
| v81.ComplexSoA(0); | |
| v17 = &v83; | |
| v83.ComplexSoA(0); | |
| fill_complex(&v72, 305441741); | |
| fill_complex(&v75, 2557935324); | |
| v58.allocator<float> const&) (.constprop.0)(16); | |
| v398 = 0; | |
| v399 = v336 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455; | |
| while (true) | |
| { | |
| v400 = (v396 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | SubV((uint128_t)v399 & 340282366920938463463374607431768211455, 0x40f00000)) & 340282366920938463463374607431768211455; | |
| v401 = (v400 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v400, 0x3e000000)) & 340282366920938463463374607431768211455; | |
| v402 = v105 & 340282366920938463463374607431768211455; | |
| v403 = (v401 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV(((uint128_t)v401 ^ 0x80000000) & 340282366920938463463374607431768211455, (uint128_t)v401)) & 340282366920938463463374607431768211455; | |
| expf((unsigned int)v403); | |
| v404 = (v403 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v403, v34)) & 340282366920938463463374607431768211455; | |
| *((unsigned int *)((char *)v58 + 0x4 * v398)) = v404; | |
| v398 += 1; | |
| if (v398 == 16) | |
| break; | |
| v405 = (v404 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(AddV((v206 & 18446744073709551615 | ((uint128_t)((v96 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455) >> 64) & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455, 0x3f000000))) & 340282366920938463463374607431768211455; | |
| v406 = (v405 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v405, 1078530011)) & 340282366920938463463374607431768211455; | |
| v396 = (v406 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v406, 0x3d800000)) & 340282366920938463463374607431768211455; | |
| cosf((unsigned int)v396); | |
| v399 = v206 & 340282366920938463463374607431768211455; | |
| } | |
| v27 = v58; | |
| g_408040.char_traits<char>>&, char const*) (.isra.0)("=== Workload 3: Complex multiply + FIR convolution ===\n"); | |
| v44 = 0; | |
| v45 = 0; | |
| v18.allocator<char> const&) (.constprop.0)("complex_mul_scalar"); | |
| v24.ScopedTimer(v18, &v44); | |
| v18._M_dispose(); | |
| v407 = v72; | |
| v11 = *((long long *)&v73); | |
| v408 = *((long long *)&v73) - v407; | |
| v33 = v408; | |
| v409 = v408 >> 2; | |
| v21.resize(v409); | |
| v78.resize(v409); | |
| if (v408) | |
| { | |
| v33 = v408; | |
| v410 = *((long long *)&v77); | |
| v411 = *((long long *)&v78); | |
| if (...) | |
| { | |
| v413 = (v33 ? v409 : 1); | |
| if (v33 > 28) | |
| { | |
| v414 = 0; | |
| v415 = [D] unsupported_<class 'pyvex.expr.Qop'>(); | |
| do | |
| { | |
| v416 = *((int256_t *)(*((long long *)&v74) + (char *)v414)); | |
| v417 = *((int256_t *)(v407 + v414)); | |
| *((void*)(v410 + v414)) = v206; | |
| *((void*)(v411 + v414)) = v92; | |
| v414 += 32; | |
| v418 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, (uint128_t)v206)) & 340282366920938463463374607431768211455; | |
| v419 = (v417 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)v206 >> 32) CONCAT (unsigned int)((unsigned long long)v206 >> 32) CONCAT (unsigned int)((unsigned long long)v206 >> 32) CONCAT (unsigned int)((unsigned long long)v206 >> 32))) & 340282366920938463463374607431768211455; | |
| v420 = (v416 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v206 >> 96) CONCAT (unsigned int)((uint128_t)v206 >> 96) CONCAT (unsigned int)((uint128_t)v206 >> 96) CONCAT (unsigned int)((uint128_t)v206 >> 96))) & 340282366920938463463374607431768211455; | |
| v421 = (v418 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v418, (uint128_t)v419)) & 340282366920938463463374607431768211455; | |
| v424 = (v421 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v421, ((unsigned int)((uint128_t)v206 >> 96) CONCAT (unsigned int)((uint128_t)v206 >> 96) CONCAT (unsigned int)((uint128_t)v206 >> 64) CONCAT (unsigned int)((uint128_t)v206 >> 64)) & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455; | |
| v425 = (v424 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v424, (uint128_t)v420)) & 340282366920938463463374607431768211455; | |
| v426 = (v420 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)(v206 / 0x100000000000000000000000000000000) >> 32) CONCAT (unsigned int)((unsigned long long)(v206 / 0x100000000000000000000000000000000) >> 32) CONCAT (unsigned int)((unsigned long long)(v206 / 0x100000000000000000000000000000000) >> 32) CONCAT (unsigned int)((unsigned long long)(v206 / 0x100000000000000000000000000000000) >> 32))) & 340282366920938463463374607431768211455; | |
| } while ((v427 = (v425 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(AddV((uint128_t)v425, (uint128_t)v423))) & 340282366920938463463374607431768211455, v428 = (v427 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(AddV((uint128_t)v427, (uint128_t)v426))) & 340282366920938463463374607431768211455, v429 = (v426 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)((unsigned int)((uint128_t)v423 >> 96) CONCAT (unsigned int)((uint128_t)v423 >> 96) CONCAT (unsigned int)((uint128_t)v423 >> 64) CONCAT (unsigned int)((uint128_t)v423 >> 64))) & 340282366920938463463374607431768211455, v430 = (v428 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(AddV((uint128_t)v428, (uint128_t)v429))) & 340282366920938463463374607431768211455, v431 = (v429 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(AddV((uint128_t)v430, ((unsigned int)((uint128_t)v423 >> 96) CONCAT (unsigned int)((uint128_t)v423 >> 96) CONCAT (unsigned int)((uint128_t)v423 >> 96) CONCAT (unsigned int)((uint128_t)v423 >> 96)) & 340282366920938463463374607431768211455))) & 340282366920938463463374607431768211455, v414 != (v413 >> 3) * 32)); | |
| v432 = v413 & 18446744073709551608; | |
| if (((char)v413 & 7)) | |
| { | |
| v402 = v415 & 340282366920938463463374607431768211455; | |
| v233 = v431 & 340282366920938463463374607431768211455; | |
| v234 = v422 & 340282366920938463463374607431768211455; | |
| v235 &= 340282366920938463463374607431768211455; | |
| LABEL_402e23: | |
| v433 = v413 - v432; | |
| if (v433 - 1 > 2) | |
| { | |
| v434 = v432 * 4; | |
| *((void*)(v410 + v434)) = v92 & 340282366920938463463374607431768211455; | |
| *((void*)(v411 + v434)) = v206 & 340282366920938463463374607431768211455; | |
| v432 += v433 & 18446744073709551612; | |
| v435 = v92 & 340282366920938463463374607431768211455; | |
| v402 = ((((v402 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (0x3e800000 CONCAT 0x3e800000 CONCAT 0x3e800000 CONCAT 0x3e800000)) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (0x3f000000 CONCAT 0x3f000000 CONCAT 0x3f000000 CONCAT 0x3f000000)) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)v435 >> 32) CONCAT (unsigned int)((unsigned long long)v435 >> 32) CONCAT (unsigned int)((unsigned long long)v435 >> 32) CONCAT (unsigned int)((unsigned long long)v435 >> 32))) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v435 >> 96) CONCAT (unsigned int)((uint128_t)v435 >> 96) CONCAT (unsigned int)((uint128_t)v435 >> 64) CONCAT (unsigned int)((uint128_t)v435 >> 64))) & 340282366920938463463374607431768211455; | |
| v404 = (v435 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v435 >> 96) CONCAT (unsigned int)((uint128_t)v435 >> 96) CONCAT (unsigned int)((uint128_t)v435 >> 96) CONCAT (unsigned int)((uint128_t)v435 >> 96))) & 340282366920938463463374607431768211455; | |
| if (!((char)v433 & 3)) | |
| goto LABEL_40308b; | |
| } | |
| v233 = v107 & 340282366920938463463374607431768211455; | |
| v436 = v432 * 4; | |
| v402 = (v105 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v105 & 340282366920938463463374607431768211455, (uint128_t)v233)) & 340282366920938463463374607431768211455; | |
| *((unsigned int *)((char *)v410 + 4 * v432)) = (unsigned int)v206; | |
| v437 = v92 & 340282366920938463463374607431768211455; | |
| v438 = v96 & 340282366920938463463374607431768211455; | |
| *((unsigned int *)((char *)v411 + 4 * v432)) = v437; | |
| v404 = (v437 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v437, v438)) & 340282366920938463463374607431768211455; | |
| v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v206 & 340282366920938463463374607431768211455, v30)) & 340282366920938463463374607431768211455; | |
| if (v432 + 1 < v409) | |
| { | |
| v234 = v109 & 340282366920938463463374607431768211455; | |
| v233 = v107 & 340282366920938463463374607431768211455; | |
| v402 = (v105 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v105 & 340282366920938463463374607431768211455, (uint128_t)v234)) & 340282366920938463463374607431768211455; | |
| v439 = v92 & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(4 + (char *)v410 + v436)) = (unsigned int)v206; | |
| *((unsigned int *)(4 + (char *)v411 + v436)) = v439; | |
| v404 = (v439 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v439, v438)) & 340282366920938463463374607431768211455; | |
| v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v206 & 340282366920938463463374607431768211455, (uint128_t)v235)) & 340282366920938463463374607431768211455; | |
| if (v432 + 2 < v409) | |
| { | |
| v234 = v109 & 340282366920938463463374607431768211455; | |
| v233 = v107 & 340282366920938463463374607431768211455; | |
| v402 = (v105 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v105 & 340282366920938463463374607431768211455, (uint128_t)v234)) & 340282366920938463463374607431768211455; | |
| v440 = v92 & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(8 + (char *)v410 + v436)) = (unsigned int)v206; | |
| *((unsigned int *)(8 + (char *)v411 + v436)) = v440; | |
| v404 = (v440 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v440, v438)) & 340282366920938463463374607431768211455; | |
| } | |
| } | |
| } | |
| else | |
| { | |
| v404 = v430 & 340282366920938463463374607431768211455; | |
| v402 = v415 & 340282366920938463463374607431768211455; | |
| v233 = v431 & 340282366920938463463374607431768211455; | |
| v234 = v422 & 340282366920938463463374607431768211455; | |
| v235 &= 340282366920938463463374607431768211455; | |
| } | |
| } | |
| else | |
| { | |
| v432 = 0; | |
| goto LABEL_402e23; | |
| } | |
| } | |
| else | |
| { | |
| v234 = v109 & 340282366920938463463374607431768211455; | |
| v441 = 0; | |
| do | |
| { | |
| v402 = (v105 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v105 & 340282366920938463463374607431768211455, (uint128_t)v107 & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455; | |
| v442 = v92 & 340282366920938463463374607431768211455; | |
| *((unsigned int *)((char *)v410 + 0x4 * v441)) = (unsigned int)v206; | |
| *((unsigned int *)((char *)v411 + 0x4 * v441)) = v442; | |
| v404 = (v442 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v442, (uint128_t)v234)) & 340282366920938463463374607431768211455; | |
| v441 += 1; | |
| } while (v441 < v409); | |
| } | |
| } | |
| LABEL_40308b: | |
| v18.allocator<char> const&) (.constprop.0)("complex_mul_avx"); | |
| v24.ScopedTimer(v18, &v45); | |
| v18._M_dispose(); | |
| v19.resize(v409); | |
| v80.resize(v409); | |
| if (v33 > 28) | |
| { | |
| v443 = 8; | |
| v444 = *((long long *)&v80); | |
| v402 = [D] unsupported_<class 'pyvex.expr.Qop'>(); | |
| while (true) | |
| { | |
| v445 = *((int256_t *)(*((long long *)&v74) + v443 * 4 - 32)); | |
| v446 = v443 + 8; | |
| v233 = *((int256_t *)(-32 + (char *)v407 + 4 * v443)); | |
| *((void*)(*((long long *)&v79) + v443 * 4 - 32)) = v92; | |
| *((void*)(v444 + v443 * 4 - 32)) = v206; | |
| v447 = (v92 * v402 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v206 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455; | |
| v448 = (v206 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v206 / 0x100000000000000000000000000000000 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455; | |
| v449 = (v447 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v447 + v448) & 340282366920938463463374607431768211455; | |
| v450 = (v448 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v449 >> 96) CONCAT (unsigned int)((uint128_t)v449 >> 96) CONCAT (unsigned int)((unsigned long long)v449 >> 32) CONCAT (unsigned int)((unsigned long long)v449 >> 32))) & 340282366920938463463374607431768211455; | |
| v234 = (v445 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV(AddV((uint128_t)v449 + (uint128_t)v450 & 340282366920938463463374607431768211455, ((unsigned long long)(v450 >> 64) CONCAT (unsigned long long)(((v449 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v449 + v450) & 340282366920938463463374607431768211455) >> 64)) & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455, v34)) & 340282366920938463463374607431768211455; | |
| if (v409 < v446) | |
| break; | |
| v443 = v446; | |
| } | |
| } | |
| else | |
| { | |
| v443 = 0; | |
| } | |
| if (v443 < v409) | |
| { | |
| v451 = *((long long *)&v74); | |
| v452 = v443 * 4; | |
| v453 = *((long long *)&v75); | |
| v31 = *((long long *)&v80); | |
| v454 = *((long long *)&v79); | |
| v16 = v409 - v443; | |
| v8 = v16 - 1; | |
| if (v8 > 2) | |
| { | |
| v452 = v443 * 4; | |
| v455 = v452 + 4; | |
| v28 = v454 + v452; | |
| v29 = v31 + v452; | |
| v9 = *((long long *)&v76) + v455; | |
| if (!(((char *)v29 - v9 <= 24 ^ 1) & (char *)v28 - v9 > 24 & v28 - ((char *)v453 + v455) > 24 & v28 - (v451 + v455) > 24 & v29 - (v407 + v455) > 24 & v28 - (v407 + v455) > 24 & v29 - (v451 + v455) > 24 & v29 - ((char *)v453 + v455) > 24) || v29 - (v455 + v454) <= 24) | |
| goto LABEL_404661; | |
| if (v8 > 6) | |
| { | |
| v233 = *((int256_t *)(v451 + v452)); | |
| v456 = v16; | |
| v457 = v456 & 18446744073709551608; | |
| v458 = v443 + v457; | |
| v443 = v458; | |
| *((void*)v28) = v92; | |
| v459 = [D] unsupported_<class 'pyvex.expr.Qop'>(); | |
| *((void*)v29) = v206; | |
| v460 = (v105 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v105 & 340282366920938463463374607431768211455, (uint128_t)v92)) & 340282366920938463463374607431768211455; | |
| v461 = (v206 * v459 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v92 >> 96) CONCAT (unsigned int)((uint128_t)v92 >> 96) CONCAT (unsigned int)((uint128_t)v92 >> 96) CONCAT (unsigned int)((uint128_t)v92 >> 96))) & 340282366920938463463374607431768211455; | |
| v462 = (v460 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v92 >> 96) CONCAT (unsigned int)((uint128_t)v92 >> 96) CONCAT (unsigned int)((uint128_t)v92 >> 64) CONCAT (unsigned int)((uint128_t)v92 >> 64))) & 340282366920938463463374607431768211455; | |
| v463 = (v92 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v92 / 0x100000000000000000000000000000000 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455; | |
| v464 = (v462 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v462, AddV(((unsigned int)((unsigned long long)v92 >> 32) CONCAT (unsigned int)((unsigned long long)v92 >> 32) CONCAT (unsigned int)((unsigned long long)v92 >> 32) CONCAT (unsigned int)((unsigned long long)v92 >> 32)) & 340282366920938463463374607431768211455, (uint128_t)v460) & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455; | |
| v465 = (v461 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v461, (uint128_t)v464)) & 340282366920938463463374607431768211455; | |
| v466 = (v464 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)v463 >> 32) CONCAT (unsigned int)((unsigned long long)v463 >> 32) CONCAT (unsigned int)((unsigned long long)v463 >> 32) CONCAT (unsigned int)((unsigned long long)v463 >> 32))) & 340282366920938463463374607431768211455; | |
| v402 = (v466 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v466, AddV((uint128_t)v463, (uint128_t)v465) & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455; | |
| v404 = (v463 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v463 >> 96) CONCAT (unsigned int)((uint128_t)v463 >> 96) CONCAT (unsigned int)((uint128_t)v463 >> 96) CONCAT (unsigned int)((uint128_t)v463 >> 96))) & 340282366920938463463374607431768211455; | |
| if (!((char)v456 & 7)) | |
| goto LABEL_403608; | |
| v16 = v456 - v457; | |
| if (v16 - 1 <= 2) | |
| goto LABEL_4034e4; | |
| } | |
| else | |
| { | |
| v458 = v443; | |
| } | |
| v233 = (v233 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | *((int128_t *)((char *)v451 + 4 * v458))) & 340282366920938463463374607431768211455; | |
| v467 = v458 * 4; | |
| v468 = (v402 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | *((int128_t *)((char *)v453 + 4 * v458))) & 340282366920938463463374607431768211455; | |
| *((void*)(v454 + v467)) = v92 & 340282366920938463463374607431768211455; | |
| *((void*)(v31 + v467)) = v206 & 340282366920938463463374607431768211455; | |
| v443 += v16 & 18446744073709551612; | |
| v469 = v92 & 340282366920938463463374607431768211455; | |
| v402 = (((((v468 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v468 * v233) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (0x3e800000 CONCAT 0x3e800000 CONCAT 0x3e800000 CONCAT 0x3e800000)) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (0x3f000000 CONCAT 0x3f000000 CONCAT 0x3f000000 CONCAT 0x3f000000)) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)v469 >> 32) CONCAT (unsigned int)((unsigned long long)v469 >> 32) CONCAT (unsigned int)((unsigned long long)v469 >> 32) CONCAT (unsigned int)((unsigned long long)v469 >> 32))) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v469 >> 96) CONCAT (unsigned int)((uint128_t)v469 >> 96) CONCAT (unsigned int)((uint128_t)v469 >> 64) CONCAT (unsigned int)((uint128_t)v469 >> 64))) & 340282366920938463463374607431768211455; | |
| v404 = (v469 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v469 >> 96) CONCAT (unsigned int)((uint128_t)v469 >> 96) CONCAT (unsigned int)((uint128_t)v469 >> 96) CONCAT (unsigned int)((uint128_t)v469 >> 96))) & 340282366920938463463374607431768211455; | |
| if (((char)v16 & 3)) | |
| { | |
| LABEL_4034e4: | |
| v233 = v107 & 340282366920938463463374607431768211455; | |
| v470 = v443 * 4; | |
| v402 = (v105 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v105 & 340282366920938463463374607431768211455, (uint128_t)v233)) & 340282366920938463463374607431768211455; | |
| *((unsigned int *)((char *)v454 + 4 * v443)) = (unsigned int)v206; | |
| v471 = v92 & 340282366920938463463374607431768211455; | |
| v472 = v96 & 340282366920938463463374607431768211455; | |
| *((unsigned int *)((char *)v31 + 4 * v443)) = v471; | |
| v404 = (v471 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v471, v472)) & 340282366920938463463374607431768211455; | |
| v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v206 & 340282366920938463463374607431768211455, v34)) & 340282366920938463463374607431768211455; | |
| if (v443 + 1 < v409) | |
| { | |
| v233 = v107 & 340282366920938463463374607431768211455; | |
| v234 = v109 & 340282366920938463463374607431768211455; | |
| v402 = (v105 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v105 & 340282366920938463463374607431768211455, (uint128_t)v234)) & 340282366920938463463374607431768211455; | |
| v473 = v92 & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(4 + (char *)v454 + v470)) = (unsigned int)v206; | |
| *((unsigned int *)(4 + (char *)v31 + v470)) = v473; | |
| v404 = (v473 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v473, v472)) & 340282366920938463463374607431768211455; | |
| if (v443 + 2 < v409) | |
| { | |
| v234 = v109 & 340282366920938463463374607431768211455; | |
| v233 = v107 & 340282366920938463463374607431768211455; | |
| v402 = (v105 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v105 & 340282366920938463463374607431768211455, (uint128_t)v234)) & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(8 + (char *)v454 + v470)) = (unsigned int)v206; | |
| v474 = v92 & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(8 + (char *)v31 + v470)) = v474; | |
| v404 = (v474 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v474, v472)) & 340282366920938463463374607431768211455; | |
| } | |
| } | |
| } | |
| } | |
| else | |
| { | |
| LABEL_404661: | |
| v233 = v107 & 340282366920938463463374607431768211455; | |
| v475 = v92 & 340282366920938463463374607431768211455; | |
| v402 = v105 & 340282366920938463463374607431768211455; | |
| *((unsigned int *)((char *)v454 + 4 * v443)) = (unsigned int)v206; | |
| *((unsigned int *)((char *)v31 + 4 * v443)) = v475; | |
| v404 = (v475 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v475, (uint128_t)v402)) & 340282366920938463463374607431768211455; | |
| v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v206 & 340282366920938463463374607431768211455, v34)) & 340282366920938463463374607431768211455; | |
| if (v443 + 1 < v409) | |
| { | |
| v233 = v107 & 340282366920938463463374607431768211455; | |
| v234 = v109 & 340282366920938463463374607431768211455; | |
| v476 = v92 & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(v454 + v452 + 4)) = (unsigned int)v206; | |
| *((unsigned int *)(v31 + v452 + 4)) = v476; | |
| v404 = (v476 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v476, (uint128_t)v402)) & 340282366920938463463374607431768211455; | |
| v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v206 & 340282366920938463463374607431768211455, (uint128_t)v235)) & 340282366920938463463374607431768211455; | |
| if (v443 + 2 < v409) | |
| { | |
| v234 = v109 & 340282366920938463463374607431768211455; | |
| v233 = v107 & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(v454 + v452 + 8)) = (unsigned int)v206; | |
| v477 = v92 & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(v31 + v452 + 8)) = v477; | |
| v404 = (v477 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v477, (uint128_t)v402)) & 340282366920938463463374607431768211455; | |
| v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v206 & 340282366920938463463374607431768211455, (uint128_t)v235)) & 340282366920938463463374607431768211455; | |
| if (v443 + 3 < v409) | |
| { | |
| v233 = v107 & 340282366920938463463374607431768211455; | |
| v234 = v109 & 340282366920938463463374607431768211455; | |
| v478 = v92 & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(v454 + v452 + 12)) = (unsigned int)v206; | |
| *((unsigned int *)(v31 + v452 + 12)) = v478; | |
| v404 = (v478 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v478, (uint128_t)v402)) & 340282366920938463463374607431768211455; | |
| if (v443 + 4 < v409) | |
| { | |
| v233 = v107 & 340282366920938463463374607431768211455; | |
| v234 = v109 & 340282366920938463463374607431768211455; | |
| v479 = v92 & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(v454 + v452 + 16)) = (unsigned int)v206; | |
| *((unsigned int *)(v31 + v452 + 16)) = v479; | |
| v404 = (v479 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v479, (uint128_t)v402)) & 340282366920938463463374607431768211455; | |
| if (v443 + 5 < v409) | |
| { | |
| v233 = v107 & 340282366920938463463374607431768211455; | |
| v234 = v109 & 340282366920938463463374607431768211455; | |
| v480 = v92 & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(v454 + v452 + 20)) = (unsigned int)v206; | |
| *((unsigned int *)(v31 + v452 + 20)) = v480; | |
| v404 = (v480 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v480, (uint128_t)v402)) & 340282366920938463463374607431768211455; | |
| v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v206 & 340282366920938463463374607431768211455, AddV((uint128_t)v206 & 340282366920938463463374607431768211455, AddV((uint128_t)v206 & 340282366920938463463374607431768211455, (uint128_t)v235) & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455; | |
| if (v443 + 6 < v409) | |
| { | |
| v233 = v107 & 340282366920938463463374607431768211455; | |
| v234 = v109 & 340282366920938463463374607431768211455; | |
| v481 = v92 & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(v454 + v452 + 24)) = (unsigned int)v206; | |
| *((unsigned int *)(v31 + v452 + 24)) = v481; | |
| v404 = (v481 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v481, (uint128_t)v402)) & 340282366920938463463374607431768211455; | |
| if (v443 + 7 < v409) | |
| { | |
| v234 = v109 & 340282366920938463463374607431768211455; | |
| v233 = v107 & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(v454 + v452 + 28)) = (unsigned int)v206; | |
| v482 = v92 & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(v31 + v452 + 28)) = v482; | |
| v404 = (v482 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v482, (uint128_t)v402)) & 340282366920938463463374607431768211455; | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| LABEL_403608: | |
| v483 = v404 & 340282366920938463463374607431768211455; | |
| v484 = v402 & 340282366920938463463374607431768211455; | |
| v485 = v233 & 340282366920938463463374607431768211455; | |
| v486 = v234 & 340282366920938463463374607431768211455; | |
| v487 = v235 & 340282366920938463463374607431768211455; | |
| v46 = 0; | |
| v47 = 0; | |
| v18.allocator<char> const&) (.constprop.0)("complex_fir_scalar"); | |
| v24.ScopedTimer(v18, &v46); | |
| v18._M_dispose(); | |
| v488 = *((long long *)&v59) - (char *)v27; | |
| v489 = v488 >> 2; | |
| v22.resize(v409); | |
| v82.resize(v409); | |
| if (v33) | |
| { | |
| v490 = 0; | |
| v491 = *((long long *)&v82); | |
| v485 = v107 & 340282366920938463463374607431768211455; | |
| do | |
| { | |
| if (v488) | |
| { | |
| v492 = 0; | |
| do | |
| { | |
| v492 += 1; | |
| v484 = v105 & 340282366920938463463374607431768211455; | |
| } while (v490 >= v492 && v492 < v489); | |
| v483 = v92 & 340282366920938463463374607431768211455; | |
| } | |
| else | |
| { | |
| v483 = v483 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455; | |
| v484 = (v484 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v483 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455; | |
| } | |
| v486 = (v486 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v483 & 340282366920938463463374607431768211455, v33)) & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(*((long long *)&v81) + v490 * 4)) = (unsigned int)v105; | |
| *((unsigned int *)((char *)v491 + 0x4 * v490)) = (unsigned int)v206; | |
| v490 += 1; | |
| } while (v490 < v409); | |
| } | |
| v18.allocator<char> const&) (.constprop.0)("complex_fir_avx"); | |
| v24.ScopedTimer(v18, &v47); | |
| v18._M_dispose(); | |
| v17.resize(v409); | |
| v84.resize(v409); | |
| if (v11 != v407) | |
| { | |
| v493 = 1; | |
| v494 = *((long long *)&v84); | |
| v495 = 0; | |
| do | |
| { | |
| if (v488 > 28 && v493 > 7) | |
| { | |
| v496 = v484 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455; | |
| v497 = 8; | |
| v498 = v493; | |
| while (true) | |
| { | |
| v499 = v498; | |
| if (v409 >= v499) | |
| v487 = *((int256_t *)(-32 + (char *)v407 + 4 * v499)); | |
| v500 = v497 + 8; | |
| v501 = v496; | |
| if (v489 < v500 || !((v501 = v496, v493 >= v500))) | |
| break; | |
| v497 = v500; | |
| v498 = v499 - 8; | |
| } | |
| } | |
| else | |
| { | |
| v496 = v484 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455; | |
| v497 = 0; | |
| v501 = v496; | |
| } | |
| v503 = (v501 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v501 / 0x100000000000000000000000000000000 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455; | |
| v504 = ((v483 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v501 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((v483 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v501 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455) + v503) & 340282366920938463463374607431768211455; | |
| v505 = (v503 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v504 >> 96) CONCAT (unsigned int)((uint128_t)v504 >> 96) CONCAT (unsigned int)((unsigned long long)v504 >> 32) CONCAT (unsigned int)((unsigned long long)v504 >> 32))) & 340282366920938463463374607431768211455; | |
| v506 = ((v505 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned long long)(v505 >> 64) CONCAT (unsigned long long)(((v504 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v504 + v505) & 340282366920938463463374607431768211455) >> 64))) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint128_t)v496 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455; | |
| v507 = v496 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455; | |
| v508 = (v506 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v506 + v507) & 340282366920938463463374607431768211455; | |
| v509 = (v507 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v508 >> 96) CONCAT (unsigned int)((uint128_t)v508 >> 96) CONCAT (unsigned int)((unsigned long long)v508 >> 32) CONCAT (unsigned int)((unsigned long long)v508 >> 32))) & 340282366920938463463374607431768211455; | |
| v484 = (v509 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned long long)(v509 >> 64) CONCAT (unsigned long long)(((v508 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v508 + v509) & 340282366920938463463374607431768211455) >> 64))) & 340282366920938463463374607431768211455; | |
| if (v495 >= v497 && v497 < v489) | |
| { | |
| do | |
| { | |
| v484 = v105 & 340282366920938463463374607431768211455; | |
| v497 += 1; | |
| } while (v495 >= v497 && v497 < v489); | |
| } | |
| *((unsigned int *)(*((long long *)&v83) + v495 * 4)) = (unsigned int)v92; | |
| v493 += 1; | |
| *((unsigned int *)((char *)v494 + 0x4 * v495)) = (unsigned int)v206; | |
| v495 += 1; | |
| v483 = v92 & 340282366920938463463374607431768211455; | |
| v487 = (v487 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v483, v32)) & 340282366920938463463374607431768211455; | |
| } while (v495 < v409); | |
| v483 &= 340282366920938463463374607431768211455; | |
| v484 &= 340282366920938463463374607431768211455; | |
| v485 = v107 & 340282366920938463463374607431768211455; | |
| v486 &= 340282366920938463463374607431768211455; | |
| v487 &= 340282366920938463463374607431768211455; | |
| } | |
| g_408040.char_traits<char>>("Complex mul scalar: checksum=", 0x1d, v495); | |
| v510 = v483 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v30; | |
| v511 = v510 >> 64; | |
| v512 = g_408040._M_insert<double>((unsigned long long)v510 | v511 * 0); | |
| v512.char_traits<char>>(" time=", 0x7, v181); | |
| v512._M_insert<double>(v44).char_traits<char>>&, char const*) (.isra.0)(" ms\n"); | |
| g_408040.char_traits<char>>("Complex mul AVX : checksum=", 0x1d, v181); | |
| v514 = ((v510 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | v511 * 0x10000000000000000) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | (uint256_t)v44) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v34; | |
| v515 = v514 >> 64; | |
| v516 = g_408040._M_insert<double>((unsigned long long)v514 | v515 * 0); | |
| v516.char_traits<char>>(" time=", 0x7, v181); | |
| v516._M_insert<double>(v45).char_traits<char>>&, char const*) (.isra.0)(" ms\n"); | |
| g_408040.char_traits<char>>("FIR scalar : checksum=", 0x1d, v181); | |
| v518 = ((v514 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | v515 * 0x10000000000000000) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | (uint256_t)v45) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v33; | |
| v519 = v518 >> 64; | |
| v520 = g_408040._M_insert<double>((unsigned long long)v518 | v519 * 0); | |
| v520.char_traits<char>>(" time=", 0x7, v181); | |
| v520._M_insert<double>(v46).char_traits<char>>&, char const*) (.isra.0)(" ms\n"); | |
| g_408040.char_traits<char>>("FIR AVX : checksum=", 0x1d, v181); | |
| v522 = ((v518 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | v519 * 0x10000000000000000) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | (uint256_t)v46) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v32; | |
| v523 = v522 >> 64; | |
| v524 = g_408040._M_insert<double>((unsigned long long)v522 | v523 * 0); | |
| v524.char_traits<char>>(" time=", 0x7, v181); | |
| v524._M_insert<double>(v47).char_traits<char>>&, char const*) (.isra.0)(" ms\n"); | |
| g_408040.char_traits<char>>("Delta cmul checksum (AVX - scalar): ", 0x24, v181); | |
| v526 = (((v522 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | v523 * 0x10000000000000000) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | (uint256_t)v47) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(SubV(v96 & 340282366920938463463374607431768211455, v30))) & 340282366920938463463374607431768211455; | |
| v527 = v526 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v526 & 18446744073709551615; | |
| g_408040._M_insert<double>((unsigned long long)v527 | (unsigned long long)(v527 >> 64) * 0).char_traits<char>>&, char const*) (.isra.0)("\n"); | |
| g_408040.char_traits<char>>("Delta FIR checksum (AVX - scalar): ", 0x24, v181); | |
| v529 = ((v527 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | (v527 >> 64 & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | SubV((uint128_t)v96 & 340282366920938463463374607431768211455, v33)) & 340282366920938463463374607431768211455; | |
| v530 = ((v529 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v529 & 18446744073709551615) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | ((v529 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v529 & 18446744073709551615) >> 64 & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455; | |
| g_408040._M_insert<double>((unsigned long long)v530).char_traits<char>>&, char const*) (.isra.0)("\n"); | |
| g_408040.char_traits<char>>&, char const*) (.isra.0)("--------------------------------------------------------\n\n"); | |
| g_408040.char_traits<char>>&, char const*) (.isra.0)("=== Workload 4: Soft clip / limiter on FIR output ===\n"); | |
| v48 = 0; | |
| v49 = 0; | |
| v60.vector(v22); | |
| v62.vector(&v60); | |
| v18.allocator<char> const&) (.constprop.0)("soft_clip_scalar"); | |
| v24.ScopedTimer(v18, &v48); | |
| v18._M_dispose(); | |
| v532 = v60; | |
| v533 = *((long long *)&v61) - v532; | |
| v534 = v533 >> 2; | |
| if (*((long long *)&v61) != v532) | |
| { | |
| v535 = (v533 ? v534 : 1); | |
| if (v533 > 28) | |
| { | |
| v536 = [D] unsupported_<class 'pyvex.expr.Qop'>(); | |
| v537 = [D] unsupported_<class 'pyvex.expr.Qop'>(); | |
| v538 = v532; | |
| do | |
| { | |
| v539 = *((int256_t *)v538); | |
| v540 = v538 + 32; | |
| v541 = CmpLTV(UnaryOp unpack, UnaryOp unpack) CONCAT CmpLTV(UnaryOp unpack, UnaryOp unpack); | |
| v542 = (v541 | (CmpLTV(UnaryOp unpack, UnaryOp unpack) CONCAT CmpLTV(UnaryOp unpack, UnaryOp unpack))) ^ CmpEQV(v485, v485); | |
| v543 = v537 & (SarNV(UnaryOp unpack, 31) CONCAT SarNV(UnaryOp unpack, 31)) | (v539 & (SarNV(UnaryOp unpack, 31) CONCAT SarNV(UnaryOp unpack, 31)) | v536 & ~(SarNV(UnaryOp unpack, 31) CONCAT SarNV(UnaryOp unpack, 31))) & ~(SarNV(UnaryOp unpack, 31) CONCAT SarNV(UnaryOp unpack, 31)); | |
| v544 = (v487 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)v543 >> 32) CONCAT (unsigned int)((unsigned long long)v543 >> 32) CONCAT (unsigned int)((unsigned long long)v543 >> 32) CONCAT (unsigned int)((unsigned long long)v543 >> 32))) & 340282366920938463463374607431768211455; | |
| v545 = (v542 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v543 >> 96) CONCAT (unsigned int)((uint128_t)v543 >> 96) CONCAT (unsigned int)((uint128_t)v543 >> 96) CONCAT (unsigned int)((uint128_t)v543 >> 96))) & 340282366920938463463374607431768211455; | |
| *((uint256_t *)&v540[32]) = v543; | |
| v546 = (v109 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v109 & 340282366920938463463374607431768211455, (uint128_t)v543)) & 340282366920938463463374607431768211455; | |
| v487 = (v544 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v544, (uint128_t)v546)) & 340282366920938463463374607431768211455; | |
| v547 = (v546 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v543 >> 96) CONCAT (unsigned int)((uint128_t)v543 >> 96) CONCAT (unsigned int)((uint128_t)v543 >> 64) CONCAT (unsigned int)((uint128_t)v543 >> 64))) & 340282366920938463463374607431768211455; | |
| v548 = (v543 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v543 / 0x100000000000000000000000000000000 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455; | |
| v549 = (v547 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v547, (uint128_t)v487)) & 340282366920938463463374607431768211455; | |
| v550 = (v545 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v545, (uint128_t)v549)) & 340282366920938463463374607431768211455; | |
| v551 = (v549 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)v548 >> 32) CONCAT (unsigned int)((unsigned long long)v548 >> 32) CONCAT (unsigned int)((unsigned long long)v548 >> 32) CONCAT (unsigned int)((unsigned long long)v548 >> 32))) & 340282366920938463463374607431768211455; | |
| v552 = (v550 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v548, (uint128_t)v550)) & 340282366920938463463374607431768211455; | |
| v553 = (v551 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v551, (uint128_t)v552)) & 340282366920938463463374607431768211455; | |
| v554 = (v552 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v548 >> 96) CONCAT (unsigned int)((uint128_t)v548 >> 96) CONCAT (unsigned int)((uint128_t)v548 >> 64) CONCAT (unsigned int)((uint128_t)v548 >> 64))) & 340282366920938463463374607431768211455; | |
| v556 = (v553 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV(AddV((uint128_t)v554, (uint128_t)v553) & 340282366920938463463374607431768211455, ((unsigned int)((uint128_t)v548 >> 96) CONCAT (unsigned int)((uint128_t)v548 >> 96) CONCAT (unsigned int)((uint128_t)v548 >> 96) CONCAT (unsigned int)((uint128_t)v548 >> 96)) & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455; | |
| v538 = v540; | |
| } while (v538 != (v535 >> 3) * 32 + v532); | |
| v557 = v535 & 18446744073709551608; | |
| if (((char)v535 & 7)) | |
| { | |
| v530 = v555 & 340282366920938463463374607431768211455; | |
| v484 = v537 & 340282366920938463463374607431768211455; | |
| v486 = v556 & 340282366920938463463374607431768211455; | |
| LABEL_403d41: | |
| v558 = v535 - v557; | |
| if (v558 - 1 > 2) | |
| { | |
| v559 = v532 + v557 * 4; | |
| v486 = (v486 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455; | |
| v560 = (1061997773 CONCAT 1061997773 CONCAT 1061997773 CONCAT 1061997773) & 340282366920938463463374607431768211455; | |
| v561 = (v530 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (3209481421 CONCAT 3209481421 CONCAT 3209481421 CONCAT 3209481421)) & 340282366920938463463374607431768211455; | |
| v562 = *(v559) & 340282366920938463463374607431768211455; | |
| v563 = CmpLTV(v560, v562) & 340282366920938463463374607431768211455; | |
| v564 = (v561 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(v562 & SarNV(((v563 | CmpLTV(v562, (uint128_t)v561) & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455 ^ (uint128_t)v486) & 340282366920938463463374607431768211455, 31) | (uint128_t)v561 & ~(SarNV(((v563 | CmpLTV(v562, (uint128_t)v561) & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455 ^ (uint128_t)v486) & 340282366920938463463374607431768211455, 31)))) & 340282366920938463463374607431768211455; | |
| v565 = (v564 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v560 & SarNV(v563, 31) | (uint128_t)v564 & ~(SarNV(v563, 31))) & 340282366920938463463374607431768211455; | |
| *(v559) = v565; | |
| v557 += v558 & 18446744073709551612; | |
| v530 = (v565 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v565 >> 96) CONCAT (unsigned int)((uint128_t)v565 >> 96) CONCAT (unsigned int)((uint128_t)v565 >> 96) CONCAT (unsigned int)((uint128_t)v565 >> 96))) & 340282366920938463463374607431768211455; | |
| if (!((char)v558 & 3)) | |
| goto LABEL_403e8f; | |
| } | |
| v566 = v557 * 4; | |
| v567 = v92 & 340282366920938463463374607431768211455; | |
| if (((CmpF((unsigned long long)v567, 1061997773) & 69 | (char)((CmpF((unsigned long long)v567, 1061997773) & 69) >> 6)) & 1) == 1) | |
| v530 = (v567 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MaxV((uint128_t)v206 & 340282366920938463463374607431768211455, (uint128_t)v567)) & 340282366920938463463374607431768211455; | |
| else | |
| v530 = v92 & 340282366920938463463374607431768211455; | |
| *((unsigned int *)((char *)v532 + v566)) = (unsigned int)v92; | |
| if (v557 + 1 < v534) | |
| { | |
| v568 = v92 & 340282366920938463463374607431768211455; | |
| if (((CmpF((unsigned long long)v568, 1061997773) & 69 | (char)((CmpF((unsigned long long)v568, 1061997773) & 69) >> 6)) & 1) == 1) | |
| v530 = (v568 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MaxV((uint128_t)v206 & 340282366920938463463374607431768211455, (uint128_t)v568)) & 340282366920938463463374607431768211455; | |
| else | |
| v530 = v92 & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(4 + (char *)v532 + v566)) = (unsigned int)v92; | |
| if (v557 + 2 < v534) | |
| { | |
| v569 = v92 & 340282366920938463463374607431768211455; | |
| if (((CmpF((unsigned long long)v569, 1061997773) & 69 | (char)((CmpF((unsigned long long)v569, 1061997773) & 69) >> 6)) & 1) != 1) | |
| v530 = v92 & 340282366920938463463374607431768211455; | |
| else | |
| v530 = (v569 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MaxV((uint128_t)v206 & 340282366920938463463374607431768211455, (uint128_t)v569)) & 340282366920938463463374607431768211455; | |
| *((unsigned int *)(8 + (char *)v532 + v566)) = (unsigned int)v92; | |
| } | |
| } | |
| } | |
| else | |
| { | |
| v530 = v555 & 340282366920938463463374607431768211455; | |
| v486 = v556 & 340282366920938463463374607431768211455; | |
| } | |
| } | |
| else | |
| { | |
| v557 = 0; | |
| goto LABEL_403d41; | |
| } | |
| } | |
| LABEL_403e8f: | |
| v18.allocator<char> const&) (.constprop.0)("soft_clip_avx"); | |
| v24.ScopedTimer(v18, &v49); | |
| v18._M_dispose(); | |
| v570 = v62; | |
| v571 = *((long long *)&v63) - (char *)v570; | |
| v572 = v571 >> 2; | |
| if (v571 > 28) | |
| { | |
| v573 = 8; | |
| v574 = [D] unsupported_<class 'pyvex.expr.Qop'>(); | |
| v575 = [D] unsupported_<class 'pyvex.expr.Qop'>(); | |
| while (true) | |
| { | |
| v576 = v573 + 8; | |
| v577 = MaxV(MinV(*((int256_t *)&v570[8 + v573]), v575), v574); | |
| *((void*)&v570[8 + v573]) = v577; | |
| v578 = (v577 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v577 / 0x100000000000000000000000000000000 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455; | |
| v579 = ((v530 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v577 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((v530 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v577 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455) + v578) & 340282366920938463463374607431768211455; | |
| v580 = (v578 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v579 >> 96) CONCAT (unsigned int)((uint128_t)v579 >> 96) CONCAT (unsigned int)((unsigned long long)v579 >> 32) CONCAT (unsigned int)((unsigned long long)v579 >> 32))) & 340282366920938463463374607431768211455; | |
| v530 = ((v579 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v579 + v580) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v579 + (uint128_t)v580 & 340282366920938463463374607431768211455, ((unsigned long long)(v580 >> 64) CONCAT (unsigned long long)(((v579 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v579 + v580) & 340282366920938463463374607431768211455) >> 64)) & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455; | |
| if (v572 < v576) | |
| break; | |
| v573 = v576; | |
| } | |
| } | |
| else | |
| { | |
| v573 = 0; | |
| } | |
| if (v573 < v572) | |
| { | |
| v581 = v572 - v573; | |
| if (v581 - 1 > 6) | |
| { | |
| v582 = &v570[v573]; | |
| v583 = [D] unsupported_<class 'pyvex.expr.Qop'>(); | |
| v584 = [D] unsupported_<class 'pyvex.expr.Qop'>(); | |
| v585 = *(v582); | |
| v586 = v583 & (SarNV(UnaryOp unpack, 31) CONCAT SarNV(UnaryOp unpack, 31)) | (v585 & (SarNV(UnaryOp unpack, 31) CONCAT SarNV(UnaryOp unpack, 31)) | v584 & ~(SarNV(UnaryOp unpack, 31) CONCAT SarNV(UnaryOp unpack, 31))) & ~(SarNV(UnaryOp unpack, 31) CONCAT SarNV(UnaryOp unpack, 31)); | |
| *(v582) = v586; | |
| v587 = v581 & 18446744073709551608; | |
| v588 = (v586 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v586 / 0x100000000000000000000000000000000 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455; | |
| v589 = v573 + v587; | |
| v530 = (v588 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v588 >> 96) CONCAT (unsigned int)((uint128_t)v588 >> 96) CONCAT (unsigned int)((uint128_t)v588 >> 96) CONCAT (unsigned int)((uint128_t)v588 >> 96))) & 340282366920938463463374607431768211455; | |
| if (!((char)v581 & 7)) | |
| goto LABEL_404178; | |
| } | |
| else | |
| { | |
| v589 = v573; | |
| v587 = 0; | |
| } | |
| v590 = v581 - v587; | |
| if (v590 - 1 > 2) | |
| { | |
| v591 = (1061997773 CONCAT 1061997773 CONCAT 1061997773 CONCAT 1061997773) & 340282366920938463463374607431768211455; | |
| v592 = (v530 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (3209481421 CONCAT 3209481421 CONCAT 3209481421 CONCAT 3209481421)) & 340282366920938463463374607431768211455; | |
| v593 = &v570[v587 + v573]; | |
| v594 = *(v593) & 340282366920938463463374607431768211455; | |
| v589 += v590 & 18446744073709551612; | |
| v595 = CmpLTV(v591, v594) & 340282366920938463463374607431768211455; | |
| v596 = ...; | |
| v597 = (v596 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v591 & SarNV(v595, 31) | (uint128_t)v596 & ~(SarNV(v595, 31))) & 340282366920938463463374607431768211455; | |
| *(v593) = v597; | |
| v530 = (v597 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v597 >> 96) CONCAT (unsigned int)((uint128_t)v597 >> 96) CONCAT (unsigned int)((uint128_t)v597 >> 96) CONCAT (unsigned int)((uint128_t)v597 >> 96))) & 340282366920938463463374607431768211455; | |
| if (!((unsigned int)v590 & 3)) | |
| goto LABEL_404178; | |
| } | |
| v598 = v589 * 4; | |
| v599 = v92 & 340282366920938463463374607431768211455; | |
| v530 = (((CmpF((unsigned long long)v599, 1061997773) & 69 | (CmpF((unsigned long long)v599, 1061997773) & 69) >> 6) & 1) == 1 ? (v599 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(MaxV(v206 & 340282366920938463463374607431768211455, (uint128_t)v599))) & 340282366920938463463374607431768211455 : v92 & 340282366920938463463374607431768211455); | |
| *((unsigned int *)(v570 + v598)) = (unsigned int)v92; | |
| if (v589 + 1 < v572) | |
| { | |
| v600 = v92 & 340282366920938463463374607431768211455; | |
| v530 = (((CmpF((unsigned long long)v600, 1061997773) & 69 | (CmpF((unsigned long long)v600, 1061997773) & 69) >> 6) & 1) == 1 ? (v600 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(MaxV(v206 & 340282366920938463463374607431768211455, (uint128_t)v600))) & 340282366920938463463374607431768211455 : v92 & 340282366920938463463374607431768211455); | |
| *((unsigned int *)(v570 + v598 + 4)) = (unsigned int)v92; | |
| if (v589 + 2 < v572) | |
| { | |
| v601 = v92 & 340282366920938463463374607431768211455; | |
| v530 = (((CmpF((unsigned long long)v601, 1061997773) & 69 | (CmpF((unsigned long long)v601, 1061997773) & 69) >> 6) & 1) == 1 ? (v601 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(MaxV(v206 & 340282366920938463463374607431768211455, (uint128_t)v601))) & 340282366920938463463374607431768211455 : v92 & 340282366920938463463374607431768211455); | |
| *((unsigned int *)(v570 + v598 + 8)) = (unsigned int)v92; | |
| } | |
| } | |
| } | |
| LABEL_404178: | |
| g_408040.char_traits<char>>("Soft clip scalar: checksum=", 0x1b, v570); | |
| v602 = v530 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v33; | |
| v603 = v602 >> 64; | |
| v604 = g_408040._M_insert<double>((unsigned long long)v602 | v603 * 0); | |
| v604.char_traits<char>>(" time=", 0x7, v181); | |
| v604._M_insert<double>(v48).char_traits<char>>&, char const*) (.isra.0)(" ms\n"); | |
| g_408040.char_traits<char>>("Soft clip AVX : checksum=", 0x1b, v181); | |
| v606 = ((v602 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | v603 * 0x10000000000000000) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | (uint256_t)v48) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v34; | |
| v607 = v606 >> 64; | |
| v608 = g_408040._M_insert<double>((unsigned long long)v606 | v607 * 0); | |
| v608.char_traits<char>>(" time=", 0x7, v181); | |
| v608._M_insert<double>(v49).char_traits<char>>&, char const*) (.isra.0)(" ms\n"); | |
| g_408040.char_traits<char>>("Delta clip checksum (AVX - scalar): ", 0x24, v181); | |
| v610 = (((v606 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | v607 * 0x10000000000000000) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | (uint256_t)v49) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(SubV(v96 & 340282366920938463463374607431768211455, v33))) & 340282366920938463463374607431768211455; | |
| g_408040._M_insert<double>((unsigned long long)v610 | (unsigned long long)((v610 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v610 & 18446744073709551615) >> 64) * 0).char_traits<char>>&, char const*) (.isra.0)("\n"); | |
| g_408040.char_traits<char>>&, char const*) (.isra.0)("\nDone.\n"); | |
| return 0; | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <immintrin.h> | |
| #include <algorithm> | |
| #include <chrono> | |
| #include <cmath> | |
| #include <complex> | |
| #include <cstdint> | |
| #include <iostream> | |
| #include <numeric> | |
| #include <random> | |
| #include <string> | |
| #include <vector> | |
// Short aliases for the fixed-size scalar types used in this file.
using f32 = float;
using f64 = double;
using u32 = std::uint32_t;
using u64 = std::uint64_t;

// Problem sizes. The two element counts are powers of two; the pixel count is
// derived from the frame dimensions.
constexpr std::size_t N_SAXPY = std::size_t{1} << 22; // 0x400000 = 4,194,304 elements
constexpr int WIDTH = 1920;
constexpr int HEIGHT = 1080;
constexpr std::size_t N_PIXELS =
    static_cast<std::size_t>(WIDTH) * static_cast<std::size_t>(HEIGHT); // 2,073,600
constexpr std::size_t N_COMPLEX = std::size_t{1} << 18; // 0x40000 = 262,144 elements
constexpr int FIR_TAPS = 16;                            // length of the FIR kernel
// -----------------------------------------------------------------------------
// ScopedTimer: RAII timing helper writing elapsed time [ms] to a referenced slot
// -----------------------------------------------------------------------------
// Construction records a start timestamp; destruction stores the time elapsed
// since then, in milliseconds, into the double the caller passed in. The
// referenced double must outlive the timer.
struct ScopedTimer
{
    // steady_clock is monotonic. high_resolution_clock may be an alias of
    // system_clock, whose adjustments can distort (or even negate) an interval.
    using clock = std::chrono::steady_clock;

    std::string label;    // name given at construction; not read by the timer itself
    double &out_ms;       // slot that receives the elapsed milliseconds
    clock::time_point t0; // timestamp captured by the constructor

    // name:   label stored alongside the measurement.
    // target: written exactly once, when this timer is destroyed.
    ScopedTimer(const std::string &name, double &target)
        : label(name), out_ms(target), t0(clock::now())
    {
    }

    // A copy would write into the same slot a second time with a later end
    // timestamp, so the timer is deliberately non-copyable.
    ScopedTimer(const ScopedTimer &) = delete;
    ScopedTimer &operator=(const ScopedTimer &) = delete;

    ~ScopedTimer()
    {
        const auto t1 = clock::now();
        out_ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
    }
};
| // ----------------------------------------------------------------------------- | |
| // ComplexSoA: separate real/imaginary storage for AVX-friendly layout | |
| // ----------------------------------------------------------------------------- | |
| struct ComplexSoA | |
| { | |
| std::vector<f32> re; | |
| std::vector<f32> im; | |
| explicit ComplexSoA(std::size_t n = 0) : re(n), im(n) {} | |
| std::size_t size() const { return re.size(); } | |
| }; | |
| // ----------------------------------------------------------------------------- | |
| // Checksums | |
| // ----------------------------------------------------------------------------- | |
| static double checksum_real(const std::vector<f32> &v) | |
| { | |
| double sum = 0.0; | |
| for (f32 x : v) | |
| sum += static_cast<double>(x); | |
| return sum; | |
| } | |
| static double checksum_complex(const ComplexSoA &c) | |
| { | |
| double sum = 0.0; | |
| const std::size_t n = c.size(); | |
| for (std::size_t i = 0; i < n; ++i) | |
| { | |
| sum += static_cast<double>(c.re[i]) + | |
| static_cast<double>(c.im[i]); | |
| } | |
| return sum; | |
| } | |
| // ----------------------------------------------------------------------------- | |
| // Initialisation helpers | |
| // ----------------------------------------------------------------------------- | |
| static void init_saxpy_vectors(std::vector<f32> &x, | |
| std::vector<f32> &y) | |
| { | |
| const std::size_t n = x.size(); | |
| std::mt19937 rng(0x1234abcdU); | |
| std::uniform_real_distribution<f32> dist(-1.0f, 1.0f); | |
| for (std::size_t i = 0; i < n; ++i) | |
| { | |
| x[i] = dist(rng); | |
| y[i] = dist(rng); | |
| } | |
| } | |
| static void init_image(std::vector<f32> &img, | |
| int width, | |
| int height) | |
| { | |
| const std::size_t n = static_cast<std::size_t>(width) * height; | |
| std::mt19937 rng(0x9876fedcU); | |
| std::uniform_real_distribution<f32> dist(0.0f, 1.0f); | |
| for (std::size_t i = 0; i < n; ++i) | |
| img[i] = dist(rng); | |
| } | |
| static void fill_complex(ComplexSoA &c, u32 seed) | |
| { | |
| const std::size_t n = c.size(); | |
| std::mt19937 rng(seed); | |
| std::uniform_real_distribution<f32> dist(-1.0f, 1.0f); | |
| for (std::size_t i = 0; i < n; ++i) | |
| { | |
| c.re[i] = dist(rng); | |
| c.im[i] = dist(rng); | |
| } | |
| } | |
| // FIR kernel: 16-tap Gaussian-shaped low-pass, normalised to sum=1 | |
| static std::vector<f32> make_fir_kernel() | |
| { | |
| std::vector<f32> h(FIR_TAPS); | |
| const float center = 7.5f; // matches (i - 7.5) in decompiled constants | |
| const float scale = 0.125f; // 0x3e000000 | |
| const float pi = 3.1415927410f; // 0x40490fdb | |
| for (int i = 0; i < FIR_TAPS; ++i) | |
| { | |
| const float x = (static_cast<float>(i) - center) * scale; | |
| const float g = std::exp(-x * x); | |
| h[i] = g; | |
| // Extra trigonometric path used only to exercise exp/cos-like work; | |
| // result not stored. | |
| const float phase = (x + 0.5f) * pi * 0.0625f; // 0x3f000000 * pi * 0x3d800000 | |
| (void)std::cos(phase); | |
| } | |
| float sum = 0.0f; | |
| for (float v : h) | |
| sum += v; | |
| if (sum != 0.0f) | |
| { | |
| const float inv = 1.0f / sum; | |
| for (float &v : h) | |
| v *= inv; | |
| } | |
| return h; | |
| } | |
| // ----------------------------------------------------------------------------- | |
| // Workload 1: SAXPY + cosine similarity | |
| // ----------------------------------------------------------------------------- | |
| static double saxpy_scalar(std::vector<f32> &y, | |
| const std::vector<f32> &x, | |
| f32 a) | |
| { | |
| const std::size_t n = y.size(); | |
| double sum = 0.0; | |
| for (std::size_t i = 0; i < n; ++i) | |
| { | |
| const float yi = a * x[i] + y[i]; | |
| y[i] = yi; | |
| sum += static_cast<double>(yi); | |
| } | |
| return sum; | |
| } | |
| static double saxpy_avx(std::vector<f32> &y, | |
| const std::vector<f32> &x, | |
| f32 a) | |
| { | |
| const std::size_t n = y.size(); | |
| const float *px = x.data(); | |
| float *py = y.data(); | |
| __m256 va = _mm256_set1_ps(a); | |
| __m256 vacc = _mm256_setzero_ps(); | |
| std::size_t i = 0; | |
| alignas(32) float tmp[8]; | |
| for (; i + 8 <= n; i += 8) | |
| { | |
| __m256 vx = _mm256_loadu_ps(px + i); | |
| __m256 vy = _mm256_loadu_ps(py + i); | |
| __m256 vz = _mm256_add_ps(_mm256_mul_ps(va, vx), vy); | |
| _mm256_storeu_ps(py + i, vz); | |
| vacc = _mm256_add_ps(vacc, vz); | |
| } | |
| _mm256_storeu_ps(tmp, vacc); | |
| double sum = 0.0; | |
| for (int j = 0; j < 8; ++j) | |
| sum += static_cast<double>(tmp[j]); | |
| for (; i < n; ++i) | |
| { | |
| const float yi = a * px[i] + py[i]; | |
| py[i] = yi; | |
| sum += static_cast<double>(yi); | |
| } | |
| return sum; | |
| } | |
| static double cosine_scalar(const std::vector<f32> &x, | |
| const std::vector<f32> &y) | |
| { | |
| const std::size_t n = x.size(); | |
| double dot = 0.0; | |
| double nx2 = 0.0; | |
| double ny2 = 0.0; | |
| for (std::size_t i = 0; i < n; ++i) | |
| { | |
| const double xi = x[i]; | |
| const double yi = y[i]; | |
| dot += xi * yi; | |
| nx2 += xi * xi; | |
| ny2 += yi * yi; | |
| } | |
| const double denom = std::sqrt(nx2) * std::sqrt(ny2); | |
| if (denom == 0.0) | |
| return 0.0; | |
| return dot / denom; | |
| } | |
| static double cosine_avx(const std::vector<f32> &x, | |
| const std::vector<f32> &y) | |
| { | |
| const std::size_t n = x.size(); | |
| const float *px = x.data(); | |
| const float *py = y.data(); | |
| __m256 vdot = _mm256_setzero_ps(); | |
| __m256 vnx2 = _mm256_setzero_ps(); | |
| __m256 vny2 = _mm256_setzero_ps(); | |
| std::size_t i = 0; | |
| alignas(32) float tmp_dot[8]; | |
| alignas(32) float tmp_nx2[8]; | |
| alignas(32) float tmp_ny2[8]; | |
| for (; i + 8 <= n; i += 8) | |
| { | |
| __m256 vx = _mm256_loadu_ps(px + i); | |
| __m256 vy = _mm256_loadu_ps(py + i); | |
| __m256 vx2 = _mm256_mul_ps(vx, vx); | |
| __m256 vy2 = _mm256_mul_ps(vy, vy); | |
| __m256 prod = _mm256_mul_ps(vx, vy); | |
| vdot = _mm256_add_ps(vdot, prod); | |
| vnx2 = _mm256_add_ps(vnx2, vx2); | |
| vny2 = _mm256_add_ps(vny2, vy2); | |
| } | |
| _mm256_storeu_ps(tmp_dot, vdot); | |
| _mm256_storeu_ps(tmp_nx2, vnx2); | |
| _mm256_storeu_ps(tmp_ny2, vny2); | |
| double dot = 0.0; | |
| double nx2 = 0.0; | |
| double ny2 = 0.0; | |
| for (int j = 0; j < 8; ++j) | |
| { | |
| dot += static_cast<double>(tmp_dot[j]); | |
| nx2 += static_cast<double>(tmp_nx2[j]); | |
| ny2 += static_cast<double>(tmp_ny2[j]); | |
| } | |
| for (; i < n; ++i) | |
| { | |
| const double xi = px[i]; | |
| const double yi = py[i]; | |
| dot += xi * yi; | |
| nx2 += xi * xi; | |
| ny2 += yi * yi; | |
| } | |
| const double denom = std::sqrt(nx2) * std::sqrt(ny2); | |
| if (denom == 0.0) | |
| return 0.0; | |
| return dot / denom; | |
| } | |
| // ----------------------------------------------------------------------------- | |
| // Workload 2: 2D 5-point blur on 1080p image | |
| // ----------------------------------------------------------------------------- | |
| static double blur5_scalar(const std::vector<f32> &src, | |
| std::vector<f32> &dst, | |
| int width, | |
| int height) | |
| { | |
| const std::size_t w = static_cast<std::size_t>(width); | |
| const std::size_t h = static_cast<std::size_t>(height); | |
| // Copy borders | |
| for (std::size_t x = 0; x < w; ++x) | |
| { | |
| dst[x] = src[x]; | |
| dst[(h - 1) * w + x] = src[(h - 1) * w + x]; | |
| } | |
| for (std::size_t y = 1; y + 1 < h; ++y) | |
| { | |
| dst[y * w] = src[y * w]; | |
| dst[y * w + (w - 1)] = src[y * w + (w - 1)]; | |
| } | |
| // Interior 5-point stencil | |
| const float scale = 0.2f; | |
| for (std::size_t y = 1; y + 1 < h; ++y) | |
| { | |
| for (std::size_t x = 1; x + 1 < w; ++x) | |
| { | |
| const std::size_t idx = y * w + x; | |
| const float c = src[idx]; | |
| const float up = src[idx - w]; | |
| const float dn = src[idx + w]; | |
| const float lf = src[idx - 1]; | |
| const float rt = src[idx + 1]; | |
| dst[idx] = scale * (c + up + dn + lf + rt); | |
| } | |
| } | |
| return checksum_real(dst); | |
| } | |
| static double blur5_avx(const std::vector<f32> &src, | |
| std::vector<f32> &dst, | |
| int width, | |
| int height) | |
| { | |
| const std::size_t w = static_cast<std::size_t>(width); | |
| const std::size_t h = static_cast<std::size_t>(height); | |
| const float *ps = src.data(); | |
| float *pd = dst.data(); | |
| // Copy borders (scalar) | |
| for (std::size_t x = 0; x < w; ++x) | |
| { | |
| pd[x] = ps[x]; | |
| pd[(h - 1) * w + x] = ps[(h - 1) * w + x]; | |
| } | |
| for (std::size_t y = 1; y + 1 < h; ++y) | |
| { | |
| pd[y * w] = ps[y * w]; | |
| pd[y * w + (w - 1)] = ps[y * w + (w - 1)]; | |
| } | |
| const __m256 vscale = _mm256_set1_ps(0.2f); | |
| for (std::size_t y = 1; y + 1 < h; ++y) | |
| { | |
| std::size_t x = 1; | |
| for (; x + 7 < w - 1; x += 8) | |
| { | |
| const std::size_t idx = y * w + x; | |
| __m256 c = _mm256_loadu_ps(ps + idx); | |
| __m256 up = _mm256_loadu_ps(ps + idx - w); | |
| __m256 dn = _mm256_loadu_ps(ps + idx + w); | |
| __m256 lf = _mm256_loadu_ps(ps + idx - 1); | |
| __m256 rt = _mm256_loadu_ps(ps + idx + 1); | |
| __m256 sum1 = _mm256_add_ps(c, up); | |
| __m256 sum2 = _mm256_add_ps(dn, lf); | |
| __m256 sum = _mm256_add_ps(_mm256_add_ps(sum1, sum2), rt); | |
| __m256 out = _mm256_mul_ps(sum, vscale); | |
| _mm256_storeu_ps(pd + idx, out); | |
| } | |
| // Tail | |
| for (; x + 1 < w; ++x) | |
| { | |
| const std::size_t idx = y * w + x; | |
| const float c = ps[idx]; | |
| const float up = ps[idx - w]; | |
| const float dn = ps[idx + w]; | |
| const float lf = ps[idx - 1]; | |
| const float rt = ps[idx + 1]; | |
| pd[idx] = 0.2f * (c + up + dn + lf + rt); | |
| } | |
| } | |
| return checksum_real(dst); | |
| } | |
| // ----------------------------------------------------------------------------- | |
| // Workload 3: Complex multiply + FIR convolution | |
| // ----------------------------------------------------------------------------- | |
| static double complex_mul_scalar(const ComplexSoA &a, | |
| const ComplexSoA &b, | |
| ComplexSoA &out) | |
| { | |
| const std::size_t n = a.size(); | |
| double sum = 0.0; | |
| for (std::size_t i = 0; i < n; ++i) | |
| { | |
| const float ar = a.re[i]; | |
| const float ai = a.im[i]; | |
| const float br = b.re[i]; | |
| const float bi = b.im[i]; | |
| const float cr = ar * br - ai * bi; | |
| const float ci = ar * bi + ai * br; | |
| out.re[i] = cr; | |
| out.im[i] = ci; | |
| sum += static_cast<double>(cr) + static_cast<double>(ci); | |
| } | |
| return sum; | |
| } | |
| static double complex_mul_avx(const ComplexSoA &a, | |
| const ComplexSoA &b, | |
| ComplexSoA &out) | |
| { | |
| const std::size_t n = a.size(); | |
| const float *ar = a.re.data(); | |
| const float *ai = a.im.data(); | |
| const float *br = b.re.data(); | |
| const float *bi = b.im.data(); | |
| float *or_ = out.re.data(); | |
| float *oi = out.im.data(); | |
| __m256 acc_re = _mm256_setzero_ps(); | |
| __m256 acc_im = _mm256_setzero_ps(); | |
| std::size_t i = 0; | |
| alignas(32) float tmp_re[8]; | |
| alignas(32) float tmp_im[8]; | |
| for (; i + 8 <= n; i += 8) | |
| { | |
| __m256 ar_v = _mm256_loadu_ps(ar + i); | |
| __m256 ai_v = _mm256_loadu_ps(ai + i); | |
| __m256 br_v = _mm256_loadu_ps(br + i); | |
| __m256 bi_v = _mm256_loadu_ps(bi + i); | |
| __m256 arbr = _mm256_mul_ps(ar_v, br_v); | |
| __m256 aibi = _mm256_mul_ps(ai_v, bi_v); | |
| __m256 arbi = _mm256_mul_ps(ar_v, bi_v); | |
| __m256 aibr = _mm256_mul_ps(ai_v, br_v); | |
| __m256 cr = _mm256_sub_ps(arbr, aibi); | |
| __m256 ci = _mm256_add_ps(arbi, aibr); | |
| _mm256_storeu_ps(or_ + i, cr); | |
| _mm256_storeu_ps(oi + i, ci); | |
| acc_re = _mm256_add_ps(acc_re, cr); | |
| acc_im = _mm256_add_ps(acc_im, ci); | |
| } | |
| _mm256_storeu_ps(tmp_re, acc_re); | |
| _mm256_storeu_ps(tmp_im, acc_im); | |
| double sum = 0.0; | |
| for (int j = 0; j < 8; ++j) | |
| sum += static_cast<double>(tmp_re[j]) + static_cast<double>(tmp_im[j]); | |
| for (; i < n; ++i) | |
| { | |
| const float ar_ = ar[i]; | |
| const float ai_ = ai[i]; | |
| const float br_ = br[i]; | |
| const float bi_ = bi[i]; | |
| const float cr = ar_ * br_ - ai_ * bi_; | |
| const float ci = ar_ * bi_ + ai_ * br_; | |
| or_[i] = cr; | |
| oi[i] = ci; | |
| sum += static_cast<double>(cr) + static_cast<double>(ci); | |
| } | |
| return sum; | |
| } | |
| static double complex_fir_scalar(const ComplexSoA &in, | |
| const std::vector<f32> &h, | |
| ComplexSoA &out) | |
| { | |
| const std::size_t n = in.size(); | |
| const int taps = static_cast<int>(h.size()); | |
| double sum = 0.0; | |
| for (std::size_t i = 0; i < n; ++i) | |
| { | |
| float acc_re = 0.0f; | |
| float acc_im = 0.0f; | |
| const std::size_t limit = (i + 1 < static_cast<std::size_t>(taps)) | |
| ? (i + 1) | |
| : static_cast<std::size_t>(taps); | |
| for (std::size_t k = 0; k < limit; ++k) | |
| { | |
| const float coeff = h[static_cast<int>(k)]; | |
| const std::size_t idx = i - k; | |
| acc_re += coeff * in.re[idx]; | |
| acc_im += coeff * in.im[idx]; | |
| } | |
| out.re[i] = acc_re; | |
| out.im[i] = acc_im; | |
| sum += static_cast<double>(acc_re) + static_cast<double>(acc_im); | |
| } | |
| return sum; | |
| } | |
| static double complex_fir_avx(const ComplexSoA &in, | |
| const std::vector<f32> &h, | |
| ComplexSoA &out) | |
| { | |
| const std::size_t n = in.size(); | |
| const int taps = static_cast<int>(h.size()); | |
| const float *re = in.re.data(); | |
| const float *im = in.im.data(); | |
| float *ore = out.re.data(); | |
| float *oim = out.im.data(); | |
| double sum = 0.0; | |
| // Head region: scalar (insufficient history for a full vector block) | |
| const std::size_t start = static_cast<std::size_t>(taps - 1); | |
| for (std::size_t i = 0; i < std::min(start, n); ++i) | |
| { | |
| float acc_re = 0.0f; | |
| float acc_im = 0.0f; | |
| const std::size_t limit = (i + 1 < static_cast<std::size_t>(taps)) | |
| ? (i + 1) | |
| : static_cast<std::size_t>(taps); | |
| for (std::size_t k = 0; k < limit; ++k) | |
| { | |
| const float coeff = h[static_cast<int>(k)]; | |
| const std::size_t idx = i - k; | |
| acc_re += coeff * re[idx]; | |
| acc_im += coeff * im[idx]; | |
| } | |
| ore[i] = acc_re; | |
| oim[i] = acc_im; | |
| sum += static_cast<double>(acc_re) + static_cast<double>(acc_im); | |
| } | |
| // Vectorised interior | |
| std::size_t i = start; | |
| alignas(32) float tmp_re[8]; | |
| alignas(32) float tmp_im[8]; | |
| for (; i + 8 <= n; i += 8) | |
| { | |
| __m256 acc_re = _mm256_setzero_ps(); | |
| __m256 acc_im = _mm256_setzero_ps(); | |
| for (int k = 0; k < taps; ++k) | |
| { | |
| const float coeff = h[k]; | |
| const float *pre = re + i - k; | |
| const float *pim = im + i - k; | |
| __m256 vcoeff = _mm256_set1_ps(coeff); | |
| __m256 vre = _mm256_loadu_ps(pre); | |
| __m256 vim = _mm256_loadu_ps(pim); | |
| acc_re = _mm256_add_ps(acc_re, _mm256_mul_ps(vcoeff, vre)); | |
| acc_im = _mm256_add_ps(acc_im, _mm256_mul_ps(vcoeff, vim)); | |
| } | |
| _mm256_storeu_ps(ore + i, acc_re); | |
| _mm256_storeu_ps(oim + i, acc_im); | |
| _mm256_storeu_ps(tmp_re, acc_re); | |
| _mm256_storeu_ps(tmp_im, acc_im); | |
| for (int j = 0; j < 8; ++j) | |
| sum += static_cast<double>(tmp_re[j]) + static_cast<double>(tmp_im[j]); | |
| } | |
| // Tail region: scalar | |
| for (; i < n; ++i) | |
| { | |
| float acc_re = 0.0f; | |
| float acc_im = 0.0f; | |
| for (int k = 0; k < taps; ++k) | |
| { | |
| if (i < static_cast<std::size_t>(k)) | |
| break; | |
| const std::size_t idx = i - static_cast<std::size_t>(k); | |
| const float coeff = h[k]; | |
| acc_re += coeff * re[idx]; | |
| acc_im += coeff * im[idx]; | |
| } | |
| ore[i] = acc_re; | |
| oim[i] = acc_im; | |
| sum += static_cast<double>(acc_re) + static_cast<double>(acc_im); | |
| } | |
| return sum; | |
| } | |
| // ----------------------------------------------------------------------------- | |
| // Workload 4: Soft clip / limiter on FIR output | |
| // ----------------------------------------------------------------------------- | |
// Soft-clips a single sample against a symmetric threshold.
//
//   x <= -threshold  -> -threshold
//   x >=  threshold  ->  threshold
//   otherwise        ->  x - x^3 / threshold^2
//
// The lower bound is tested first, so it wins if both comparisons hold.
// Note: the cubic branch tends to 0 as |x| approaches the threshold, so the
// output jumps to +/-threshold once the bound is reached.
static inline float soft_clip_scalar_sample(float x, float threshold)
{
    const bool at_or_below_floor = (x <= -threshold);
    const bool at_or_above_ceiling = (x >= threshold);
    if (at_or_below_floor || at_or_above_ceiling)
        return at_or_below_floor ? -threshold : threshold;

    // Inside the linear region: subtract the normalised cube.
    const float cube = (x * x) * x;
    const float threshold_sq = threshold * threshold;
    return x - cube / threshold_sq;
}
| static double soft_clip_scalar(const std::vector<f32> &in, | |
| std::vector<f32> &out, | |
| float threshold) | |
| { | |
| const std::size_t n = in.size(); | |
| double sum = 0.0; | |
| for (std::size_t i = 0; i < n; ++i) | |
| { | |
| const float y = soft_clip_scalar_sample(in[i], threshold); | |
| out[i] = y; | |
| sum += static_cast<double>(y); | |
| } | |
| return sum; | |
| } | |
| static double soft_clip_avx(const std::vector<f32> &in, | |
| std::vector<f32> &out, | |
| float threshold) | |
| { | |
| const std::size_t n = in.size(); | |
| const float *pin = in.data(); | |
| float *pout = out.data(); | |
| const __m256 vth = _mm256_set1_ps(threshold); | |
| const __m256 vmin = _mm256_sub_ps(_mm256_setzero_ps(), vth); // -threshold | |
| const __m256 vt2 = _mm256_set1_ps(threshold * threshold); | |
| __m256 vacc = _mm256_setzero_ps(); | |
| std::size_t i = 0; | |
| alignas(32) float tmp[8]; | |
| for (; i + 8 <= n; i += 8) | |
| { | |
| __m256 x = _mm256_loadu_ps(pin + i); | |
| __m256 x1 = _mm256_min_ps(_mm256_max_ps(x, vmin), vth); // clamped | |
| __m256 x2 = _mm256_mul_ps(x1, x1); | |
| __m256 x3 = _mm256_mul_ps(x2, x1); | |
| __m256 frac = _mm256_div_ps(x3, vt2); | |
| __m256 y = _mm256_sub_ps(x1, frac); | |
| _mm256_storeu_ps(pout + i, y); | |
| vacc = _mm256_add_ps(vacc, y); | |
| } | |
| _mm256_storeu_ps(tmp, vacc); | |
| double sum = 0.0; | |
| for (int j = 0; j < 8; ++j) | |
| sum += static_cast<double>(tmp[j]); | |
| for (; i < n; ++i) | |
| { | |
| const float y = soft_clip_scalar_sample(pin[i], threshold); | |
| pout[i] = y; | |
| sum += static_cast<double>(y); | |
| } | |
| return sum; | |
| } | |
| // ----------------------------------------------------------------------------- | |
| // main() | |
| // ----------------------------------------------------------------------------- | |
| int main() | |
| { | |
| // ------------------------------------------------------------------------- | |
| // Workload 1: SAXPY + cosine similarity | |
| // ------------------------------------------------------------------------- | |
| std::vector<f32> x(N_SAXPY); | |
| std::vector<f32> y_scalar(N_SAXPY); | |
| std::vector<f32> y_avx(N_SAXPY); | |
| init_saxpy_vectors(x, y_scalar); | |
| y_avx = y_scalar; | |
| const float alpha = 0.5f; | |
| double t_saxpy_scalar = 0.0; | |
| double t_saxpy_avx = 0.0; | |
| double t_cos_scalar = 0.0; | |
| double t_cos_avx = 0.0; | |
| double saxpy_scalar_sum = 0.0; | |
| double saxpy_avx_sum = 0.0; | |
| double cos_scalar_val = 0.0; | |
| double cos_avx_val = 0.0; | |
| std::cout << "=== Workload 1: SAXPY + cosine similarity ===\n"; | |
| { | |
| ScopedTimer timer("saxpy_scalar", t_saxpy_scalar); | |
| saxpy_scalar_sum = saxpy_scalar(y_scalar, x, alpha); | |
| } | |
| { | |
| ScopedTimer timer("saxpy_avx", t_saxpy_avx); | |
| saxpy_avx_sum = saxpy_avx(y_avx, x, alpha); | |
| } | |
| { | |
| ScopedTimer timer("cosine_scalar", t_cos_scalar); | |
| cos_scalar_val = cosine_scalar(x, y_scalar); | |
| } | |
| { | |
| ScopedTimer timer("cosine_avx", t_cos_avx); | |
| cos_avx_val = cosine_avx(x, y_avx); | |
| } | |
| std::cout << "SAXPY scalar: checksum=" << saxpy_scalar_sum | |
| << " time=" << t_saxpy_scalar << " ms\n"; | |
| std::cout << "SAXPY AVX : checksum=" << saxpy_avx_sum | |
| << " time=" << t_saxpy_avx << " ms\n"; | |
| std::cout << "Cosine scalar: value=" << cos_scalar_val | |
| << " time=" << t_cos_scalar << " ms\n"; | |
| std::cout << "Cosine AVX : value=" << cos_avx_val | |
| << " time=" << t_cos_avx << " ms\n"; | |
| std::cout << "--------------------------------------------------------\n\n"; | |
| // ------------------------------------------------------------------------- | |
| // Workload 2: 2D 5-point blur on 1080p image | |
| // ------------------------------------------------------------------------- | |
| std::vector<f32> img_src(N_PIXELS); | |
| std::vector<f32> img_blur_scalar(N_PIXELS); | |
| std::vector<f32> img_blur_avx(N_PIXELS); | |
| init_image(img_src, WIDTH, HEIGHT); | |
| double t_blur_scalar = 0.0; | |
| double t_blur_avx = 0.0; | |
| double blur_scalar_sum = 0.0; | |
| double blur_avx_sum = 0.0; | |
| std::cout << "=== Workload 2: 2D 5-point blur on 1080p image ===\n"; | |
| { | |
| ScopedTimer timer("blur_scalar", t_blur_scalar); | |
| blur_scalar_sum = blur5_scalar(img_src, img_blur_scalar, WIDTH, HEIGHT); | |
| } | |
| { | |
| ScopedTimer timer("blur_avx", t_blur_avx); | |
| blur_avx_sum = blur5_avx(img_src, img_blur_avx, WIDTH, HEIGHT); | |
| } | |
| std::cout << "Blur scalar: checksum=" << blur_scalar_sum | |
| << " time=" << t_blur_scalar << " ms\n"; | |
| std::cout << "Blur AVX : checksum=" << blur_avx_sum | |
| << " time=" << t_blur_avx << " ms\n"; | |
| const double blur_delta = blur_avx_sum - blur_scalar_sum; | |
| std::cout << "Checksum delta (AVX - scalar): " << blur_delta << "\n"; | |
| std::cout << "--------------------------------------------------------\n\n"; | |
| // ------------------------------------------------------------------------- | |
| // Workload 3: Complex multiply + FIR convolution | |
| // ------------------------------------------------------------------------- | |
| ComplexSoA a(N_COMPLEX); | |
| ComplexSoA b(N_COMPLEX); | |
| ComplexSoA cmul_scalar(N_COMPLEX); | |
| ComplexSoA cmul_avx(N_COMPLEX); | |
| ComplexSoA fir_scalar(N_COMPLEX); | |
| ComplexSoA fir_avx(N_COMPLEX); | |
| fill_complex(a, 0x1234abcdU); | |
| fill_complex(b, 0x9876fedcU); | |
| const std::vector<f32> fir_kernel = make_fir_kernel(); | |
| double t_cmul_scalar = 0.0; | |
| double t_cmul_avx = 0.0; | |
| double t_fir_scalar = 0.0; | |
| double t_fir_avx = 0.0; | |
| double cmul_scalar_sum = 0.0; | |
| double cmul_avx_sum = 0.0; | |
| double fir_scalar_sum = 0.0; | |
| double fir_avx_sum = 0.0; | |
| std::cout << "=== Workload 3: Complex multiply + FIR convolution ===\n"; | |
| { | |
| ScopedTimer timer("complex_mul_scalar", t_cmul_scalar); | |
| cmul_scalar_sum = complex_mul_scalar(a, b, cmul_scalar); | |
| } | |
| { | |
| ScopedTimer timer("complex_mul_avx", t_cmul_avx); | |
| cmul_avx_sum = complex_mul_avx(a, b, cmul_avx); | |
| } | |
| { | |
| ScopedTimer timer("complex_fir_scalar", t_fir_scalar); | |
| fir_scalar_sum = complex_fir_scalar(cmul_scalar, fir_kernel, fir_scalar); | |
| } | |
| { | |
| ScopedTimer timer("complex_fir_avx", t_fir_avx); | |
| fir_avx_sum = complex_fir_avx(cmul_avx, fir_kernel, fir_avx); | |
| } | |
| std::cout << "Complex mul scalar: checksum=" << cmul_scalar_sum | |
| << " time=" << t_cmul_scalar << " ms\n"; | |
| std::cout << "Complex mul AVX : checksum=" << cmul_avx_sum | |
| << " time=" << t_cmul_avx << " ms\n"; | |
| std::cout << "FIR scalar : checksum=" << fir_scalar_sum | |
| << " time=" << t_fir_scalar << " ms\n"; | |
| std::cout << "FIR AVX : checksum=" << fir_avx_sum | |
| << " time=" << t_fir_avx << " ms\n"; | |
| const double cmul_delta = cmul_avx_sum - cmul_scalar_sum; | |
| const double fir_delta = fir_avx_sum - fir_scalar_sum; | |
| std::cout << "Delta cmul checksum (AVX - scalar): " << cmul_delta << "\n"; | |
| std::cout << "Delta FIR checksum (AVX - scalar): " << fir_delta << "\n"; | |
| std::cout << "--------------------------------------------------------\n\n"; | |
| // ------------------------------------------------------------------------- | |
| // Workload 4: Soft clip / limiter on FIR output | |
| // ------------------------------------------------------------------------- | |
| std::vector<f32> clip_in_scalar(N_COMPLEX); | |
| std::vector<f32> clip_in_avx(N_COMPLEX); | |
| std::vector<f32> clip_out_scalar(N_COMPLEX); | |
| std::vector<f32> clip_out_avx(N_COMPLEX); | |
| // Use magnitude of FIR output as input to limiter | |
| for (std::size_t i = 0; i < N_COMPLEX; ++i) | |
| { | |
| const float rs = fir_scalar.re[i]; | |
| const float is = fir_scalar.im[i]; | |
| const float ra = fir_avx.re[i]; | |
| const float ia = fir_avx.im[i]; | |
| clip_in_scalar[i] = std::sqrt(rs * rs + is * is); | |
| clip_in_avx[i] = std::sqrt(ra * ra + ia * ia); | |
| } | |
| const float clip_threshold = 1.0f; | |
| double t_clip_scalar = 0.0; | |
| double t_clip_avx = 0.0; | |
| double clip_scalar_sum = 0.0; | |
| double clip_avx_sum = 0.0; | |
| std::cout << "=== Workload 4: Soft clip / limiter on FIR output ===\n"; | |
| { | |
| ScopedTimer timer("soft_clip_scalar", t_clip_scalar); | |
| clip_scalar_sum = soft_clip_scalar(clip_in_scalar, clip_out_scalar, clip_threshold); | |
| } | |
| { | |
| ScopedTimer timer("soft_clip_avx", t_clip_avx); | |
| clip_avx_sum = soft_clip_avx(clip_in_avx, clip_out_avx, clip_threshold); | |
| } | |
| std::cout << "Soft clip scalar: checksum=" << clip_scalar_sum | |
| << " time=" << t_clip_scalar << " ms\n"; | |
| std::cout << "Soft clip AVX : checksum=" << clip_avx_sum | |
| << " time=" << t_clip_avx << " ms\n"; | |
| const double clip_delta = clip_avx_sum - clip_scalar_sum; | |
| std::cout << "Delta clip checksum (AVX - scalar): " << clip_delta << "\n"; | |
| std::cout << "\nDone.\n"; | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment