Skip to content

Instantly share code, notes, and snippets.

@19h
Last active November 14, 2025 16:51
Show Gist options
  • Select an option

  • Save 19h/f4d19f6154bd966ae494d4032ccf1a32 to your computer and use it in GitHub Desktop.

Select an option

Save 19h/f4d19f6154bd966ae494d4032ccf1a32 to your computer and use it in GitHub Desktop.
ida vs angr
// avx_demo.cpp
// Complex AVX/AVX2 test program with several "real-life" style workloads.
#include <immintrin.h>
#include <chrono>
#include <cstdint>
#include <cmath>
#include <iostream>
#include <vector>
#include <cstring>
#include <string>
#if !defined(__AVX__)
# error "This demo requires AVX support (compile with -mavx)."
#endif
#if !defined(__AVX2__)
# warning "AVX2 not enabled; some integer AVX2 ops are not used in this demo."
#endif
#if !defined(__FMA__)
# warning "FMA not enabled; FMA intrinsics will be emulated by the compiler."
#endif
// Scope-based stopwatch: remembers the time of construction and, when destroyed,
// stores the elapsed time in milliseconds into the double passed to the constructor.
// The elapsed time is truncated to whole microseconds before conversion.
struct ScopedTimer
{
    using clock = std::chrono::high_resolution_clock;

    std::string label;       // copy of the caller-supplied label (not read by the timer itself)
    clock::time_point start; // taken in the constructor
    double &out_ms;          // receives the result; must outlive this object

    ScopedTimer(const std::string &lbl, double &ms_ref)
        : label(lbl),
          start(clock::now()),
          out_ms(ms_ref)
    {
    }

    ~ScopedTimer()
    {
        using std::chrono::duration_cast;
        using std::chrono::microseconds;

        const clock::time_point stop = clock::now();
        const microseconds elapsed = duration_cast<microseconds>(stop - start);
        out_ms = static_cast<double>(elapsed.count()) / 1000.0;
    }
};
// Deterministic pseudo-random source used by the fill helpers (avoids <random>).
// Advances `state` by one step of a 32-bit linear congruential generator
// (state * 1664525 + 1013904223, wrapping modulo 2^32) and returns the new state.
static inline uint32_t lcg_next(uint32_t &state)
{
    constexpr uint32_t kMultiplier = 1664525u;
    constexpr uint32_t kIncrement = 1013904223u;

    const uint32_t advanced = state * kMultiplier + kIncrement;
    state = advanced;
    return advanced;
}
// Overwrites every element of `v` with a pseudo-random float derived from `seed`.
// Each element consumes one lcg_next() draw; the 32 bits are reinterpreted as a
// signed integer and divided by 2^31, which lands the value in [-1.0, 1.0].
// The same seed always produces the same contents.
static void fill_random(std::vector<float> &v, uint32_t seed = 0x12345678u)
{
    uint32_t state = seed;
    const size_t count = v.size();
    for (size_t k = 0; k < count; ++k)
    {
        const int32_t signed_bits = static_cast<int32_t>(lcg_next(state));
        v[k] = signed_bits / 2147483648.0f;
    }
}
// Horizontal sum of the eight floats in a __m256, reduced with 128-bit SSE ops.
// Lane lists in the comments are written lowest element first.
static inline float hsum256_ps(__m256 v)
{
    const __m128 lower = _mm256_castps256_ps128(v);      // v[0..3]
    const __m128 upper = _mm256_extractf128_ps(v, 1);    // v[4..7]
    const __m128 quad = _mm_add_ps(lower, upper);        // (q0,q1,q2,q3), qi = v[i] + v[i+4]
    const __m128 odd = _mm_movehdup_ps(quad);            // (q1,q1,q3,q3)
    const __m128 pairs = _mm_add_ps(quad, odd);          // (q0+q1, 2*q1, q2+q3, 2*q3)
    const __m128 high = _mm_movehl_ps(odd, pairs);       // (q2+q3, 2*q3, q3, q3)
    const __m128 total = _mm_add_ss(pairs, high);        // lane 0 = (q0+q1) + (q2+q3)
    return _mm_cvtss_f32(total);
}
// ============================================================================
// Workload 1: SAXPY and cosine similarity (vectorized finance/signal-processing)
// ============================================================================
// Scalar reference SAXPY: updates y[i] = a * x[i] + y[i] for i in [0, n) in place,
// then returns the sum of the updated y (accumulated in float, front to back) as a
// checksum.
static float saxpy_scalar(float a, const float *x, float *y, size_t n)
{
    const float *const y_end = y + n;

    const float *src = x;
    for (float *dst = y; dst != y_end; ++dst, ++src)
        *dst = a * (*src) + *dst;

    float checksum = 0.0f;
    for (const float *p = y; p != y_end; ++p)
        checksum += *p;
    return checksum;
}
// AVX/FMA SAXPY: updates y[i] = a * x[i] + y[i] in place, eight floats per iteration
// with unaligned loads/stores, and finishes the last n % 8 elements in scalar code.
// Returns the float sum of the updated y as a checksum.
static float saxpy_avx(float a, const float *x, float *y, size_t n)
{
    constexpr size_t kLanes = 8;
    const size_t vec_len = n - (n % kLanes); // largest multiple of 8 that fits in n
    const __m256 va = _mm256_set1_ps(a);

    for (size_t off = 0; off < vec_len; off += kLanes)
    {
        const __m256 xs = _mm256_loadu_ps(x + off);
        const __m256 ys = _mm256_loadu_ps(y + off);
        _mm256_storeu_ps(y + off, _mm256_fmadd_ps(va, xs, ys)); // va*xs + ys
    }

    // Scalar tail for the elements that do not fill a whole vector.
    for (size_t k = vec_len; k < n; ++k)
        y[k] = a * x[k] + y[k];

    float total = 0.0f;
    for (size_t k = 0; k < n; ++k)
        total += y[k];
    return total;
}
// Scalar reference cosine similarity of x[0..n) and y[0..n).
// The dot product and both squared norms are accumulated in double; the result is
// dot / sqrt(|x|^2 * |y|^2) narrowed to float, or 0 when that denominator is zero.
static float cosine_similarity_scalar(const float *x, const float *y, size_t n)
{
    double xy = 0.0;
    double xx = 0.0;
    double yy = 0.0;
    for (size_t k = 0; k != n; ++k)
    {
        const double a = x[k];
        const double b = y[k];
        xy += a * b;
        xx += a * a;
        yy += b * b;
    }

    const double norm = std::sqrt(xx * yy);
    return (norm == 0.0) ? 0.0f : static_cast<float>(xy / norm);
}
// AVX cosine similarity of x[0..n) and y[0..n).
// Unlike the scalar reference, the dot product and squared norms are accumulated
// in float (eight lanes, then reduced with hsum256_ps, then a scalar tail); only the
// final sqrt and division are done in double. Returns 0 when the denominator is zero.
static float cosine_similarity_avx(const float *x, const float *y, size_t n)
{
    constexpr size_t kLanes = 8;

    __m256 acc_xy = _mm256_setzero_ps();
    __m256 acc_xx = _mm256_setzero_ps();
    __m256 acc_yy = _mm256_setzero_ps();

    size_t pos = 0;
    while (pos + kLanes <= n)
    {
        const __m256 xs = _mm256_loadu_ps(x + pos);
        const __m256 ys = _mm256_loadu_ps(y + pos);
        acc_xy = _mm256_add_ps(acc_xy, _mm256_mul_ps(xs, ys));
        acc_xx = _mm256_fmadd_ps(xs, xs, acc_xx);
        acc_yy = _mm256_fmadd_ps(ys, ys, acc_yy);
        pos += kLanes;
    }

    float xy = hsum256_ps(acc_xy);
    float xx = hsum256_ps(acc_xx);
    float yy = hsum256_ps(acc_yy);

    // Remaining (n % 8) elements.
    while (pos < n)
    {
        const float a = x[pos];
        const float b = y[pos];
        xy += a * b;
        xx += a * a;
        yy += b * b;
        ++pos;
    }

    const double norm = std::sqrt(static_cast<double>(xx) * static_cast<double>(yy));
    if (norm == 0.0)
        return 0.0f;
    return static_cast<float>(xy / norm);
}
// ============================================================================
// Workload 2: 2D image blur on float "image" (e.g., 1080p grayscale)
// new[y,x] = 0.2 * (center + left + right + up + down)
// ============================================================================
// Row-major float image: `data` holds w * h values, row y starting at index y * w.
// The constructor value-initializes every pixel (0.0f).
struct ImageF
{
    int w;                   // width in pixels
    int h;                   // height in pixels
    std::vector<float> data; // w * h pixels, row-major

    ImageF(int width, int height)
        : w(width),
          h(height),
          data(static_cast<size_t>(width) * height)
    {
    }

    // Pointer to the first pixel of row y (no bounds checking).
    float *row(int y)
    {
        const size_t offset = static_cast<size_t>(y) * w;
        return data.data() + offset;
    }

    // Read-only counterpart of row().
    const float *row(int y) const
    {
        const size_t offset = static_cast<size_t>(y) * w;
        return data.data() + offset;
    }
};
// Scalar reference 5-point blur.
// Border pixels (first/last row, first/last column) are copied from src unchanged;
// every interior pixel becomes 0.2 * (center + left + right + up + down).
// dst is indexed with src's dimensions, so it must be at least as large as src
// (not checked here). Returns the float sum of all interior output pixels.
static float blur5_scalar(const ImageF &src, ImageF &dst)
{
    const int width = src.w;
    const int height = src.h;
    const int last_col = width - 1;
    const int last_row = height - 1;
    const size_t row_bytes = sizeof(float) * width;

    // Top and bottom rows are passed through verbatim.
    std::memcpy(dst.row(0), src.row(0), row_bytes);
    std::memcpy(dst.row(last_row), src.row(last_row), row_bytes);

    float checksum = 0.0f;
    for (int y = 1; y < last_row; ++y)
    {
        const float *up = src.row(y - 1);
        const float *mid = src.row(y);
        const float *down = src.row(y + 1);
        float *out = dst.row(y);

        // Left and right border pixels of this row are passed through verbatim.
        out[0] = mid[0];
        out[last_col] = mid[last_col];

        for (int x = 1; x < last_col; ++x)
        {
            float blurred = mid[x] + mid[x - 1] + mid[x + 1] + up[x] + down[x];
            blurred *= 0.2f;
            out[x] = blurred;
            checksum += blurred;
        }
    }
    return checksum;
}
// AVX 5-point blur: same contract as blur5_scalar.
// Border pixels (first/last row, first/last column) are copied from src unchanged;
// every interior pixel becomes 0.2 * (center + left + right + up + down).
// Interior pixels are processed eight at a time with unaligned loads; the rest of
// each row is finished in scalar code. dst is indexed with src's dimensions, so it
// must be at least as large as src (not checked here).
// Returns the float sum of all interior output pixels.
static float blur5_avx(const ImageF &src, ImageF &dst)
{
    const int w = src.w;
    const int h = src.h;

    // Copy the border as-is.
    std::memcpy(dst.row(0), src.row(0), sizeof(float) * w);
    std::memcpy(dst.row(h - 1), src.row(h - 1), sizeof(float) * w);
    for (int y = 1; y < h - 1; ++y)
    {
        dst.row(y)[0] = src.row(y)[0];
        dst.row(y)[w - 1] = src.row(y)[w - 1];
    }

    const __m256 scale = _mm256_set1_ps(0.2f);
    float checksum = 0.0f;
    for (int y = 1; y < h - 1; ++y)
    {
        const float *row_c = src.row(y);
        const float *row_u = src.row(y - 1);
        const float *row_d = src.row(y + 1);
        float *row_o = dst.row(y);

        const int max_x = w - 1; // exclusive end of the interior columns
        int x = 1;

        // A block covers columns [x, x + 8) and must stay inside the interior
        // [1, max_x). The previous bound (x <= max_x - 7) let the last block reach
        // column w - 1 when w % 8 == 1, which overwrote the copied right border,
        // read one float past the end of the row for the "right" neighbour, and
        // added that pixel to the checksum.
        for (; x + 8 <= max_x; x += 8)
        {
            const __m256 vc = _mm256_loadu_ps(row_c + x);
            const __m256 vl = _mm256_loadu_ps(row_c + x - 1);
            const __m256 vr = _mm256_loadu_ps(row_c + x + 1);
            const __m256 vu = _mm256_loadu_ps(row_u + x);
            const __m256 vd = _mm256_loadu_ps(row_d + x);

            // Same left-to-right summation order as the scalar version.
            __m256 sum = _mm256_add_ps(vc, vl);
            sum = _mm256_add_ps(sum, vr);
            sum = _mm256_add_ps(sum, vu);
            sum = _mm256_add_ps(sum, vd);
            sum = _mm256_mul_ps(sum, scale);

            _mm256_storeu_ps(row_o + x, sum);
            checksum += hsum256_ps(sum);
        }

        // Scalar tail for the remaining interior columns of this row.
        for (; x < max_x; ++x)
        {
            float val = row_c[x]
                        + row_c[x - 1]
                        + row_c[x + 1]
                        + row_u[x]
                        + row_d[x];
            val *= 0.2f;
            row_o[x] = val;
            checksum += val;
        }
    }
    return checksum;
}
// ============================================================================
// Workload 3: Complex multiply + FIR convolution with AVX/FMA
// Complex data in SoA form (real[] and imag[] arrays)
// ============================================================================
// Complex sequence stored as two parallel float arrays: re[i] and im[i] are the
// real and imaginary parts of sample i.
struct ComplexSoA
{
    std::vector<float> re; // real parts
    std::vector<float> im; // imaginary parts

    // Creates n zero-valued samples (an empty sequence by default).
    ComplexSoA(size_t n = 0)
        : re(n),
          im(n)
    {
    }

    // Resizes both arrays to n samples.
    void resize(size_t n)
    {
        re.resize(n);
        im.resize(n);
    }

    // Number of samples, taken from the real-part array.
    size_t size() const
    {
        return re.size();
    }
};
// Fills every sample of `c` with pseudo-random real and imaginary parts derived
// from `seed`. Each sample consumes two lcg_next() draws -- the first for the real
// part, the second for the imaginary part -- each reinterpreted as a signed 32-bit
// integer and divided by 2^31.
static void fill_complex(ComplexSoA &c, uint32_t seed = 0xCAFEBABEu)
{
    uint32_t state = seed;
    const size_t count = c.size();
    for (size_t k = 0; k < count; ++k)
    {
        const int32_t re_bits = static_cast<int32_t>(lcg_next(state));
        const int32_t im_bits = static_cast<int32_t>(lcg_next(state));
        c.re[k] = re_bits / 2147483648.0f;
        c.im[k] = im_bits / 2147483648.0f;
    }
}
// Scalar reference element-wise complex product: out[i] = a[i] * b[i].
// `out` is resized to a.size(); b is indexed over the same range.
// Returns a checksum: the float sum over i of 0.5 * Re(out[i]) + 0.25 * Im(out[i]).
static float complex_mul_scalar(const ComplexSoA &a, const ComplexSoA &b, ComplexSoA &out)
{
    const size_t count = a.size();
    out.resize(count);

    float checksum = 0.0f;
    for (size_t k = 0; k != count; ++k)
    {
        const float ar = a.re[k];
        const float ai = a.im[k];
        const float br = b.re[k];
        const float bi = b.im[k];

        // (ar + i*ai) * (br + i*bi)
        const float zr = ar * br - ai * bi;
        const float zi = ar * bi + ai * br;

        out.re[k] = zr;
        out.im[k] = zi;
        checksum += zr * 0.5f + zi * 0.25f;
    }
    return checksum;
}
// AVX/FMA element-wise complex product on SoA data: out[i] = a[i] * b[i].
// Eight samples are processed per iteration; the last a.size() % 8 samples are
// handled in scalar code. `out` is resized to a.size().
// Returns a checksum: the float sum of 0.5 * Re(out[i]) + 0.25 * Im(out[i]), where
// each vector block is reduced with hsum256_ps before being added.
static float complex_mul_avx(const ComplexSoA &a, const ComplexSoA &b, ComplexSoA &out)
{
    constexpr size_t kLanes = 8;
    const size_t n = a.size();
    out.resize(n);

    // Checksum weights for the real and imaginary parts.
    const __m256 w_re = _mm256_set1_ps(0.5f);
    const __m256 w_im = _mm256_set1_ps(0.25f);

    float checksum = 0.0f;
    size_t pos = 0;
    while (pos + kLanes <= n)
    {
        const __m256 a_re = _mm256_loadu_ps(a.re.data() + pos);
        const __m256 a_im = _mm256_loadu_ps(a.im.data() + pos);
        const __m256 b_re = _mm256_loadu_ps(b.re.data() + pos);
        const __m256 b_im = _mm256_loadu_ps(b.im.data() + pos);

        // Re = a_re*b_re - a_im*b_im ; Im = a_re*b_im + a_im*b_re
        const __m256 z_re = _mm256_fmsub_ps(a_re, b_re, _mm256_mul_ps(a_im, b_im));
        const __m256 z_im = _mm256_fmadd_ps(a_re, b_im, _mm256_mul_ps(a_im, b_re));

        _mm256_storeu_ps(out.re.data() + pos, z_re);
        _mm256_storeu_ps(out.im.data() + pos, z_im);

        const __m256 weighted = _mm256_add_ps(_mm256_mul_ps(z_re, w_re),
                                              _mm256_mul_ps(z_im, w_im));
        checksum += hsum256_ps(weighted);
        pos += kLanes;
    }

    // Scalar tail.
    while (pos < n)
    {
        const float ar = a.re[pos];
        const float ai = a.im[pos];
        const float br = b.re[pos];
        const float bi = b.im[pos];
        const float zr = ar * br - ai * bi;
        const float zi = ar * bi + ai * br;
        out.re[pos] = zr;
        out.im[pos] = zi;
        checksum += zr * 0.5f + zi * 0.25f;
        ++pos;
    }
    return checksum;
}
// Scalar reference causal FIR filter with real taps on complex SoA data:
//   y[k] = sum over i in [0, L) with i <= k of h[i] * x[k - i]
// (samples before x[0] are treated as absent). `y` is resized to x.size().
// Returns a checksum: the float sum of 0.75 * Re(y[k]) + 0.33 * Im(y[k]).
static float complex_fir_scalar(const ComplexSoA &x, const std::vector<float> &h, ComplexSoA &y)
{
    const size_t n = x.size();
    const size_t L = h.size();
    y.resize(n);

    float checksum = 0.0f;
    for (size_t k = 0; k < n; ++k)
    {
        // Only taps with i <= k have a sample to multiply.
        const size_t usable_taps = (k + 1 < L) ? (k + 1) : L;

        float sum_re = 0.0f;
        float sum_im = 0.0f;
        for (size_t i = 0; i < usable_taps; ++i)
        {
            const float tap = h[i];
            const size_t idx = k - i;
            sum_re += tap * x.re[idx];
            sum_im += tap * x.im[idx];
        }

        y.re[k] = sum_re;
        y.im[k] = sum_im;
        checksum += sum_re * 0.75f + sum_im * 0.33f;
    }
    return checksum;
}
// Reverses the eight float lanes of a __m256: lane j of the result is lane 7 - j of v.
// Uses AVX only: reverse within each 128-bit half, then swap the two halves.
static inline __m256 fir_reverse_lanes_ps(__m256 v)
{
    const __m256 in_half = _mm256_permute_ps(v, _MM_SHUFFLE(0, 1, 2, 3));
    return _mm256_permute2f128_ps(in_half, in_half, 0x01);
}

// AVX/FMA causal FIR filter with real taps on complex SoA data, matching
// complex_fir_scalar up to floating-point summation order:
//   y[k] = sum over i in [0, L) with i <= k of h[i] * x[k - i]
// Taps are consumed eight at a time while the whole chunk has samples available;
// the remaining taps go through the scalar loop. `y` is resized to x.size().
// Returns a checksum: the float sum of 0.75 * Re(y[k]) + 0.33 * Im(y[k]).
static float complex_fir_avx(const ComplexSoA &x, const std::vector<float> &h, ComplexSoA &y)
{
    const size_t n = x.size();
    const size_t L = h.size();
    y.resize(n);
    float checksum = 0.0f;
    for (size_t k = 0; k < n; ++k)
    {
        __m256 acc_re_vec = _mm256_setzero_ps();
        __m256 acc_im_vec = _mm256_setzero_ps();
        size_t i = 0;

        // A chunk uses taps h[i..i+7] and samples x[k-i-7 .. k-i], so it needs
        // i + 7 <= k (otherwise k - i - 7 would wrap around).
        for (; i + 8 <= L && i + 8 <= k + 1; i += 8)
        {
            // Tap h[i+j] pairs with sample x[k-i-j], i.e. taps ascend while samples
            // descend. The samples are loaded in ascending address order starting at
            // base = k-i-7, so the taps are reversed to line up: lane j then holds
            // h[i+7-j] next to x[k-(i+7-j)]. The earlier code multiplied the two
            // ascending vectors directly, which applied the taps in reversed order
            // within each chunk and disagreed with the scalar reference.
            const __m256 ht = fir_reverse_lanes_ps(_mm256_loadu_ps(h.data() + i));
            const size_t base = k - i - 7;
            const __m256 xr = _mm256_loadu_ps(x.re.data() + base);
            const __m256 xi = _mm256_loadu_ps(x.im.data() + base);
            acc_re_vec = _mm256_fmadd_ps(xr, ht, acc_re_vec);
            acc_im_vec = _mm256_fmadd_ps(xi, ht, acc_im_vec);
        }

        float acc_re = hsum256_ps(acc_re_vec);
        float acc_im = hsum256_ps(acc_im_vec);

        // Scalar remainder: leftover taps, plus every tap of the early samples for
        // which no full chunk fits.
        for (; i < L; ++i)
        {
            if (k < i)
                break;
            const float tap = h[i];
            const size_t idx = k - i;
            acc_re += tap * x.re[idx];
            acc_im += tap * x.im[idx];
        }

        y.re[k] = acc_re;
        y.im[k] = acc_im;
        checksum += acc_re * 0.75f + acc_im * 0.33f;
    }
    return checksum;
}
// ============================================================================
// Utility: "soft clip" limiter (a hard clamp) -- the AVX version uses min/max
// y = clip(x, -threshold, +threshold)
// ============================================================================
// Scalar reference limiter: clamps each x[i] in place -- values above `threshold`
// become `threshold`, values below `-threshold` become `-threshold`, everything
// else is left untouched. Returns the float sum of the clamped values.
static float soft_clip_scalar(float *x, size_t n, float threshold)
{
    const float floor_value = -threshold;

    float total = 0.0f;
    for (float *p = x, *const end = x + n; p != end; ++p)
    {
        const float in = *p;
        const float clipped = (in > threshold)
                                  ? threshold
                                  : ((in < floor_value) ? floor_value : in);
        *p = clipped;
        total += clipped;
    }
    return total;
}
// AVX limiter: clamps x[i] in place, eight floats per iteration via
// max(min(v, threshold), -threshold), with the last n % 8 elements handled by the
// same compare chain as soft_clip_scalar. Returns the float sum of the clamped
// values; each vector block is reduced with hsum256_ps before being added.
static float soft_clip_avx(float *x, size_t n, float threshold)
{
    constexpr size_t kLanes = 8;
    const __m256 upper = _mm256_set1_ps(threshold);
    const __m256 lower = _mm256_set1_ps(-threshold);

    float total = 0.0f;
    size_t pos = 0;
    while (pos + kLanes <= n)
    {
        const __m256 raw = _mm256_loadu_ps(x + pos);
        const __m256 capped = _mm256_min_ps(raw, upper);      // clamp from above
        const __m256 clipped = _mm256_max_ps(capped, lower);  // then from below
        _mm256_storeu_ps(x + pos, clipped);
        total += hsum256_ps(clipped);
        pos += kLanes;
    }

    // Scalar tail.
    while (pos < n)
    {
        float v = x[pos];
        if (v > threshold)
            v = threshold;
        else if (v < -threshold)
            v = -threshold;
        x[pos] = v;
        total += v;
        ++pos;
    }
    return total;
}
// ============================================================================
// Main
// ============================================================================
// Demo driver: runs the scalar and AVX variant of each workload once, times each
// call with a ScopedTimer (which writes the elapsed milliseconds when its enclosing
// block ends), and prints the checksums/values and timings to std::cout.
// Always returns 0.
int main()
{
// 1) Vector workloads: SAXPY + cosine similarity
const size_t N_vec = 1u << 22; // 4M floats (~16 MiB per vector)
std::vector<float> x(N_vec), y(N_vec), y2(N_vec), z(N_vec), z2(N_vec);
// Distinct seeds so x, y and z hold different sequences.
fill_random(x, 0x11111111u);
fill_random(y, 0x22222222u);
fill_random(z, 0x33333333u);
z2 = z; // copy for AVX path
std::cout << "=== Workload 1: SAXPY + cosine similarity ===\n";
double ms_saxpy_scalar = 0.0, ms_saxpy_avx = 0.0;
float saxpy_cs_scalar = 0.0f, saxpy_cs_avx = 0.0f;
{
// SAXPY updates its y argument in place, so each variant works on a fresh copy
// of y. The copy is made before the timer is constructed and is not timed.
y2 = y;
ScopedTimer t("saxpy_scalar", ms_saxpy_scalar);
saxpy_cs_scalar = saxpy_scalar(1.2345f, x.data(), y2.data(), N_vec);
}
{
y2 = y;
ScopedTimer t("saxpy_avx", ms_saxpy_avx);
saxpy_cs_avx = saxpy_avx(1.2345f, x.data(), y2.data(), N_vec);
}
double ms_cos_scalar = 0.0, ms_cos_avx = 0.0;
float cos_scalar = 0.0f, cos_avx = 0.0f;
{
ScopedTimer t("cosine_scalar", ms_cos_scalar);
cos_scalar = cosine_similarity_scalar(x.data(), z.data(), N_vec);
}
{
// z2 holds the same values as z, so both variants see identical inputs.
ScopedTimer t("cosine_avx", ms_cos_avx);
cos_avx = cosine_similarity_avx(x.data(), z2.data(), N_vec);
}
std::cout << "SAXPY scalar: checksum=" << saxpy_cs_scalar << " time=" << ms_saxpy_scalar << " ms\n";
std::cout << "SAXPY AVX : checksum=" << saxpy_cs_avx << " time=" << ms_saxpy_avx << " ms\n";
std::cout << "Cosine scalar: value=" << cos_scalar << " time=" << ms_cos_scalar << " ms\n";
std::cout << "Cosine AVX : value=" << cos_avx << " time=" << ms_cos_avx << " ms\n";
std::cout << "--------------------------------------------------------\n\n";
// 2) Image blur (1080p)
const int W = 1920;
const int H = 1080;
ImageF img(W, H);
// Separate output images so the scalar and AVX results do not overwrite each other.
ImageF blur_ref(W, H);
ImageF blur_avx(W, H);
fill_random(img.data, 0xA5A5A5A5u);
std::cout << "=== Workload 2: 2D 5-point blur on 1080p image ===\n";
double ms_blur_scalar = 0.0, ms_blur_avx = 0.0;
float cs_blur_scalar = 0.0f, cs_blur_avx = 0.0f;
{
ScopedTimer t("blur_scalar", ms_blur_scalar);
cs_blur_scalar = blur5_scalar(img, blur_ref);
}
{
ScopedTimer t("blur_avx", ms_blur_avx);
cs_blur_avx = blur5_avx(img, blur_avx);
}
std::cout << "Blur scalar: checksum=" << cs_blur_scalar << " time=" << ms_blur_scalar << " ms\n";
std::cout << "Blur AVX : checksum=" << cs_blur_avx << " time=" << ms_blur_avx << " ms\n";
// Quick consistency check: difference in checksum
std::cout << "Checksum delta (AVX - scalar): "
<< (cs_blur_avx - cs_blur_scalar) << "\n";
std::cout << "--------------------------------------------------------\n\n";
// 3) Complex workloads
const size_t N_cplx = 1u << 18; // 262,144 complex samples
// The four output buffers start empty; each kernel resizes its own output.
ComplexSoA a(N_cplx), b(N_cplx), c_ref, c_avx, fir_ref, fir_avx;
fill_complex(a, 0x1234ABCDu);
fill_complex(b, 0x9876FEDCu);
// FIR taps (e.g. 16-tap low-pass prototype)
const size_t L = 16;
std::vector<float> taps(L);
for (size_t i = 0; i < L; ++i)
{
// Each tap is a raised-cosine window term multiplied by a Gaussian of the tap's
// offset from the centre, normalised by half the filter length.
float x_rel = (static_cast<float>(i) - (L - 1) / 2.0f) / (L / 2.0f);
float win = 0.5f - 0.5f * std::cos(3.14159265358979323846f * (i + 0.5f) / L);
taps[i] = win * std::exp(-x_rel * x_rel);
}
std::cout << "=== Workload 3: Complex multiply + FIR convolution ===\n";
double ms_cmul_scalar = 0.0, ms_cmul_avx = 0.0;
float cs_cmul_scalar = 0.0f, cs_cmul_avx = 0.0f;
{
ScopedTimer t("complex_mul_scalar", ms_cmul_scalar);
cs_cmul_scalar = complex_mul_scalar(a, b, c_ref);
}
{
ScopedTimer t("complex_mul_avx", ms_cmul_avx);
cs_cmul_avx = complex_mul_avx(a, b, c_avx);
}
double ms_fir_scalar = 0.0, ms_fir_avx = 0.0;
float cs_fir_scalar = 0.0f, cs_fir_avx = 0.0f;
{
ScopedTimer t("complex_fir_scalar", ms_fir_scalar);
cs_fir_scalar = complex_fir_scalar(a, taps, fir_ref);
}
{
ScopedTimer t("complex_fir_avx", ms_fir_avx);
cs_fir_avx = complex_fir_avx(a, taps, fir_avx);
}
std::cout << "Complex mul scalar: checksum=" << cs_cmul_scalar << " time=" << ms_cmul_scalar << " ms\n";
std::cout << "Complex mul AVX : checksum=" << cs_cmul_avx << " time=" << ms_cmul_avx << " ms\n";
std::cout << "FIR scalar : checksum=" << cs_fir_scalar << " time=" << ms_fir_scalar << " ms\n";
std::cout << "FIR AVX : checksum=" << cs_fir_avx << " time=" << ms_fir_avx << " ms\n";
std::cout << "Delta cmul checksum (AVX - scalar): " << (cs_cmul_avx - cs_cmul_scalar) << "\n";
std::cout << "Delta FIR checksum (AVX - scalar): " << (cs_fir_avx - cs_fir_scalar) << "\n";
std::cout << "--------------------------------------------------------\n\n";
// 4) Soft clipping on FIR output (just to exercise AVX clamp / min / max)
std::cout << "=== Workload 4: Soft clip / limiter on FIR output ===\n";
double ms_clip_scalar = 0.0, ms_clip_avx = 0.0;
float cs_clip_scalar = 0.0f, cs_clip_avx = 0.0f;
// Pack FIR real part into separate working buffer
// Both variants start from the real part of the scalar FIR output (fir_ref), each
// on its own copy because the clip functions modify their buffer in place.
std::vector<float> fir_real = fir_ref.re;
std::vector<float> fir_real2 = fir_real;
{
ScopedTimer t("soft_clip_scalar", ms_clip_scalar);
cs_clip_scalar = soft_clip_scalar(fir_real.data(), fir_real.size(), 0.8f);
}
{
ScopedTimer t("soft_clip_avx", ms_clip_avx);
cs_clip_avx = soft_clip_avx(fir_real2.data(), fir_real2.size(), 0.8f);
}
std::cout << "Soft clip scalar: checksum=" << cs_clip_scalar << " time=" << ms_clip_scalar << " ms\n";
std::cout << "Soft clip AVX : checksum=" << cs_clip_avx << " time=" << ms_clip_avx << " ms\n";
std::cout << "Delta clip checksum (AVX - scalar): "
<< (cs_clip_avx - cs_clip_scalar) << "\n";
std::cout << "\nDone.\n";
return 0;
}
int __fastcall main(int argc, const char **argv, const char **envp)
{
__int64 v9; // rdx
__int64 v10; // rdx
__int64 v11; // rdx
__int64 v12; // rdx
__int64 v14; // rcx
int v17; // eax
__int64 v20; // rcx
int v21; // eax
__int64 v24; // rcx
int v26; // eax
__int64 v34; // rdx
__int64 v50; // rdx
__int64 v62; // rax
double v110; // xmm4_8
__int64 v113; // rbx
double v114; // xmm4_8
__int64 v116; // rax
double v117; // xmm4_8
__int64 v120; // rbx
double v121; // xmm4_8
__int64 v123; // rax
double v124; // xmm4_8
__int64 v127; // rbx
double v128; // xmm4_8
__int64 v130; // rax
double v131; // xmm4_8
__int64 v134; // rbx
double v135; // xmm4_8
__int64 v137; // rax
__int64 v138; // rdx
__int64 v139; // rdx
__int64 v140; // rdx
char *v142; // rcx
int v144; // eax
int v146; // ebx
size_t v147; // r14
int v148; // ecx
__int64 v149; // rbx
char *v150; // r12
int v151; // r15d
int v153; // ecx
__int64 v157; // r15
unsigned int v158; // r11d
char *v160; // rsi
char *v161; // rdx
size_t v162; // r12
__int64 v163; // r14
int v187; // r9d
__int64 v188; // rdi
int v189; // r10d
signed int v202; // edi
int v218; // edi
__int64 v235; // r15
int v236; // ecx
__int64 v240; // rbx
__int64 v243; // r15
__int64 v244; // r9
int v260; // r11d
int v262; // r10d
int v265; // edi
__int64 v278; // rdi
int v279; // r10d
int v280; // r9d
double v305; // xmm4_8
__int64 v307; // rbx
double v308; // xmm4_8
__int64 v310; // rax
double v311; // xmm4_8
__int64 v313; // rbx
double v314; // xmm4_8
__int64 v316; // rax
double v317; // xmm4_8
__int64 v321; // rax
__int64 v322; // rdx
__int64 v340; // r15
unsigned __int64 v341; // r13
unsigned __int64 v370; // r9
__int64 v443; // rcx
unsigned __int64 v444; // r12
bool v445; // cf
bool v446; // zf
bool v448; // r11
unsigned __int64 v449; // r14
unsigned __int64 v455; // r11
unsigned __int64 v456; // rdx
unsigned __int64 v520; // r15
unsigned __int64 v521; // r14
__int64 v526; // rdx
unsigned __int64 v533; // rdi
__int64 v557; // rdx
double v559; // xmm4_8
__int64 v561; // rbx
double v562; // xmm4_8
__int64 v564; // rax
double v565; // xmm4_8
__int64 v567; // rbx
double v568; // xmm4_8
__int64 v570; // rax
double v571; // xmm4_8
__int64 v573; // rbx
double v574; // xmm4_8
__int64 v576; // rax
double v577; // xmm4_8
__int64 v579; // rbx
double v580; // xmm4_8
__int64 v582; // rax
double v583; // xmm4_8
__int64 v587; // rax
double v588; // xmm4_8
__int64 v592; // rax
__int64 v594; // rsi
unsigned __int64 v595; // rcx
unsigned __int64 v596; // rdi
__int64 v597; // rax
unsigned __int64 v598; // rdx
__int64 v603; // rcx
unsigned __int64 v623; // rcx
unsigned __int64 v624; // rax
unsigned __int64 v625; // rdx
bool v626; // cc
__int64 v642; // rax
unsigned __int64 v647; // rcx
unsigned __int64 v652; // rsi
unsigned __int64 v665; // rdx
unsigned __int64 v681; // rdi
unsigned __int64 v685; // r8
unsigned __int64 v691; // rdx
bool v692; // cc
int v698; // edx
__int64 v709; // rdx
double v716; // xmm4_8
__int64 v718; // rbx
double v719; // xmm4_8
__int64 v721; // rax
double v722; // xmm4_8
__int64 v724; // rbx
double v725; // xmm4_8
__int64 v727; // rax
double v728; // xmm4_8
__int64 v732; // rax
int v854; // ecx
int v860; // esi
char *v864; // [rsp-490h] [rbp-490h]
__int64 v865; // [rsp-488h] [rbp-488h]
unsigned int v866; // [rsp-450h] [rbp-450h]
int v867; // [rsp-44Ch] [rbp-44Ch]
int v868; // [rsp-438h] [rbp-438h]
__int64 v869; // [rsp-438h] [rbp-438h]
unsigned __int64 v870; // [rsp-438h] [rbp-438h]
char *v871; // [rsp-430h] [rbp-430h]
int v872; // [rsp-430h] [rbp-430h]
__int64 v873; // [rsp-428h] [rbp-428h]
__int64 v874; // [rsp-428h] [rbp-428h]
__int64 v875; // [rsp-420h] [rbp-420h]
__int64 v876; // [rsp-420h] [rbp-420h]
int v877; // [rsp-418h] [rbp-418h]
__int64 v878; // [rsp-418h] [rbp-418h]
__int64 v879; // [rsp-410h] [rbp-410h]
int v880; // [rsp-408h] [rbp-408h]
unsigned __int64 v881; // [rsp-408h] [rbp-408h]
__int64 v882; // [rsp-400h] [rbp-400h]
int v883; // [rsp-3F0h] [rbp-3F0h]
unsigned int v884; // [rsp-3E8h] [rbp-3E8h]
__int64 v885; // [rsp-3E0h] [rbp-3E0h]
int v886; // [rsp-3E0h] [rbp-3E0h]
__int64 v887; // [rsp-3D0h] [rbp-3D0h]
int v888; // [rsp-3C8h] [rbp-3C8h]
__int64 v889; // [rsp-3C0h] [rbp-3C0h]
char *v890; // [rsp-3C0h] [rbp-3C0h]
__int64 v891; // [rsp-3C0h] [rbp-3C0h]
char *v892; // [rsp-3B8h] [rbp-3B8h]
__int64 v893; // [rsp-3B8h] [rbp-3B8h]
__int64 v894; // [rsp-3B8h] [rbp-3B8h]
int v895; // [rsp-3B0h] [rbp-3B0h]
__int64 v896; // [rsp-3B0h] [rbp-3B0h]
__int64 v897; // [rsp-3B0h] [rbp-3B0h]
__int64 v898; // [rsp-3A8h] [rbp-3A8h]
__int64 v899; // [rsp-3A0h] [rbp-3A0h]
unsigned __int64 v900; // [rsp-398h] [rbp-398h]
__int64 v901; // [rsp-390h] [rbp-390h]
__int64 v902; // [rsp-380h] [rbp-380h] BYREF
__int64 v903; // [rsp-378h] [rbp-378h] BYREF
__int64 v904; // [rsp-370h] [rbp-370h] BYREF
__int64 v905; // [rsp-368h] [rbp-368h] BYREF
__int64 v906; // [rsp-360h] [rbp-360h] BYREF
__int64 v907; // [rsp-358h] [rbp-358h] BYREF
__int64 v908; // [rsp-350h] [rbp-350h] BYREF
__int64 v909; // [rsp-348h] [rbp-348h] BYREF
__int64 v910; // [rsp-340h] [rbp-340h] BYREF
__int64 v911; // [rsp-338h] [rbp-338h] BYREF
__int64 v912; // [rsp-330h] [rbp-330h] BYREF
__int64 v913; // [rsp-328h] [rbp-328h] BYREF
__int64 v914; // [rsp-320h] [rbp-320h] BYREF
__int64 v915; // [rsp-318h] [rbp-318h]
__int64 v916; // [rsp-300h] [rbp-300h] BYREF
__int64 v917; // [rsp-2F8h] [rbp-2F8h]
_QWORD v918[4]; // [rsp-2E0h] [rbp-2E0h] BYREF
__int64 v919; // [rsp-2C0h] [rbp-2C0h] BYREF
__int64 v920; // [rsp-2B8h] [rbp-2B8h]
_QWORD v921[4]; // [rsp-2A0h] [rbp-2A0h] BYREF
__int64 v922; // [rsp-280h] [rbp-280h] BYREF
__int64 v923; // [rsp-278h] [rbp-278h]
__int64 v924; // [rsp-260h] [rbp-260h] BYREF
__int64 v925; // [rsp-258h] [rbp-258h]
__int64 v926; // [rsp-240h] [rbp-240h] BYREF
__int64 v927; // [rsp-238h] [rbp-238h]
__int64 v928; // [rsp-220h] [rbp-220h]
char *v929; // [rsp-218h] [rbp-218h] BYREF
char *v930; // [rsp-210h] [rbp-210h]
__int64 v931; // [rsp-200h] [rbp-200h]
void *v932[3]; // [rsp-1F8h] [rbp-1F8h] BYREF
__int64 v933; // [rsp-1E0h] [rbp-1E0h]
void *v934[3]; // [rsp-1D8h] [rbp-1D8h] BYREF
__int64 v935; // [rsp-1C0h] [rbp-1C0h] BYREF
__int64 v936; // [rsp-1B8h] [rbp-1B8h]
__int64 v937; // [rsp-1A8h] [rbp-1A8h] BYREF
_QWORD v938[3]; // [rsp-190h] [rbp-190h] BYREF
__int64 v939; // [rsp-178h] [rbp-178h] BYREF
_QWORD v940[3]; // [rsp-160h] [rbp-160h] BYREF
_QWORD v941[3]; // [rsp-148h] [rbp-148h] BYREF
_QWORD v942[3]; // [rsp-130h] [rbp-130h] BYREF
_QWORD v943[3]; // [rsp-118h] [rbp-118h] BYREF
_QWORD v944[3]; // [rsp-100h] [rbp-100h] BYREF
_QWORD v945[3]; // [rsp-E8h] [rbp-E8h] BYREF
_QWORD v946[3]; // [rsp-D0h] [rbp-D0h] BYREF
_QWORD v947[3]; // [rsp-B8h] [rbp-B8h] BYREF
_BYTE v948[32]; // [rsp-A0h] [rbp-A0h] BYREF
_QWORD v949[14]; // [rsp-80h] [rbp-80h] BYREF
_QWORD v950[2]; // [rsp-10h] [rbp-10h] BYREF
_UNKNOWN *retaddr; // [rsp+0h] [rbp+0h]
char v952; // [rsp+8h] [rbp+8h] BYREF
v950[1] = retaddr;
_RBP = v950;
v949[9] = &v952;
v949[7] = __readfsqword(0x28u);
std::vector<float>::vector(&v914, 0x400000, envp);
std::vector<float>::vector(&v916, 0x400000, v9);
std::vector<float>::vector(v918, 0x400000, v10);
std::vector<float>::vector(&v919, 0x400000, v11);
std::vector<float>::vector(v921, 0x400000, v12);
_RBX = v914;
v14 = v915;
if ( v915 != v914 )
{
__asm { vmovss xmm1, cs:dword_6004 }
_RDX = v914;
v17 = 286331153;
do
{
__asm { vxorps xmm3, xmm3, xmm3 }
_RDX += 4;
v17 = 1664525 * v17 + 1013904223;
__asm
{
vcvtsi2ss xmm0, xmm3, eax
vmulss xmm0, xmm0, xmm1
vmovss dword ptr [rdx-4], xmm0
}
}
while ( v14 != _RDX );
}
_RDX = v916;
v20 = v917;
v21 = 572662306;
__asm { vmovss xmm1, cs:dword_6004 }
if ( v917 != v916 )
{
do
{
__asm { vxorps xmm3, xmm3, xmm3 }
_RDX += 4;
v21 = 1664525 * v21 + 1013904223;
__asm
{
vcvtsi2ss xmm0, xmm3, eax
vmulss xmm0, xmm0, xmm1
vmovss dword ptr [rdx-4], xmm0
}
}
while ( _RDX != v20 );
}
v24 = v920;
if ( v920 != v919 )
{
__asm { vmovss xmm1, cs:dword_6004 }
_RDX = v919;
v26 = 858993459;
do
{
__asm { vxorps xmm3, xmm3, xmm3 }
_RDX += 4;
v26 = 1664525 * v26 + 1013904223;
__asm
{
vcvtsi2ss xmm0, xmm3, eax
vmulss xmm0, xmm0, xmm1
vmovss dword ptr [rdx-4], xmm0
}
}
while ( v24 != _RDX );
}
std::vector<float>::operator=(v921, &v919, *(double *)&_XMM0, *(double *)&_XMM1);
std::operator<<<std::char_traits<char>>(&std::cout, "=== Workload 1: SAXPY + cosine similarity ===\n");
v902 = 0;
v903 = 0;
std::vector<float>::operator=(v918, &v916, *(double *)&_XMM0, *(double *)&_XMM1);
std::string::basic_string<std::allocator<char>>(v948, "saxpy_scalar");
ScopedTimer::ScopedTimer(v949, v948, &v902);
std::string::_M_dispose(v948);
_RAX = v918[0];
_RDX = 0;
if ( (unsigned __int64)(v918[0] - (_RBX + 4)) <= 0x18 )
{
__asm { vmovss xmm1, cs:dword_6014 }
do
{
__asm
{
vmovss xmm0, dword ptr [rbx+rdx*4]
vfmadd213ss xmm0, xmm1, dword ptr [rax+rdx*4]
vmovss dword ptr [rax+rdx*4], xmm0
}
++_RDX;
}
while ( _RDX != 0x400000 );
}
else
{
__asm { vbroadcastss ymm1, cs:dword_6014 }
do
{
__asm
{
vmovups ymm0, ymmword ptr [rbx+rdx]
vfmadd213ps ymm0, ymm1, ymmword ptr [rax+rdx]
vmovups ymmword ptr [rax+rdx], ymm0
}
_RDX += 32;
}
while ( _RDX != 0x1000000 );
__asm { vzeroupper }
}
_R13D = 0;
v34 = _RAX + 0x1000000;
__asm { vmovd xmm0, r13d }
do
{
__asm { vaddss xmm0, xmm0, dword ptr [rax] }
_RAX += 32;
__asm
{
vaddss xmm0, xmm0, dword ptr [rax-1Ch]
vaddss xmm0, xmm0, dword ptr [rax-18h]
vaddss xmm0, xmm0, dword ptr [rax-14h]
vaddss xmm0, xmm0, dword ptr [rax-10h]
vaddss xmm0, xmm0, dword ptr [rax-0Ch]
vaddss xmm0, xmm0, dword ptr [rax-8]
vaddss xmm0, xmm0, dword ptr [rax-4]
}
}
while ( _RAX != v34 );
__asm { vmovd r13d, xmm0 }
ScopedTimer::~ScopedTimer((ScopedTimer *)v949);
std::vector<float>::operator=(v918, &v916, *(double *)&_XMM0, *(double *)&_XMM1);
std::string::basic_string<std::allocator<char>>(v948, "saxpy_avx");
ScopedTimer::ScopedTimer(v949, v948, &v903);
std::string::_M_dispose(v948);
_RAX = v918[0];
_RDX = 8;
__asm { vbroadcastss ymm1, cs:dword_6014 }
do
{
__asm
{
vmovups ymm0, ymmword ptr [rbx+rdx*4-20h]
vfmadd213ps ymm0, ymm1, ymmword ptr [rax+rdx*4-20h]
vmovups ymmword ptr [rax+rdx*4-20h], ymm0
}
_RDX += 8;
}
while ( _RDX != 4194312 );
_R14D = 0;
v50 = _RAX + 0x1000000;
__asm { vmovd xmm0, r14d }
do
{
__asm { vaddss xmm0, xmm0, dword ptr [rax] }
_RAX += 32;
__asm
{
vaddss xmm0, xmm0, dword ptr [rax-1Ch]
vaddss xmm0, xmm0, dword ptr [rax-18h]
vaddss xmm0, xmm0, dword ptr [rax-14h]
vaddss xmm0, xmm0, dword ptr [rax-10h]
vaddss xmm0, xmm0, dword ptr [rax-0Ch]
vaddss xmm0, xmm0, dword ptr [rax-8]
vaddss xmm0, xmm0, dword ptr [rax-4]
}
}
while ( v50 != _RAX );
__asm
{
vmovd r14d, xmm0
vzeroupper
}
ScopedTimer::~ScopedTimer((ScopedTimer *)v949);
v904 = 0;
v905 = 0;
std::string::basic_string<std::allocator<char>>(v948, "cosine_scalar");
ScopedTimer::ScopedTimer(v949, v948, &v904);
std::string::_M_dispose(v948);
__asm { vxorpd xmm4, xmm4, xmm4 }
v62 = 0;
__asm
{
vmovsd xmm2, xmm4, xmm4
vmovsd xmm3, xmm4, xmm4
}
do
{
__asm
{
vxorpd xmm7, xmm7, xmm7
vcvtss2sd xmm1, xmm7, dword ptr [rbx+rax*4]
vcvtss2sd xmm0, xmm7, dword ptr [r12+rax*4]
}
++v62;
__asm
{
vfmadd231sd xmm3, xmm1, xmm0
vfmadd231sd xmm2, xmm1, xmm1
vfmadd231sd xmm4, xmm0, xmm0
}
}
while ( v62 != 0x400000 );
__asm
{
vmulsd xmm0, xmm2, xmm4; x
vxorpd xmm1, xmm1, xmm1
vucomisd xmm1, xmm0
}
__asm
{
vsqrtsd xmm0, xmm0, xmm0
vxorpd xmm1, xmm1, xmm1
vucomisd xmm0, xmm1
}
if ( __SETP__(v62, 0x400000) )
{
__asm
{
vdivsd xmm3, xmm3, xmm0
vcvtsd2ss xmm3, xmm3, xmm3
vmovd r12d, xmm3
}
}
else
{
_R12D = 0;
}
ScopedTimer::~ScopedTimer((ScopedTimer *)v949);
std::string::basic_string<std::allocator<char>>(v948, "cosine_avx");
ScopedTimer::ScopedTimer(v949, v948, &v905);
std::string::_M_dispose(v948);
__asm { vxorps xmm2, xmm2, xmm2 }
_RDX = v921[0];
_RAX = 0;
__asm
{
vmovaps ymm3, ymm2
vmovaps ymm4, ymm2
}
while ( 1 )
{
_RAX += 8;
if ( _RAX == 4194312 )
break;
__asm
{
vmovups ymm1, ymmword ptr [rbx+rax*4-20h]
vmovups ymm0, ymmword ptr [rdx+rax*4-20h]
vfmadd231ps ymm3, ymm1, ymm1
vfmadd231ps ymm4, ymm1, ymm0
vfmadd231ps ymm2, ymm0, ymm0
}
}
__asm
{
vmovaps xmm0, xmm4
vextractf128 xmm4, ymm4, 1
vaddps xmm4, xmm0, xmm4
vmovshdup xmm0, xmm4
vaddps xmm4, xmm4, xmm0
vmovhlps xmm0, xmm0, xmm4
vaddss xmm4, xmm4, xmm0
vmovaps xmm0, xmm3
vextractf128 xmm3, ymm3, 1
vaddps xmm0, xmm0, xmm3
vmovshdup xmm1, xmm0
vaddps xmm0, xmm0, xmm1
vmovhlps xmm1, xmm1, xmm0
vaddss xmm0, xmm0, xmm1
vmovaps xmm1, xmm2
vextractf128 xmm2, ymm2, 1
vaddps xmm1, xmm1, xmm2
vcvtss2sd xmm0, xmm0, xmm0
vmovshdup xmm2, xmm1
vaddps xmm1, xmm1, xmm2
vmovhlps xmm2, xmm2, xmm1
vaddss xmm1, xmm1, xmm2
vcvtss2sd xmm1, xmm1, xmm1
vmulsd xmm0, xmm0, xmm1; x
vxorpd xmm1, xmm1, xmm1
vucomisd xmm1, xmm0
}
__asm
{
vsqrtsd xmm0, xmm0, xmm0
vzeroupper
vxorpd xmm1, xmm1, xmm1
vucomisd xmm0, xmm1
}
if ( __SETP__(_RAX, 4194312) )
{
__asm
{
vcvtss2sd xmm4, xmm4, xmm4
vdivsd xmm0, xmm4, xmm0
vcvtsd2ss xmm3, xmm0, xmm0
vmovd r15d, xmm3
}
}
else
{
_R15D = 0;
}
ScopedTimer::~ScopedTimer((ScopedTimer *)v949);
std::__ostream_insert<char,std::char_traits<char>>(
&std::cout,
"SAXPY scalar: checksum=",
24,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v110);
__asm { vmovd xmm3, r13d }
__asm { vcvtss2sd xmm0, xmm3, xmm3 }
v113 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0);
std::__ostream_insert<char,std::char_traits<char>>(
v113,
" time=",
7,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v114);
__asm { vmovsd xmm0, qword ptr [rbp-370h] }
v116 = std::ostream::_M_insert<double>(v113, *(double *)&_XMM0);
std::operator<<<std::char_traits<char>>(v116, " ms\n");
std::__ostream_insert<char,std::char_traits<char>>(
&std::cout,
"SAXPY AVX : checksum=",
24,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v117);
__asm { vmovd xmm3, r14d }
__asm { vcvtss2sd xmm0, xmm3, xmm3 }
v120 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0);
std::__ostream_insert<char,std::char_traits<char>>(
v120,
" time=",
7,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v121);
__asm { vmovsd xmm0, qword ptr [rbp-368h] }
v123 = std::ostream::_M_insert<double>(v120, *(double *)&_XMM0);
std::operator<<<std::char_traits<char>>(v123, " ms\n");
std::__ostream_insert<char,std::char_traits<char>>(
&std::cout,
"Cosine scalar: value=",
21,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v124);
__asm { vmovd xmm3, r12d }
__asm { vcvtss2sd xmm0, xmm3, xmm3 }
v127 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0);
std::__ostream_insert<char,std::char_traits<char>>(
v127,
" time=",
7,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v128);
__asm { vmovsd xmm0, qword ptr [rbp-360h] }
v130 = std::ostream::_M_insert<double>(v127, *(double *)&_XMM0);
std::operator<<<std::char_traits<char>>(v130, " ms\n");
std::__ostream_insert<char,std::char_traits<char>>(
&std::cout,
"Cosine AVX : value=",
21,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v131);
__asm { vmovd xmm3, r15d }
__asm { vcvtss2sd xmm0, xmm3, xmm3 }
v134 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0);
std::__ostream_insert<char,std::char_traits<char>>(
v134,
" time=",
7,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v135);
__asm { vmovsd xmm0, qword ptr [rbp-358h] }
v137 = std::ostream::_M_insert<double>(v134, *(double *)&_XMM0);
std::operator<<<std::char_traits<char>>(v137, " ms\n");
std::operator<<<std::char_traits<char>>(&std::cout, "--------------------------------------------------------\n\n");
v928 = 0x43800000780LL;
std::vector<float>::vector(&v929, 2073600, v138);
v931 = 0x43800000780LL;
std::vector<float>::vector(v932, 2073600, v139);
v933 = 0x43800000780LL;
std::vector<float>::vector(v934, 2073600, v140);
_R13 = v929;
v142 = v930;
if ( v929 != v930 )
{
__asm { vmovss xmm1, cs:dword_6004 }
_RDX = v929;
v144 = -1515870811;
do
{
__asm { vxorps xmm3, xmm3, xmm3 }
_RDX += 4;
v144 = 1664525 * v144 + 1013904223;
__asm
{
vcvtsi2ss xmm0, xmm3, eax
vmulss xmm0, xmm0, xmm1
vmovss dword ptr [rdx-4], xmm0
}
}
while ( _RDX != v142 );
}
std::operator<<<std::char_traits<char>>(&std::cout, "=== Workload 2: 2D 5-point blur on 1080p image ===\n");
v906 = 0;
v907 = 0;
std::string::basic_string<std::allocator<char>>(v948, "blur_scalar");
ScopedTimer::ScopedTimer(v949, v948, &v906);
std::string::_M_dispose(v948);
v146 = HIDWORD(v928);
v147 = 4LL * (int)v928;
LODWORD(v899) = v928;
v898 = (int)v928;
v892 = (char *)v932[0];
memcpy(v932[0], _R13, v147);
v148 = v146 - 1;
v149 = v146 - 1;
v888 = v148;
v150 = &_R13[v148 * v147];
v887 = (int)v931;
v151 = v931;
v885 = 4LL * (int)v931;
memcpy(&v892[v148 * v885], v150, v147);
if ( v888 > 1 )
{
if ( v151 == 1 && (_RDX = _R13 + 4, v153 = 1, _RAX = v892 + 4, (_DWORD)v899 == 1) )
{
do
{
__asm { vmovss xmm0, dword ptr [rdx] }
++v153;
_RDX += 4;
_RAX += 4;
__asm
{
vmovss dword ptr [rax-4], xmm0
vmovss xmm0, dword ptr [rdx-4]
vmovss dword ptr [rax-4], xmm0
}
}
while ( v888 != v153 );
v871 = &_R13[v147];
_RCX = &v892[v885];
}
else
{
_RDI = v898;
v860 = 1;
v871 = &_R13[v147];
_RDX = &_R13[v147];
_RCX = &v892[v885];
_RAX = &v892[v885];
do
{
__asm { vmovss xmm0, dword ptr [rdx] }
++v860;
__asm
{
vmovss dword ptr [rax], xmm0
vmovss xmm0, dword ptr [rdx+rdi*4-4]
}
_RDX += v147;
__asm { vmovss dword ptr [rax+rdi*4-4], xmm0 }
_RAX += v885;
}
while ( v888 != v860 );
}
LODWORD(v900) = 0;
__asm { vmovss xmm1, dword ptr cs:qword_6480 }
v865 = v149;
v895 = 1;
v884 = v899 - 3;
v157 = (unsigned int)(v899 - 3) + 2LL;
v877 = v899 - 2;
v158 = (v899 - 2) & 0xFFFFFFF8;
v880 = v899 - 1;
v875 = 32LL * ((unsigned int)(v899 - 2) >> 3);
_RAX = v871;
__asm { vbroadcastss ymm2, xmm1 }
__asm { vshufps xmm3, xmm1, xmm1, 0 }
v867 = v158 + 1;
v160 = &v871[v147];
v866 = v158;
v872 = v899 - v158;
v868 = v899 - v158 - 2;
v901 = v887;
v161 = _R13;
v864 = v150;
v162 = v147;
v163 = v898;
while ( 1 )
{
++v895;
v889 = v163;
v163 += v898;
if ( (int)v899 > 2 )
{
if ( v884 <= 2
|| (_R8 = _RCX + 4,
_R9 = _RAX + 4,
(unsigned __int64)(_RCX - _RAX) <= 0x20 || (unsigned __int64)(_RCX + 4 - (v161 + 8)) <= 0x18)
|| (unsigned __int64)(_R8 - (v160 + 8)) <= 0x18 )
{
_RDI = 1;
do
{
__asm
{
vmovss xmm0, dword ptr [rax+rdi*4]
vaddss xmm0, xmm0, dword ptr [rax+rdi*4-4]
vaddss xmm0, xmm0, dword ptr [rax+rdi*4+4]
vaddss xmm0, xmm0, dword ptr [rdx+rdi*4]
vaddss xmm0, xmm0, dword ptr [rsi+rdi*4]
vmulss xmm0, xmm0, xmm1
vaddss xmm6, xmm0, dword ptr [rbp-388h]
vmovss dword ptr [rbp-388h], xmm6
vmovss dword ptr [rcx+rdi*4], xmm0
}
++_RDI;
}
while ( _RDI != v157 );
goto LABEL_53;
}
if ( v884 <= 6 )
{
v187 = v877;
v188 = 0;
v189 = 1;
}
else
{
_RDI = 0;
do
{
__asm
{
vmovups ymm5, ymmword ptr [r9+rdi]
vaddps ymm0, ymm5, ymmword ptr [rax+rdi]
vaddps ymm0, ymm0, ymmword ptr [rbx+rdi]
vaddps ymm0, ymm0, ymmword ptr [r11+rdi]
vaddps ymm0, ymm0, ymmword ptr [r10+rdi]
vmovss xmm4, dword ptr [rbp-388h]
vmulps ymm0, ymm0, ymm2
vaddss xmm4, xmm4, xmm0
vshufps xmm6, xmm0, xmm0, 55h ; 'U'
vshufps xmm5, xmm0, xmm0, 0FFh
vmovups ymmword ptr [r8+rdi], ymm0
}
_RDI += 32;
__asm
{
vaddss xmm4, xmm4, xmm6
vunpckhps xmm6, xmm0, xmm0
vextractf128 xmm0, ymm0, 1
vaddss xmm4, xmm4, xmm6
vaddss xmm4, xmm4, xmm5
vshufps xmm5, xmm0, xmm0, 55h ; 'U'
vaddss xmm4, xmm4, xmm0
vaddss xmm4, xmm4, xmm5
vunpckhps xmm5, xmm0, xmm0
vshufps xmm0, xmm0, xmm0, 0FFh
vaddss xmm4, xmm4, xmm5
vaddss xmm5, xmm4, xmm0
vmovss dword ptr [rbp-388h], xmm5
}
}
while ( v875 != _RDI );
if ( (v877 & 7) == 0 )
goto LABEL_53;
v187 = v868;
if ( (unsigned int)(v872 - 3) <= 2 )
{
v202 = v867;
LABEL_50:
_R9 = 4LL * v202;
v882 = v202 + 1LL;
__asm { vmovss xmm0, dword ptr [rax+r9-4] }
__asm { vaddss xmm0, xmm0, dword ptr [rbx] }
_R11 = &_RAX[_R9 + 4];
__asm
{
vaddss xmm0, xmm0, dword ptr [r11]
vaddss xmm0, xmm0, dword ptr [rdx+r9]
vaddss xmm0, xmm0, dword ptr [r13+rbx*4+0]
}
__asm { vmulss xmm0, xmm0, xmm1 }
_R8 = v901 + v202;
_RBX = v892;
__asm
{
vaddss xmm4, xmm0, dword ptr [rbp-388h]
vmovss dword ptr [rbp-388h], xmm4
vmovss dword ptr [rbx+r8*4], xmm0
}
if ( v880 > v202 + 1 )
{
__asm { vmovss xmm0, dword ptr [r11] }
v218 = v202 + 2;
__asm { vaddss xmm0, xmm0, dword ptr [r8] }
__asm
{
vaddss xmm0, xmm0, dword ptr [rbx]
vaddss xmm0, xmm0, dword ptr [rdx+r8]
}
__asm { vaddss xmm0, xmm0, dword ptr [r13+rbx*4+0] }
_RBX = v882 + v901;
_R8 = v892;
__asm
{
vmulss xmm0, xmm0, xmm1
vaddss xmm5, xmm0, xmm4
vmovss dword ptr [r8+rbx*4], xmm0
vmovss dword ptr [rbp-388h], xmm5
}
if ( v880 > v218 )
{
_R8 = &_RAX[_R9 + 8];
__asm { vmovss xmm0, dword ptr [r8] }
__asm
{
vaddss xmm0, xmm0, dword ptr [r11]
vaddss xmm0, xmm0, dword ptr [rax+r9+0Ch]
vaddss xmm0, xmm0, dword ptr [rdx+rbx]
vaddss xmm0, xmm0, dword ptr [r13+r8*4+0]
}
_R8 = v892;
_RDI = v901 + v218;
__asm
{
vmulss xmm0, xmm0, xmm1
vaddss xmm4, xmm0, xmm5
vmovss dword ptr [r8+rdi*4], xmm0
vmovss dword ptr [rbp-388h], xmm4
}
}
}
goto LABEL_53;
}
v188 = v866;
v189 = v867;
}
_R11 = v188 + v889 + 1;
__asm
{
vmovups xmm4, xmmword ptr [r13+r11*4-4]
vaddps xmm0, xmm4, xmmword ptr [r13+r11*4+0]
vaddps xmm0, xmm0, xmmword ptr [r13+r11*4+4]
vaddps xmm0, xmm0, xmmword ptr [r13+r8*4+0]
}
__asm { vmovss xmm4, dword ptr [rbp-388h] }
_RDI = v901 + v188 + 1;
_RBX = v892;
__asm
{
vaddps xmm0, xmm0, xmmword ptr [r13+r8*4+0]
vmulps xmm0, xmm0, xmm3
vaddss xmm4, xmm4, xmm0
vshufps xmm5, xmm0, xmm0, 55h ; 'U'
vmovups xmmword ptr [rbx+rdi*4], xmm0
}
v202 = v189 + (v187 & 0xFFFFFFFC);
__asm
{
vaddss xmm4, xmm4, xmm5
vunpckhps xmm5, xmm0, xmm0
vshufps xmm0, xmm0, xmm0, 0FFh
vaddss xmm4, xmm4, xmm5
vaddss xmm4, xmm4, xmm0
vmovss dword ptr [rbp-388h], xmm4
}
if ( (v187 & 3) != 0 )
goto LABEL_50;
}
LABEL_53:
_RAX += v162;
v161 += v162;
v160 += v162;
v901 += v887;
_RCX += v885;
if ( v888 == v895 )
{
v147 = v162;
v149 = v865;
v150 = v864;
__asm { vzeroupper }
goto LABEL_55;
}
}
}
LODWORD(v900) = 0;
LABEL_55:
ScopedTimer::~ScopedTimer((ScopedTimer *)v949);
std::string::basic_string<std::allocator<char>>(v948, "blur_avx");
ScopedTimer::ScopedTimer(v949, v948, &v907);
std::string::_M_dispose(v948);
v890 = (char *)v934[0];
memcpy(v934[0], _R13, v147);
v235 = 4LL * (int)v933;
LODWORD(v901) = v933;
v869 = (int)v933;
memcpy(&v890[v235 * v149], v150, v147);
if ( v888 <= 1 )
{
LODWORD(v901) = 0;
goto LABEL_77;
}
v236 = v901;
if ( (_DWORD)v901 == 1 && (_RDX = _R13 + 4, _RAX = v890 + 4, (_DWORD)v899 == 1) )
{
do
{
__asm { vmovss xmm0, dword ptr [rdx] }
++v236;
_RDX += 4;
_RAX += 4;
__asm
{
vmovss dword ptr [rax-4], xmm0
vmovss xmm0, dword ptr [rdx-4]
vmovss dword ptr [rax-4], xmm0
}
}
while ( v888 != v236 );
}
else
{
v854 = 1;
_RSI = v898;
_RDX = &_R13[v147];
_RAX = &v890[v235];
do
{
__asm { vmovss xmm0, dword ptr [rdx] }
++v854;
__asm
{
vmovss dword ptr [rax], xmm0
vmovss xmm0, dword ptr [rdx+rsi*4-4]
}
_RDX += v147;
__asm { vmovss dword ptr [rax+rsi*4-4], xmm0 }
_RAX += v235;
}
while ( v888 != v854 );
}
v896 = 0;
__asm { vmovss xmm3, dword ptr cs:qword_6480 }
LODWORD(v901) = 0;
v240 = v898;
v886 = 1;
v883 = v899 - 1;
__asm
{
vbroadcastss ymm2, dword ptr cs:qword_6480
vshufps xmm4, xmm3, xmm3, 0
}
__asm { vmovq xmm5, cs:qword_6480 }
v243 = 8 * ((unsigned int)(v899 - 9) >> 3) + 17LL;
v893 = v869;
do
{
v244 = v240;
_RAX = &_R13[4 * v240];
++v886;
v240 += v898;
_RSI = &v890[4 * v893];
if ( (int)v899 <= 8 )
{
v260 = 1;
}
else
{
for ( _RDI = 9; ; _RDI += 8 )
{
__asm
{
vmovups ymm7, ymmword ptr [rax+rdi*4-20h]
vaddps ymm0, ymm7, ymmword ptr [rax+rdi*4-24h]
}
__asm
{
vaddps ymm0, ymm0, ymmword ptr [rax+rdi*4-1Ch]
vaddps ymm0, ymm0, ymmword ptr [rcx+rdi*4-20h]
vaddps ymm0, ymm0, ymmword ptr [rdx+rdi*4-20h]
vmulps ymm0, ymm0, ymm2
vmovaps xmm1, xmm0
vmovups ymmword ptr [rsi+rdi*4-20h], ymm0
vextractf128 xmm0, ymm0, 1
vaddps xmm0, xmm1, xmm0
vmovshdup xmm1, xmm0
vaddps xmm0, xmm0, xmm1
vmovhlps xmm1, xmm1, xmm0
vaddss xmm0, xmm0, xmm1
vaddss xmm6, xmm0, dword ptr [rbp-380h]
vmovss dword ptr [rbp-380h], xmm6
}
if ( _RDI + 8 == v243 )
break;
}
v260 = _RDI;
}
if ( v260 < v883 )
{
_RDI = v260;
v262 = v899 - v260;
if ( (_DWORD)v899 - v260 != 2 )
{
v878 = v260 + v244;
v879 = v260 + v893;
_R8 = &v890[4 * v879];
v876 = v260 + v896;
v873 = v240 + v260;
if ( (unsigned __int64)(_R8 - &_R13[4 * v873 + 4]) > 8 && (unsigned __int64)(_R8 - &_R13[4 * v876 + 4]) > 8 )
{
_R14 = &_R13[4 * v878];
if ( (unsigned __int64)(_R8 - _R14) > 0x10 )
{
v265 = v262 - 1;
if ( (unsigned int)(v262 - 2) > 2 )
{
__asm { vmovups xmm0, xmmword ptr [r14] }
__asm
{
vaddps xmm0, xmm0, xmmword ptr [r13+r9-4]
vaddps xmm0, xmm0, xmmword ptr [r13+r9+4]
vaddps xmm0, xmm0, xmmword ptr [r13+r12-4]
vaddps xmm0, xmm0, xmmword ptr [r13+r14-4]
vmovss xmm1, dword ptr [rbp-380h]
vmulps xmm0, xmm0, xmm4
vaddss xmm1, xmm1, xmm0
vshufps xmm6, xmm0, xmm0, 55h ; 'U'
vmovups xmmword ptr [r8], xmm0
vaddss xmm1, xmm1, xmm6
vunpckhps xmm6, xmm0, xmm0
vshufps xmm0, xmm0, xmm0, 0FFh
vaddss xmm1, xmm1, xmm6
vaddss xmm6, xmm1, xmm0
vmovss dword ptr [rbp-380h], xmm6
}
if ( (v265 & 3) == 0 )
goto LABEL_75;
v278 = v265 & 0xFFFFFFFC;
v279 = v262 - v278;
v260 += v278;
v280 = v279 - 1;
if ( v279 != 2 )
{
LABEL_72:
_R8 = 4 * (v278 + v878);
__asm { vmovq xmm1, qword ptr [r13+r10*4+0] }
__asm { vmovq xmm0, qword ptr [r13+r8-4] }
__asm
{
vaddps xmm0, xmm0, xmm1
vmovq xmm1, qword ptr [r13+r8+4]
}
_R8 = v278 + v876;
__asm
{
vaddps xmm0, xmm0, xmm1
vmovq xmm1, qword ptr [r13+r8*4+0]
}
_R8 = v278 + v873;
_RDI = v278 + v879;
_R10 = v890;
__asm
{
vaddps xmm0, xmm0, xmm1
vmovq xmm1, qword ptr [r13+r8*4+0]
vaddps xmm0, xmm0, xmm1
vmovss xmm1, dword ptr [rbp-380h]
vmulps xmm0, xmm0, xmm5
vaddss xmm1, xmm1, xmm0
vmovlps qword ptr [r10+rdi*4], xmm0
vmovshdup xmm0, xmm0
vaddss xmm7, xmm0, xmm1
vmovss dword ptr [rbp-380h], xmm7
}
if ( (v280 & 1) == 0 )
goto LABEL_75;
v260 += v280 & 0xFFFFFFFE;
}
_R8 = 4LL * v260;
__asm
{
vmovss xmm0, dword ptr [rax+r8-4]
vaddss xmm0, xmm0, dword ptr [rax+rdi*4]
vaddss xmm0, xmm0, dword ptr [rax+r8+4]
vaddss xmm0, xmm0, dword ptr [rcx+rdi*4]
vaddss xmm0, xmm0, dword ptr [rdx+rdi*4]
vmulss xmm0, xmm0, xmm3
vaddss xmm6, xmm0, dword ptr [rbp-380h]
vmovss dword ptr [rbp-380h], xmm6
vmovss dword ptr [rsi+rdi*4], xmm0
}
goto LABEL_75;
}
v280 = v262 - 1;
v278 = 0;
goto LABEL_72;
}
}
}
_R8 = 4LL * v260;
_R10 = &_RAX[_R8];
_R9 = (__int64)&_RAX[_R8 + 4];
__asm
{
vmovss xmm0, dword ptr [rax+r8-4]
vaddss xmm0, xmm0, dword ptr [r10]
vaddss xmm0, xmm0, dword ptr [r9]
vaddss xmm0, xmm0, dword ptr [rcx+rdi*4]
vaddss xmm0, xmm0, dword ptr [rdx+rdi*4]
vmulss xmm0, xmm0, xmm3
vaddss xmm7, xmm0, dword ptr [rbp-380h]
vmovss dword ptr [rbp-380h], xmm7
vmovss dword ptr [rsi+rdi*4], xmm0
}
if ( v883 > v260 + 1 )
{
__asm
{
vmovss xmm0, dword ptr [r10]
vaddss xmm0, xmm0, dword ptr [r9]
vaddss xmm0, xmm0, dword ptr [rdi]
vaddss xmm0, xmm0, dword ptr [rcx+r8+4]
vaddss xmm0, xmm0, dword ptr [rdx+r8+4]
}
__asm
{
vmulss xmm0, xmm0, xmm3
vaddss xmm6, xmm0, xmm7
vmovss dword ptr [rsi+r8+4], xmm0
vmovss dword ptr [rbp-380h], xmm6
}
if ( v260 + 2 < v883 )
{
_R10 = (__int64)&_RAX[_R8 + 12];
__asm
{
vmovss xmm0, dword ptr [r9]
vaddss xmm0, xmm0, dword ptr [rdi]
vaddss xmm0, xmm0, dword ptr [r10]
vaddss xmm0, xmm0, dword ptr [rcx+r8+8]
vaddss xmm0, xmm0, dword ptr [rdx+r8+8]
}
__asm
{
vmulss xmm0, xmm0, xmm3
vaddss xmm6, xmm0, xmm6
vmovss dword ptr [rsi+r8+8], xmm0
vmovss dword ptr [rbp-380h], xmm6
}
if ( v883 > v260 + 3 )
{
__asm { vmovss xmm0, dword ptr [rax+r8+8] }
__asm
{
vaddss xmm0, xmm0, dword ptr [r10]
vaddss xmm0, xmm0, dword ptr [rdi]
vaddss xmm0, xmm0, dword ptr [rcx+r8+0Ch]
vaddss xmm0, xmm0, dword ptr [rdx+r8+0Ch]
vmulss xmm0, xmm0, xmm3
vaddss xmm7, xmm0, xmm6
vmovss dword ptr [rsi+r8+0Ch], xmm0
vmovss dword ptr [rbp-380h], xmm7
}
if ( v883 > v260 + 4 )
{
__asm
{
vmovss xmm0, dword ptr [r10]
vaddss xmm0, xmm0, dword ptr [rdi]
vaddss xmm0, xmm0, dword ptr [r9]
vaddss xmm0, xmm0, dword ptr [rcx+r8+10h]
vaddss xmm0, xmm0, dword ptr [rdx+r8+10h]
}
__asm
{
vmulss xmm0, xmm0, xmm3
vaddss xmm6, xmm0, xmm7
vmovss dword ptr [rsi+r8+10h], xmm0
vmovss dword ptr [rbp-380h], xmm6
}
if ( v883 > v260 + 5 )
{
__asm
{
vmovss xmm0, dword ptr [rax+r8+10h]
vaddss xmm0, xmm0, dword ptr [r9]
vaddss xmm0, xmm0, dword ptr [rdi]
vaddss xmm0, xmm0, dword ptr [rcx+r8+14h]
vaddss xmm0, xmm0, dword ptr [rdx+r8+14h]
}
__asm
{
vmulss xmm0, xmm0, xmm3
vaddss xmm7, xmm0, xmm6
vmovss dword ptr [rsi+r8+14h], xmm0
vmovss dword ptr [rbp-380h], xmm7
}
if ( v883 > v260 + 6 )
{
__asm
{
vmovss xmm0, dword ptr [rax+r8+14h]
vaddss xmm0, xmm0, dword ptr [rdi]
vaddss xmm0, xmm0, dword ptr [rax+r8+1Ch]
vaddss xmm0, xmm0, dword ptr [rcx+r8+18h]
vaddss xmm0, xmm0, dword ptr [rdx+r8+18h]
vmulss xmm0, xmm0, xmm3
vaddss xmm6, xmm0, xmm7
vmovss dword ptr [rsi+r8+18h], xmm0
vmovss dword ptr [rbp-380h], xmm6
}
}
}
}
}
}
}
}
LABEL_75:
v893 += v869;
v896 += v898;
}
while ( v888 != v886 );
__asm { vzeroupper }
LABEL_77:
ScopedTimer::~ScopedTimer((ScopedTimer *)v949);
*(double *)&_XMM0 = std::__ostream_insert<char,std::char_traits<char>>(
&std::cout,
"Blur scalar: checksum=",
22,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v305);
__asm { vcvtss2sd xmm0, xmm0, dword ptr [rbp-388h] }
v307 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0);
std::__ostream_insert<char,std::char_traits<char>>(
v307,
" time=",
7,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v308);
__asm { vmovsd xmm0, qword ptr [rbp-350h] }
v310 = std::ostream::_M_insert<double>(v307, *(double *)&_XMM0);
std::operator<<<std::char_traits<char>>(v310, " ms\n");
*(double *)&_XMM0 = std::__ostream_insert<char,std::char_traits<char>>(
&std::cout,
"Blur AVX : checksum=",
22,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v311);
__asm { vcvtss2sd xmm0, xmm0, dword ptr [rbp-380h] }
v313 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0);
std::__ostream_insert<char,std::char_traits<char>>(
v313,
" time=",
7,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v314);
__asm { vmovsd xmm0, qword ptr [rbp-348h] }
v316 = std::ostream::_M_insert<double>(v313, *(double *)&_XMM0);
std::operator<<<std::char_traits<char>>(v316, " ms\n");
std::__ostream_insert<char,std::char_traits<char>>(
&std::cout,
"Checksum delta (AVX - scalar): ",
31,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v317);
__asm
{
vmovss xmm3, dword ptr [rbp-380h]
vsubss xmm0, xmm3, dword ptr [rbp-388h]
}
__asm { vcvtss2sd xmm0, xmm0, xmm0 }
v321 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0);
std::operator<<<std::char_traits<char>>(v321, "\n");
std::operator<<<std::char_traits<char>>(&std::cout, "--------------------------------------------------------\n\n");
ComplexSoA::ComplexSoA((ComplexSoA *)&v935, 0x40000u);
ComplexSoA::ComplexSoA((ComplexSoA *)v938, 0x40000u);
ComplexSoA::ComplexSoA((ComplexSoA *)v940, 0);
ComplexSoA::ComplexSoA((ComplexSoA *)v942, 0);
ComplexSoA::ComplexSoA((ComplexSoA *)v944, 0);
ComplexSoA::ComplexSoA((ComplexSoA *)v946, 0);
fill_complex((ComplexSoA *)&v935, 0x1234ABCDu);
fill_complex((ComplexSoA *)v938, 0x9876FEDC);
std::vector<float>::vector(&v922, 16, v322);
_R12 = v922;
__asm { vmovss xmm2, cs:dword_6008 }
_RBX = 0;
__asm { vxorps xmm1, xmm1, xmm1 }
while ( 1 )
{
__asm
{
vsubss xmm0, xmm1, cs:dword_6018
vmulss xmm0, xmm0, cs:dword_601C
vmovss xmm3, cs:dword_6020
vxorps xmm1, xmm0, cs:xmmword_6490
vfnmadd132ss xmm2, xmm3, xmm3
vmulss xmm0, xmm1, xmm0; x
vmovss dword ptr [rbp-380h], xmm2
}
*(float *)&_XMM0 = expf(*(float *)&_XMM0);
__asm
{
vmulss xmm0, xmm0, dword ptr [rbp-380h]
vmovss dword ptr [r12+rbx*4], xmm0
}
if ( ++_RBX == 16 )
break;
__asm
{
vxorps xmm3, xmm3, xmm3
vcvtsi2ss xmm1, xmm3, rbx
vaddss xmm0, xmm1, cs:dword_6020
vmulss xmm0, xmm0, cs:dword_6024
vmulss xmm0, xmm0, cs:dword_6028; x
vmovss dword ptr [rbp-380h], xmm1
}
*(float *)&_XMM0 = cosf(*(float *)&_XMM0);
__asm
{
vmovss xmm1, dword ptr [rbp-380h]
vmovaps xmm2, xmm0
}
}
v891 = _R12;
std::operator<<<std::char_traits<char>>(&std::cout, "=== Workload 3: Complex multiply + FIR convolution ===\n");
v908 = 0;
v909 = 0;
std::string::basic_string<std::allocator<char>>(v948, "complex_mul_scalar");
ScopedTimer::ScopedTimer(v949, v948, &v908);
std::string::_M_dispose(v948);
_RBX = v935;
v874 = v936;
v340 = v936 - v935;
v900 = v936 - v935;
v341 = (v936 - v935) >> 2;
std::vector<float>::resize(v940, v341);
std::vector<float>::resize(v941, v340 >> 2);
if ( v340 )
{
v900 = v340;
_RCX = v937;
_RSI = v938[0];
_RDI = v939;
_RAX = v940[0];
_RDX = v941[0];
if ( (unsigned __int64)v340 > 0xC
&& (unsigned __int64)(v941[0] - (v939 + 4)) > 0x18
&& (unsigned __int64)(v940[0] - (v939 + 4)) > 0x18
&& (unsigned __int64)(v940[0] - (v938[0] + 4LL)) > 0x18
&& (unsigned __int64)(v941[0] - (_RBX + 4)) > 0x18
&& (unsigned __int64)(v940[0] - (_RBX + 4)) > 0x18
&& (unsigned __int64)(v940[0] - (v937 + 4)) > 0x18
&& (unsigned __int64)(v941[0] - (v937 + 4)) > 0x18
&& (unsigned __int64)(v941[0] - (v938[0] + 4LL)) > 0x18
&& (unsigned __int64)(v941[0] - (v940[0] + 4LL)) > 0x18 )
{
if ( v900 <= 0x1C )
{
LODWORD(v898) = 0;
_R8 = 0;
goto LABEL_89;
}
_R8 = 0;
LODWORD(v898) = 0;
__asm { vbroadcastss ymm3, cs:dword_602C }
__asm { vbroadcastss ymm2, cs:dword_6020 }
do
{
__asm
{
vmovups ymm4, ymmword ptr [rcx+r8]
vmulps ymm1, ymm4, ymmword ptr [rdi+r8]
vmulps ymm0, ymm4, ymmword ptr [rsi+r8]
vmovups ymm5, ymmword ptr [rbx+r8]
vfmsub231ps ymm1, ymm5, ymmword ptr [rsi+r8]
vfmadd231ps ymm0, ymm5, ymmword ptr [rdi+r8]
vmovups ymmword ptr [rax+r8], ymm1
vmovups ymmword ptr [rdx+r8], ymm0
vmulps ymm0, ymm0, ymm3
}
_R8 += 32;
__asm
{
vfmadd132ps ymm1, ymm0, ymm2
vmovss xmm0, dword ptr [rbp-398h]
vaddss xmm0, xmm0, xmm1
vshufps xmm5, xmm1, xmm1, 55h ; 'U'
vshufps xmm4, xmm1, xmm1, 0FFh
vaddss xmm0, xmm0, xmm5
vunpckhps xmm5, xmm1, xmm1
vextractf128 xmm1, ymm1, 1
vaddss xmm0, xmm0, xmm5
vaddss xmm0, xmm0, xmm4
vshufps xmm4, xmm1, xmm1, 55h ; 'U'
vaddss xmm0, xmm0, xmm1
vaddss xmm0, xmm0, xmm4
vunpckhps xmm4, xmm1, xmm1
vshufps xmm1, xmm1, xmm1, 0FFh
vaddss xmm0, xmm0, xmm4
vaddss xmm4, xmm0, xmm1
vmovss dword ptr [rbp-398h], xmm4
}
}
while ( _R8 != 32 * (v341 >> 3) );
_R8 = v341 & 0xFFFFFFFFFFFFFFF8LL;
if ( (v341 & 7) != 0 )
{
__asm { vzeroupper }
LABEL_89:
v370 = v341 - _R8;
if ( v341 - _R8 - 1 <= 2 )
goto LABEL_229;
__asm
{
vmovups xmm3, xmmword ptr [rcx+r8*4]
vmulps xmm0, xmm3, xmmword ptr [rdi+r8*4]
}
_R10 = 4 * _R8;
__asm
{
vbroadcastss xmm2, cs:dword_602C
vmovups xmm3, xmmword ptr [rbx+r8*4]
vfmsub231ps xmm0, xmm3, xmmword ptr [rsi+r8*4]
vmovups xmm3, xmmword ptr [rcx+r8*4]
vmulps xmm1, xmm3, xmmword ptr [rsi+r8*4]
vmovups xmm3, xmmword ptr [rbx+r8*4]
vfmadd231ps xmm1, xmm3, xmmword ptr [rdi+r8*4]
vmovups xmmword ptr [rax+r10], xmm0
vmovups xmmword ptr [rdx+r10], xmm1
vmulps xmm1, xmm1, xmm2
}
__asm { vbroadcastss xmm2, cs:dword_6020 }
_R8 += v370 & 0xFFFFFFFFFFFFFFFCLL;
__asm
{
vfmadd132ps xmm0, xmm1, xmm2
vmovss xmm1, dword ptr [rbp-398h]
vaddss xmm1, xmm1, xmm0
vshufps xmm2, xmm0, xmm0, 55h ; 'U'
vaddss xmm1, xmm1, xmm2
vunpckhps xmm2, xmm0, xmm0
vshufps xmm0, xmm0, xmm0, 0FFh
vaddss xmm1, xmm1, xmm2
vaddss xmm3, xmm1, xmm0
vmovss dword ptr [rbp-398h], xmm3
}
if ( (v370 & 3) != 0 )
{
LABEL_229:
__asm
{
vmovss xmm4, dword ptr [rcx+r8*4]
vmovss xmm0, dword ptr [rdi+r8*4]
}
_R9 = 4 * _R8;
__asm
{
vmovss xmm3, dword ptr [rbx+r8*4]
vmovss xmm2, dword ptr [rsi+r8*4]
vmulss xmm1, xmm0, xmm4
vfmsub231ss xmm1, xmm2, xmm3
vmulss xmm2, xmm2, xmm4
vmovss dword ptr [rax+r8*4], xmm1
vfmadd132ss xmm0, xmm2, xmm3
vmovss xmm3, cs:dword_602C
vmovss dword ptr [rdx+r8*4], xmm0
vmulss xmm0, xmm0, xmm3
vfmadd132ss xmm1, xmm0, cs:dword_6020
vaddss xmm6, xmm1, dword ptr [rbp-398h]
vmovss dword ptr [rbp-398h], xmm6
}
if ( _R8 + 1 < v341 )
{
__asm
{
vmovss xmm2, dword ptr [rcx+r9+4]
vmovss xmm5, dword ptr [rsi+r9+4]
}
__asm
{
vmovss xmm4, dword ptr [rdi+r9+4]
vmovss xmm0, dword ptr [rbx+r9+4]
vmulss xmm1, xmm2, xmm4
vmulss xmm2, xmm2, xmm5
vfmsub231ss xmm1, xmm0, xmm5
vfmadd132ss xmm0, xmm2, xmm4
vmovss dword ptr [rax+r9+4], xmm1
vmovss dword ptr [rdx+r9+4], xmm0
vmulss xmm0, xmm0, xmm3
vfmadd132ss xmm1, xmm0, cs:dword_6020
vaddss xmm6, xmm1, xmm6
vmovss dword ptr [rbp-398h], xmm6
}
if ( _R8 + 2 < v341 )
{
__asm
{
vmovss xmm2, dword ptr [rcx+r9+8]
vmovss xmm5, dword ptr [rsi+r9+8]
vmovss xmm4, dword ptr [rdi+r9+8]
vmovss xmm0, dword ptr [rbx+r9+8]
vmulss xmm1, xmm2, xmm4
vmulss xmm2, xmm2, xmm5
vfmsub231ss xmm1, xmm0, xmm5
vfmadd132ss xmm0, xmm2, xmm4
vmovss dword ptr [rax+r9+8], xmm1
vmovss dword ptr [rdx+r9+8], xmm0
vmulss xmm0, xmm0, xmm3
vfmadd132ss xmm1, xmm0, cs:dword_6020
vaddss xmm3, xmm1, xmm6
vmovss dword ptr [rbp-398h], xmm3
}
}
}
}
}
else
{
__asm { vzeroupper }
}
}
else
{
LODWORD(v898) = 0;
__asm { vmovss xmm5, cs:dword_602C }
_R8 = 0;
do
{
__asm
{
vmovss xmm3, dword ptr [rdi+r8*4]
vmovss xmm2, dword ptr [rcx+r8*4]
vmovss xmm4, dword ptr [rsi+r8*4]
vmovss xmm0, dword ptr [rbx+r8*4]
vmulss xmm1, xmm2, xmm3
vmulss xmm2, xmm2, xmm4
vfmsub231ss xmm1, xmm0, xmm4
vfmadd132ss xmm0, xmm2, xmm3
vmovss dword ptr [rax+r8*4], xmm1
vmovss dword ptr [rdx+r8*4], xmm0
vmulss xmm0, xmm0, xmm5
}
++_R8;
__asm
{
vfmadd132ss xmm1, xmm0, cs:dword_6020
vaddss xmm3, xmm1, dword ptr [rbp-398h]
vmovss dword ptr [rbp-398h], xmm3
}
}
while ( _R8 < v341 );
}
}
else
{
LODWORD(v898) = 0;
}
ScopedTimer::~ScopedTimer((ScopedTimer *)v949);
std::string::basic_string<std::allocator<char>>(v948, "complex_mul_avx");
ScopedTimer::ScopedTimer(v949, v948, &v909);
std::string::_M_dispose(v948);
std::vector<float>::resize(v942, v341);
std::vector<float>::resize(v943, v341);
if ( v900 <= 0x1C )
{
LODWORD(v901) = 0;
_RAX = 0;
}
else
{
LODWORD(v901) = 0;
_R9 = v937;
_RAX = 8;
_RSI = v942[0];
_RCX = v943[0];
__asm { vbroadcastss ymm3, cs:dword_602C }
__asm { vbroadcastss ymm2, cs:dword_6020 }
while ( 1 )
{
__asm
{
vmovups ymm5, ymmword ptr [r9+rax*4-20h]
vmulps ymm0, ymm5, ymmword ptr [rdi+rax*4-20h]
}
__asm
{
vmovups ymm4, ymmword ptr [rbx+rax*4-20h]
vfmsub231ps ymm0, ymm4, ymmword ptr [r8+rax*4-20h]
vmulps ymm1, ymm5, ymmword ptr [r8+rax*4-20h]
vfmadd231ps ymm1, ymm4, ymmword ptr [rdi+rax*4-20h]
vmovups ymmword ptr [rsi+rax*4-20h], ymm0
vmulps ymm0, ymm0, ymm2
vmovups ymmword ptr [rcx+rax*4-20h], ymm1
vfmadd132ps ymm1, ymm0, ymm3
vmovaps xmm0, xmm1
vextractf128 xmm1, ymm1, 1
vaddps xmm0, xmm0, xmm1
vmovshdup xmm1, xmm0
vaddps xmm0, xmm0, xmm1
vmovhlps xmm1, xmm1, xmm0
vaddss xmm0, xmm0, xmm1
vaddss xmm5, xmm0, dword ptr [rbp-380h]
vmovss dword ptr [rbp-380h], xmm5
}
if ( v341 < _RAX + 8 )
break;
_RAX += 8LL;
}
}
if ( _RAX < v341 )
{
_R15 = v937;
_RDX = 4 * _RAX;
_R8 = v938[0];
_RSI = v939;
v899 = v943[0];
_RDI = v942[0];
v881 = v341 - _RAX;
v870 = v341 - _RAX - 1;
if ( v870 <= 2 )
goto LABEL_187;
_RDX = 4 * _RAX;
v443 = 4 * _RAX + 4;
v894 = v942[0] + 4 * _RAX;
v897 = v899 + 4 * _RAX;
v444 = v894 - (_RBX + v443);
v445 = v444 < 0x18;
v446 = v444 == 24;
_R12 = v894;
v448 = !v445
&& !v446
&& (unsigned __int64)(v897 - (_RBX + v443)) > 0x18
&& (unsigned __int64)(v894 - (v937 + v443)) > 0x18;
v449 = v897 - (v937 + v443);
v445 = v449 < 0x18;
v446 = v449 == 24;
_R14 = v897;
if ( (unsigned __int64)(v897 - (v939 + v443)) <= 0x18
|| (unsigned __int64)(v897 - (v938[0] + v443)) <= 0x18
|| v445
|| v446
|| !v448
|| (unsigned __int64)(v894 - (v938[0] + v443)) <= 0x18
|| (unsigned __int64)(v894 - (v939 + v443)) <= 0x18 )
{
goto LABEL_187;
}
if ( (unsigned __int64)(v897 - (v942[0] + v443)) > 0x18 )
{
if ( v870 <= 6 )
{
_RCX = _RAX;
LABEL_107:
__asm
{
vmovups xmm4, xmmword ptr [r15+rcx*4]
vmovups xmm1, xmmword ptr [rsi+rcx*4]
}
_RDX = 4 * _RCX;
__asm
{
vmovups xmm3, xmmword ptr [rbx+rcx*4]
vmovups xmm2, xmmword ptr [r8+rcx*4]
vmulps xmm0, xmm1, xmm4
}
_RCX = v899;
__asm
{
vfmsub231ps xmm0, xmm2, xmm3
vmulps xmm2, xmm2, xmm4
vmovups xmmword ptr [rdi+rdx], xmm0
vfmadd132ps xmm1, xmm2, xmm3
vbroadcastss xmm2, cs:dword_602C
vmovups xmmword ptr [rcx+rdx], xmm1
vmulps xmm1, xmm1, xmm2
}
__asm { vbroadcastss xmm2, cs:dword_6020 }
_RAX += v881 & 0xFFFFFFFFFFFFFFFCLL;
__asm
{
vfmadd132ps xmm0, xmm1, xmm2
vmovss xmm1, dword ptr [rbp-380h]
vaddss xmm1, xmm1, xmm0
vshufps xmm2, xmm0, xmm0, 55h ; 'U'
vaddss xmm1, xmm1, xmm2
vunpckhps xmm2, xmm0, xmm0
vshufps xmm0, xmm0, xmm0, 0FFh
vaddss xmm1, xmm1, xmm2
vaddss xmm3, xmm1, xmm0
vmovss dword ptr [rbp-380h], xmm3
}
if ( (v881 & 3) != 0 )
{
LABEL_108:
__asm
{
vmovss xmm4, dword ptr [r15+rax*4]
vmovss xmm0, dword ptr [rsi+rax*4]
}
_RDX = 4 * _RAX;
__asm
{
vmovss xmm3, dword ptr [rbx+rax*4]
vmovss xmm2, dword ptr [r8+rax*4]
vmulss xmm1, xmm0, xmm4
}
_R11 = v899;
__asm
{
vfmsub231ss xmm1, xmm2, xmm3
vmulss xmm2, xmm2, xmm4
vmovss dword ptr [rdi+rax*4], xmm1
vfmadd132ss xmm0, xmm2, xmm3
vmovss xmm3, cs:dword_602C
vmovss dword ptr [r11+rax*4], xmm0
vmulss xmm0, xmm0, xmm3
vfmadd132ss xmm1, xmm0, cs:dword_6020
vaddss xmm6, xmm1, dword ptr [rbp-380h]
vmovss dword ptr [rbp-380h], xmm6
}
if ( _RAX + 1 < v341 )
{
__asm
{
vmovss xmm4, dword ptr [rsi+rdx+4]
vmovss xmm0, dword ptr [rbx+rdx+4]
}
__asm
{
vmovss xmm2, dword ptr [r15+rdx+4]
vmovss xmm5, dword ptr [r8+rdx+4]
vmulss xmm1, xmm2, xmm4
vmulss xmm2, xmm2, xmm5
vfmsub231ss xmm1, xmm0, xmm5
vfmadd132ss xmm0, xmm2, xmm4
vmovss dword ptr [rdi+rdx+4], xmm1
vmovss dword ptr [r11+rdx+4], xmm0
vmulss xmm0, xmm0, xmm3
vfmadd132ss xmm1, xmm0, cs:dword_6020
vaddss xmm7, xmm1, xmm6
vmovss dword ptr [rbp-380h], xmm7
}
if ( _RAX + 2 < v341 )
{
__asm
{
vmovss xmm5, dword ptr [r15+rdx+8]
vmovss xmm0, dword ptr [rsi+rdx+8]
vmovss xmm4, dword ptr [rbx+rdx+8]
vmovss xmm2, dword ptr [r8+rdx+8]
vmulss xmm1, xmm0, xmm5
vfmsub231ss xmm1, xmm2, xmm4
vmulss xmm2, xmm2, xmm5
vmovss dword ptr [rdi+rdx+8], xmm1
vfmadd132ss xmm0, xmm2, xmm4
vmovss dword ptr [r11+rdx+8], xmm0
vmulss xmm0, xmm0, xmm3
vfmadd132ss xmm1, xmm0, cs:dword_6020
vaddss xmm3, xmm1, xmm7
vmovss dword ptr [rbp-380h], xmm3
}
}
}
}
}
else
{
__asm
{
vmovups ymm4, ymmword ptr [r15+rdx]
vmovups ymm1, ymmword ptr [rsi+rdx]
vmovups ymm3, ymmword ptr [rbx+rdx]
vmovups ymm2, ymmword ptr [r8+rdx]
vmulps ymm0, ymm1, ymm4
}
v455 = v341 - _RAX;
v456 = v881 & 0xFFFFFFFFFFFFFFF8LL;
_RCX = _RAX + (v881 & 0xFFFFFFFFFFFFFFF8LL);
__asm { vfmsub231ps ymm0, ymm2, ymm3 }
_RAX = _RCX;
__asm
{
vmulps ymm2, ymm2, ymm4
vmovups ymmword ptr [r12], ymm0
vfmadd132ps ymm1, ymm2, ymm3
vbroadcastss ymm2, cs:dword_602C
vmovups ymmword ptr [r14], ymm1
vmulps ymm1, ymm1, ymm2
vbroadcastss ymm2, cs:dword_6020
vfmadd132ps ymm0, ymm1, ymm2
vmovss xmm2, dword ptr [rbp-380h]
vaddss xmm2, xmm2, xmm0
vshufps xmm3, xmm0, xmm0, 55h ; 'U'
vshufps xmm1, xmm0, xmm0, 0FFh
vaddss xmm3, xmm3, xmm2
vunpckhps xmm2, xmm0, xmm0
vextractf128 xmm0, ymm0, 1
vaddss xmm2, xmm2, xmm3
vaddss xmm1, xmm1, xmm2
vshufps xmm2, xmm0, xmm0, 55h ; 'U'
vaddss xmm1, xmm0, xmm1
vaddss xmm2, xmm2, xmm1
vunpckhps xmm1, xmm0, xmm0
vshufps xmm0, xmm0, xmm0, 0FFh
vaddss xmm1, xmm1, xmm2
vaddss xmm3, xmm1, xmm0
vmovss dword ptr [rbp-380h], xmm3
}
if ( (v881 & 7) != 0 )
{
v881 -= v456;
if ( v455 - v456 - 1 > 2 )
goto LABEL_107;
goto LABEL_108;
}
}
}
else
{
LABEL_187:
__asm
{
vmovss xmm2, dword ptr [r15+rax*4]
vmovss xmm4, dword ptr [r8+rax*4]
}
__asm
{
vmovss xmm3, dword ptr [rsi+rax*4]
vmovss xmm0, dword ptr [rbx+rax*4]
}
_R11 = v899;
__asm
{
vmulss xmm1, xmm2, xmm3
vmulss xmm2, xmm2, xmm4
vfmsub231ss xmm1, xmm0, xmm4
vfmadd132ss xmm0, xmm2, xmm3
vmovss xmm2, cs:dword_602C
vmovss dword ptr [rdi+rax*4], xmm1
vmovss dword ptr [r11+rax*4], xmm0
vmulss xmm0, xmm0, xmm2
vfmadd132ss xmm1, xmm0, cs:dword_6020
vaddss xmm6, xmm1, dword ptr [rbp-380h]
vmovss dword ptr [rbp-380h], xmm6
}
if ( _RAX + 1 < v341 )
{
__asm
{
vmovss xmm4, dword ptr [rsi+rdx+4]
vmovss xmm0, dword ptr [rbx+rdx+4]
}
__asm
{
vmovss xmm3, dword ptr [r15+rdx+4]
vmovss xmm5, dword ptr [r8+rdx+4]
vmulss xmm1, xmm3, xmm4
vmulss xmm3, xmm3, xmm5
vfmsub231ss xmm1, xmm0, xmm5
vfmadd132ss xmm0, xmm3, xmm4
vmovss dword ptr [rdi+rdx+4], xmm1
vmovss dword ptr [r11+rdx+4], xmm0
vmulss xmm0, xmm0, xmm2
vfmadd132ss xmm1, xmm0, cs:dword_6020
vaddss xmm6, xmm1, xmm6
vmovss dword ptr [rbp-380h], xmm6
}
if ( _RAX + 2 < v341 )
{
__asm
{
vmovss xmm0, dword ptr [rsi+rdx+8]
vmovss xmm5, dword ptr [r15+rdx+8]
}
__asm
{
vmovss xmm4, dword ptr [rbx+rdx+8]
vmovss xmm3, dword ptr [r8+rdx+8]
vmulss xmm1, xmm0, xmm5
vfmsub231ss xmm1, xmm3, xmm4
vmulss xmm3, xmm3, xmm5
vmovss dword ptr [rdi+rdx+8], xmm1
vfmadd132ss xmm0, xmm3, xmm4
vmovss dword ptr [r11+rdx+8], xmm0
vmulss xmm0, xmm0, xmm2
vfmadd132ss xmm1, xmm0, cs:dword_6020
vaddss xmm6, xmm1, xmm6
vmovss dword ptr [rbp-380h], xmm6
}
if ( _RAX + 3 < v341 )
{
__asm
{
vmovss xmm4, dword ptr [rsi+rdx+0Ch]
vmovss xmm0, dword ptr [rbx+rdx+0Ch]
}
__asm
{
vmovss xmm3, dword ptr [r15+rdx+0Ch]
vmovss xmm5, dword ptr [r8+rdx+0Ch]
vmulss xmm1, xmm3, xmm4
vmulss xmm3, xmm3, xmm5
vfmsub231ss xmm1, xmm0, xmm5
vfmadd132ss xmm0, xmm3, xmm4
vmovss dword ptr [rdi+rdx+0Ch], xmm1
vmovss dword ptr [r11+rdx+0Ch], xmm0
vmulss xmm0, xmm0, xmm2
vfmadd132ss xmm1, xmm0, cs:dword_6020
vaddss xmm7, xmm1, xmm6
vmovss dword ptr [rbp-380h], xmm7
}
if ( _RAX + 4 < v341 )
{
__asm
{
vmovss xmm4, dword ptr [rsi+rdx+10h]
vmovss xmm0, dword ptr [rbx+rdx+10h]
}
__asm
{
vmovss xmm3, dword ptr [r15+rdx+10h]
vmovss xmm5, dword ptr [r8+rdx+10h]
vmulss xmm1, xmm3, xmm4
vmulss xmm3, xmm3, xmm5
vfmsub231ss xmm1, xmm0, xmm5
vfmadd132ss xmm0, xmm3, xmm4
vmovss dword ptr [rdi+rdx+10h], xmm1
vmovss dword ptr [r11+rdx+10h], xmm0
vmulss xmm0, xmm0, xmm2
vfmadd132ss xmm1, xmm0, cs:dword_6020
vaddss xmm7, xmm1, xmm7
vmovss dword ptr [rbp-380h], xmm7
}
if ( _RAX + 5 < v341 )
{
__asm
{
vmovss xmm4, dword ptr [rsi+rdx+14h]
vmovss xmm0, dword ptr [rbx+rdx+14h]
}
__asm
{
vmovss xmm3, dword ptr [r15+rdx+14h]
vmovss xmm5, dword ptr [r8+rdx+14h]
vmulss xmm1, xmm3, xmm4
vmulss xmm3, xmm3, xmm5
vfmsub231ss xmm1, xmm0, xmm5
vfmadd132ss xmm0, xmm3, xmm4
vmovss dword ptr [rdi+rdx+14h], xmm1
vmovss dword ptr [r11+rdx+14h], xmm0
vmulss xmm0, xmm0, xmm2
vfmadd132ss xmm1, xmm0, cs:dword_6020
vaddss xmm6, xmm1, xmm7
vmovss dword ptr [rbp-380h], xmm6
}
if ( _RAX + 6 < v341 )
{
__asm
{
vmovss xmm4, dword ptr [rsi+rdx+18h]
vmovss xmm0, dword ptr [rbx+rdx+18h]
}
__asm
{
vmovss xmm3, dword ptr [r15+rdx+18h]
vmovss xmm5, dword ptr [r8+rdx+18h]
vmulss xmm1, xmm3, xmm4
vmulss xmm3, xmm3, xmm5
vfmsub231ss xmm1, xmm0, xmm5
vfmadd132ss xmm0, xmm3, xmm4
vmovss dword ptr [rdi+rdx+18h], xmm1
vmovss dword ptr [r11+rdx+18h], xmm0
vmulss xmm0, xmm0, xmm2
vfmadd132ss xmm1, xmm0, cs:dword_6020
vaddss xmm7, xmm1, xmm6
vmovss dword ptr [rbp-380h], xmm7
}
if ( _RAX + 7 < v341 )
{
__asm
{
vmovss xmm5, dword ptr [r15+rdx+1Ch]
vmovss xmm0, dword ptr [rsi+rdx+1Ch]
vmovss xmm4, dword ptr [rbx+rdx+1Ch]
vmovss xmm3, dword ptr [r8+rdx+1Ch]
vmulss xmm1, xmm0, xmm5
vfmsub231ss xmm1, xmm3, xmm4
vmulss xmm3, xmm3, xmm5
vmovss dword ptr [rdi+rdx+1Ch], xmm1
vfmadd132ss xmm0, xmm3, xmm4
vmovss dword ptr [r11+rdx+1Ch], xmm0
vmulss xmm0, xmm0, xmm2
vfmadd132ss xmm1, xmm0, cs:dword_6020
vaddss xmm3, xmm1, xmm7
vmovss dword ptr [rbp-380h], xmm3
}
}
}
}
}
}
}
}
}
}
__asm { vzeroupper }
ScopedTimer::~ScopedTimer((ScopedTimer *)v949);
v910 = 0;
v911 = 0;
std::string::basic_string<std::allocator<char>>(v948, "complex_fir_scalar");
ScopedTimer::ScopedTimer(v949, v948, &v910);
std::string::_M_dispose(v948);
_R12 = v891;
v520 = v923 - v891;
v521 = (v923 - v891) >> 2;
std::vector<float>::resize(v944, v341);
std::vector<float>::resize(v945, v341);
if ( v900 )
{
_R8 = v944[0];
LODWORD(v900) = 0;
_RCX = 0;
_RDI = v945[0];
__asm
{
vmovss xmm4, cs:dword_6030
vmovss xmm3, cs:dword_6034
}
do
{
if ( v520 )
{
__asm { vxorps xmm1, xmm1, xmm1 }
v526 = 4 * _RCX;
_RAX = 0;
__asm { vmovaps xmm2, xmm1 }
do
{
__asm { vmovss xmm0, dword ptr [r12+rax*4] }
++_RAX;
__asm
{
vfmadd231ss xmm2, xmm0, dword ptr [rbx+rdx]
vfmadd231ss xmm1, xmm0, dword ptr [rsi+rdx]
}
v526 -= 4;
}
while ( _RCX >= _RAX && _RAX < v521 );
__asm
{
vmulss xmm0, xmm1, xmm4
vfmadd231ss xmm0, xmm2, xmm3
}
}
else
{
__asm
{
vxorps xmm0, xmm0, xmm0
vmovaps xmm1, xmm0
vmovaps xmm2, xmm0
}
}
__asm
{
vaddss xmm5, xmm0, dword ptr [rbp-388h]
vmovss dword ptr [r8+rcx*4], xmm2
vmovss dword ptr [rdi+rcx*4], xmm1
}
++_RCX;
__asm { vmovss dword ptr [rbp-388h], xmm5 }
}
while ( _RCX < v341 );
}
else
{
LODWORD(v900) = 0;
}
ScopedTimer::~ScopedTimer((ScopedTimer *)v949);
std::string::basic_string<std::allocator<char>>(v948, "complex_fir_avx");
ScopedTimer::ScopedTimer(v949, v948, &v911);
std::string::_M_dispose(v948);
std::vector<float>::resize(v946, v341);
std::vector<float>::resize(v947, v341);
if ( v874 == _RBX )
{
LODWORD(v899) = 0;
}
else
{
_RSI = v937;
_R9 = v946[0];
LODWORD(v899) = 0;
v533 = 1;
_R8 = v947[0];
_R12 = v891;
_RCX = 0;
__asm
{
vmovss xmm4, cs:dword_6030
vmovss xmm3, cs:dword_6034
}
do
{
if ( v520 <= 0x1C || v533 <= 7 )
{
__asm { vxorps xmm2, xmm2, xmm2 }
_RAX = 0;
__asm { vmovaps ymm1, ymm2 }
}
else
{
__asm { vxorps xmm2, xmm2, xmm2 }
_RDX = v533;
_RAX = 8;
__asm { vmovaps ymm1, ymm2 }
while ( 1 )
{
if ( v341 >= _RDX )
{
__asm
{
vmovups ymm6, ymmword ptr [rbx+rdx*4-20h]
vmovups ymm7, ymmword ptr [rsi+rdx*4-20h]
vfmadd231ps ymm1, ymm6, ymmword ptr [r12+rax*4-20h]
vfmadd231ps ymm2, ymm7, ymmword ptr [r12+rax*4-20h]
}
}
if ( v521 < _RAX + 8 )
break;
_RDX -= 8LL;
if ( v533 < _RAX + 8 )
break;
_RAX += 8LL;
}
}
__asm
{
vmovaps xmm0, xmm1
vextractf128 xmm1, ymm1, 1
vaddps xmm0, xmm0, xmm1
vmovshdup xmm1, xmm0
vaddps xmm0, xmm0, xmm1
vmovhlps xmm1, xmm1, xmm0
vaddss xmm0, xmm0, xmm1
vmovaps xmm1, xmm2
vextractf128 xmm2, ymm2, 1
vaddps xmm1, xmm1, xmm2
vmovshdup xmm2, xmm1
vaddps xmm1, xmm1, xmm2
vmovhlps xmm2, xmm2, xmm1
vaddss xmm1, xmm1, xmm2
}
if ( _RCX >= _RAX && _RAX < v521 )
{
v557 = 4 * (_RCX - _RAX);
do
{
__asm { vmovss xmm2, dword ptr [r12+rax*4] }
++_RAX;
__asm
{
vfmadd231ss xmm0, xmm2, dword ptr [rbx+rdx]
vfmadd231ss xmm1, xmm2, dword ptr [rsi+rdx]
}
v557 -= 4;
}
while ( _RCX >= _RAX && _RAX < v521 );
}
__asm { vmovss dword ptr [r9+rcx*4], xmm0 }
++v533;
__asm
{
vmovss dword ptr [r8+rcx*4], xmm1
vmulss xmm1, xmm1, xmm4
}
++_RCX;
__asm
{
vfmadd132ss xmm0, xmm1, xmm3
vaddss xmm6, xmm0, dword ptr [rbp-390h]
vmovss dword ptr [rbp-390h], xmm6
}
}
while ( _RCX < v341 );
__asm { vzeroupper }
}
ScopedTimer::~ScopedTimer((ScopedTimer *)v949);
*(double *)&_XMM0 = std::__ostream_insert<char,std::char_traits<char>>(
&std::cout,
"Complex mul scalar: checksum=",
29,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v559);
__asm { vcvtss2sd xmm0, xmm0, dword ptr [rbp-398h] }
v561 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0);
std::__ostream_insert<char,std::char_traits<char>>(
v561,
" time=",
7,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v562);
__asm { vmovsd xmm0, qword ptr [rbp-340h] }
v564 = std::ostream::_M_insert<double>(v561, *(double *)&_XMM0);
std::operator<<<std::char_traits<char>>(v564, " ms\n");
*(double *)&_XMM0 = std::__ostream_insert<char,std::char_traits<char>>(
&std::cout,
"Complex mul AVX : checksum=",
29,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v565);
__asm { vcvtss2sd xmm0, xmm0, dword ptr [rbp-380h] }
v567 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0);
std::__ostream_insert<char,std::char_traits<char>>(
v567,
" time=",
7,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v568);
__asm { vmovsd xmm0, qword ptr [rbp-338h] }
v570 = std::ostream::_M_insert<double>(v567, *(double *)&_XMM0);
std::operator<<<std::char_traits<char>>(v570, " ms\n");
*(double *)&_XMM0 = std::__ostream_insert<char,std::char_traits<char>>(
&std::cout,
"FIR scalar : checksum=",
29,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v571);
__asm { vcvtss2sd xmm0, xmm0, dword ptr [rbp-388h] }
v573 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0);
std::__ostream_insert<char,std::char_traits<char>>(
v573,
" time=",
7,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v574);
__asm { vmovsd xmm0, qword ptr [rbp-330h] }
v576 = std::ostream::_M_insert<double>(v573, *(double *)&_XMM0);
std::operator<<<std::char_traits<char>>(v576, " ms\n");
*(double *)&_XMM0 = std::__ostream_insert<char,std::char_traits<char>>(
&std::cout,
"FIR AVX : checksum=",
29,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v577);
__asm { vcvtss2sd xmm0, xmm0, dword ptr [rbp-390h] }
v579 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0);
std::__ostream_insert<char,std::char_traits<char>>(
v579,
" time=",
7,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v580);
__asm { vmovsd xmm0, qword ptr [rbp-328h] }
v582 = std::ostream::_M_insert<double>(v579, *(double *)&_XMM0);
std::operator<<<std::char_traits<char>>(v582, " ms\n");
std::__ostream_insert<char,std::char_traits<char>>(
&std::cout,
"Delta cmul checksum (AVX - scalar): ",
36,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v583);
__asm
{
vmovss xmm3, dword ptr [rbp-380h]
vsubss xmm0, xmm3, dword ptr [rbp-398h]
}
__asm { vcvtss2sd xmm0, xmm0, xmm0 }
v587 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0);
std::operator<<<std::char_traits<char>>(v587, "\n");
std::__ostream_insert<char,std::char_traits<char>>(
&std::cout,
"Delta FIR checksum (AVX - scalar): ",
36,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v588);
__asm
{
vmovss xmm3, dword ptr [rbp-390h]
vsubss xmm0, xmm3, dword ptr [rbp-388h]
}
__asm { vcvtss2sd xmm0, xmm0, xmm0 }
v592 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0);
std::operator<<<std::char_traits<char>>(v592, "\n");
std::operator<<<std::char_traits<char>>(&std::cout, "--------------------------------------------------------\n\n");
std::operator<<<std::char_traits<char>>(&std::cout, "=== Workload 4: Soft clip / limiter on FIR output ===\n");
v912 = 0;
v913 = 0;
std::vector<float>::vector(&v924, v944);
std::vector<float>::vector(&v926, &v924);
std::string::basic_string<std::allocator<char>>(v948, "soft_clip_scalar");
ScopedTimer::ScopedTimer(v949, v948, &v912);
std::string::_M_dispose(v948);
v594 = v924;
v595 = v925 - v924;
v596 = (v925 - v924) >> 2;
if ( v925 == v924 )
{
LODWORD(v900) = 0;
}
else
{
v597 = 1;
if ( v595 )
v597 = (v925 - v924) >> 2;
v598 = v597;
if ( v595 <= 0x1C )
{
LODWORD(v900) = 0;
v623 = 0;
goto LABEL_144;
}
_RAX = v924;
__asm { vpcmpeqd ymm4, ymm4, ymm4 }
LODWORD(v900) = 0;
__asm { vbroadcastss ymm3, cs:dword_6010 }
__asm { vbroadcastss ymm2, cs:dword_600C }
v603 = v924 + 32 * (v598 >> 3);
do
{
__asm { vmovups ymm0, ymmword ptr [rax] }
_RAX += 32;
__asm
{
vcmpltps ymm5, ymm2, ymm0
vcmpltps ymm1, ymm0, ymm3
vpor ymm1, ymm5, ymm1
vpxor ymm1, ymm1, ymm4
vblendvps ymm0, ymm3, ymm0, ymm1
vblendvps ymm0, ymm0, ymm2, ymm5
vmovss xmm5, dword ptr [rbp-388h]
vshufps xmm6, xmm0, xmm0, 55h ; 'U'
vshufps xmm1, xmm0, xmm0, 0FFh
vmovups ymmword ptr [rax-20h], ymm0
vaddss xmm5, xmm5, xmm0
vaddss xmm6, xmm6, xmm5
vunpckhps xmm5, xmm0, xmm0
vextractf128 xmm0, ymm0, 1
vaddss xmm5, xmm5, xmm6
vaddss xmm1, xmm1, xmm5
vshufps xmm5, xmm0, xmm0, 55h ; 'U'
vaddss xmm1, xmm0, xmm1
vaddss xmm5, xmm5, xmm1
vunpckhps xmm1, xmm0, xmm0
vshufps xmm0, xmm0, xmm0, 0FFh
vaddss xmm1, xmm1, xmm5
vaddss xmm5, xmm1, xmm0
vmovss dword ptr [rbp-388h], xmm5
}
}
while ( _RAX != v603 );
v623 = v598 & 0xFFFFFFFFFFFFFFF8LL;
if ( (v598 & 7) != 0 )
{
__asm { vzeroupper }
LABEL_144:
v624 = v598 - v623;
v625 = v598 - v623 - 1;
v626 = v625 <= 2;
if ( v625 <= 2 )
goto LABEL_230;
_RDX = v594 + 4 * v623;
__asm
{
vpcmpeqd xmm5, xmm5, xmm5
vbroadcastss xmm3, cs:dword_600C
vbroadcastss xmm0, cs:dword_6010
vmovups xmm2, xmmword ptr [rdx]
vcmpltps xmm4, xmm3, xmm2
vcmpltps xmm1, xmm2, xmm0
vpor xmm1, xmm4, xmm1
vpxor xmm1, xmm1, xmm5
vblendvps xmm0, xmm0, xmm2, xmm1
vmovss xmm1, dword ptr [rbp-388h]
vblendvps xmm0, xmm0, xmm3, xmm4
vaddss xmm1, xmm1, xmm0
vshufps xmm2, xmm0, xmm0, 55h ; 'U'
vmovups xmmword ptr [rdx], xmm0
}
v623 += v624 & 0xFFFFFFFFFFFFFFFCLL;
__asm
{
vaddss xmm2, xmm2, xmm1
vunpckhps xmm1, xmm0, xmm0
vshufps xmm0, xmm0, xmm0, 0FFh
vaddss xmm1, xmm1, xmm2
vaddss xmm3, xmm1, xmm0
vmovss dword ptr [rbp-388h], xmm3
}
v626 = (v624 & 3) == 0;
if ( (v624 & 3) != 0 )
{
LABEL_230:
v642 = 4 * v623;
_RDX = v594 + 4 * v623;
__asm
{
vmovss xmm0, dword ptr [rdx]
vcomiss xmm0, cs:dword_600C
}
if ( v626 )
{
__asm
{
vmovss xmm1, cs:dword_6010
vmaxss xmm0, xmm1, xmm0
}
}
else
{
__asm { vmovss xmm0, cs:dword_600C }
}
__asm
{
vaddss xmm3, xmm0, dword ptr [rbp-388h]
vmovss dword ptr [rdx], xmm0
}
__asm { vmovss dword ptr [rbp-388h], xmm3 }
if ( v623 + 1 < v596 )
{
_RDX = v594 + v642 + 4;
__asm
{
vmovss xmm0, dword ptr [rdx]
vcomiss xmm0, cs:dword_600C
}
if ( v623 + 1 > v596 )
{
__asm { vmovss xmm0, cs:dword_600C }
}
else
{
__asm
{
vmovss xmm1, cs:dword_6010
vmaxss xmm0, xmm1, xmm0
}
}
__asm { vaddss xmm3, xmm0, dword ptr [rbp-388h] }
v647 = v623 + 2;
__asm
{
vmovss dword ptr [rdx], xmm0
vmovss dword ptr [rbp-388h], xmm3
}
if ( v647 < v596 )
{
_RAX = v594 + v642 + 8;
__asm
{
vmovss xmm0, dword ptr [rax]
vcomiss xmm0, cs:dword_600C
}
if ( v647 <= v596 )
{
__asm
{
vmovss xmm1, cs:dword_6010
vmaxss xmm0, xmm1, xmm0
}
}
else
{
__asm { vmovss xmm0, cs:dword_600C }
}
__asm
{
vaddss xmm3, xmm0, dword ptr [rbp-388h]
vmovss dword ptr [rax], xmm0
vmovss dword ptr [rbp-388h], xmm3
}
}
}
}
}
else
{
__asm { vzeroupper }
}
}
ScopedTimer::~ScopedTimer((ScopedTimer *)v949);
std::string::basic_string<std::allocator<char>>(v948, "soft_clip_avx");
ScopedTimer::ScopedTimer(v949, v948, &v913);
std::string::_M_dispose(v948);
_RCX = v926;
v652 = (v927 - v926) >> 2;
if ( (unsigned __int64)(v927 - v926) <= 0x1C )
{
LODWORD(v901) = 0;
_RAX = 0;
}
else
{
LODWORD(v901) = 0;
_RAX = 8;
__asm
{
vbroadcastss ymm3, cs:dword_6010
vbroadcastss ymm2, cs:dword_600C
}
while ( 1 )
{
__asm { vmovups ymm4, ymmword ptr [rcx+rax*4-20h] }
__asm
{
vminps ymm1, ymm4, ymm2
vmaxps ymm1, ymm1, ymm3
vmovaps xmm0, xmm1
vmovups ymmword ptr [rcx+rax*4-20h], ymm1
vextractf128 xmm1, ymm1, 1
vaddps xmm0, xmm0, xmm1
vmovshdup xmm1, xmm0
vaddps xmm0, xmm0, xmm1
vmovhlps xmm1, xmm1, xmm0
vaddss xmm0, xmm0, xmm1
vaddss xmm7, xmm0, dword ptr [rbp-380h]
vmovss dword ptr [rbp-380h], xmm7
}
if ( v652 < _RAX + 8 )
break;
_RAX += 8LL;
}
}
if ( _RAX < v652 )
{
v665 = v652 - _RAX;
if ( v652 - _RAX - 1 <= 6 )
{
v685 = _RAX;
v681 = 0;
goto LABEL_162;
}
_RDI = _RCX + 4 * _RAX;
__asm
{
vpcmpeqd ymm5, ymm5, ymm5
vbroadcastss ymm3, cs:dword_600C
vbroadcastss ymm0, cs:dword_6010
vmovups ymm2, ymmword ptr [rdi]
vcmpltps ymm4, ymm3, ymm2
vcmpltps ymm1, ymm2, ymm0
vpor ymm1, ymm4, ymm1
vpxor ymm1, ymm1, ymm5
vblendvps ymm0, ymm0, ymm2, ymm1
vmovss xmm2, dword ptr [rbp-380h]
vblendvps ymm0, ymm0, ymm3, ymm4
vaddss xmm2, xmm2, xmm0
vshufps xmm3, xmm0, xmm0, 55h ; 'U'
vshufps xmm1, xmm0, xmm0, 0FFh
vmovups ymmword ptr [rdi], ymm0
}
v681 = v665 & 0xFFFFFFFFFFFFFFF8LL;
__asm
{
vaddss xmm3, xmm3, xmm2
vunpckhps xmm2, xmm0, xmm0
vextractf128 xmm0, ymm0, 1
}
v685 = _RAX + (v665 & 0xFFFFFFFFFFFFFFF8LL);
__asm
{
vaddss xmm2, xmm2, xmm3
vaddss xmm1, xmm1, xmm2
vshufps xmm2, xmm0, xmm0, 55h ; 'U'
vaddss xmm1, xmm0, xmm1
vaddss xmm2, xmm2, xmm1
vunpckhps xmm1, xmm0, xmm0
vshufps xmm0, xmm0, xmm0, 0FFh
vaddss xmm1, xmm1, xmm2
vaddss xmm3, xmm1, xmm0
vmovss dword ptr [rbp-380h], xmm3
}
if ( (v665 & 7) != 0 )
{
LABEL_162:
v691 = v665 - v681;
v692 = v691 - 1 <= 2;
if ( v691 - 1 <= 2 )
goto LABEL_231;
__asm { vbroadcastss xmm3, cs:dword_600C }
__asm
{
vpcmpeqd xmm5, xmm5, xmm5
vbroadcastss xmm0, cs:dword_6010
}
_RDI = _RCX + 4 * (v681 + _RAX);
__asm { vmovups xmm2, xmmword ptr [rdi] }
v685 += v691 & 0xFFFFFFFFFFFFFFFCLL;
v698 = v691 & 3;
v692 = v698 == 0;
__asm
{
vcmpltps xmm4, xmm3, xmm2
vcmpltps xmm1, xmm2, xmm0
vpor xmm1, xmm4, xmm1
vpxor xmm1, xmm1, xmm5
vblendvps xmm0, xmm0, xmm2, xmm1
vmovss xmm1, dword ptr [rbp-380h]
vblendvps xmm0, xmm0, xmm3, xmm4
vaddss xmm1, xmm1, xmm0
vshufps xmm2, xmm0, xmm0, 55h ; 'U'
vmovups xmmword ptr [rdi], xmm0
vaddss xmm2, xmm2, xmm1
vunpckhps xmm1, xmm0, xmm0
vshufps xmm0, xmm0, xmm0, 0FFh
vaddss xmm1, xmm1, xmm2
vaddss xmm3, xmm1, xmm0
vmovss dword ptr [rbp-380h], xmm3
}
if ( v698 )
{
LABEL_231:
v709 = 4 * v685;
_RAX = _RCX + 4 * v685;
__asm
{
vmovss xmm0, dword ptr [rax]
vcomiss xmm0, cs:dword_600C
}
if ( v692 )
{
__asm
{
vmovss xmm1, cs:dword_6010
vmaxss xmm0, xmm1, xmm0
}
}
else
{
__asm { vmovss xmm0, cs:dword_600C }
}
__asm
{
vaddss xmm3, xmm0, dword ptr [rbp-380h]
vmovss dword ptr [rax], xmm0
}
__asm { vmovss dword ptr [rbp-380h], xmm3 }
if ( v685 + 1 < v652 )
{
_RAX = _RCX + v709 + 4;
__asm
{
vmovss xmm0, dword ptr [rax]
vcomiss xmm0, cs:dword_600C
}
if ( v685 + 1 > v652 )
{
__asm { vmovss xmm0, cs:dword_600C }
}
else
{
__asm
{
vmovss xmm1, cs:dword_6010
vmaxss xmm0, xmm1, xmm0
}
}
__asm
{
vaddss xmm3, xmm0, dword ptr [rbp-380h]
vmovss dword ptr [rax], xmm0
}
__asm { vmovss dword ptr [rbp-380h], xmm3 }
if ( v685 + 2 < v652 )
{
_RAX = _RCX + v709 + 8;
__asm
{
vmovss xmm0, dword ptr [rax]
vcomiss xmm0, cs:dword_600C
}
if ( v685 + 2 <= v652 )
{
__asm
{
vmovss xmm1, cs:dword_6010
vmaxss xmm0, xmm1, xmm0
}
}
else
{
__asm { vmovss xmm0, cs:dword_600C }
}
__asm
{
vaddss xmm3, xmm0, dword ptr [rbp-380h]
vmovss dword ptr [rax], xmm0
vmovss dword ptr [rbp-380h], xmm3
}
}
}
}
}
}
__asm { vzeroupper }
ScopedTimer::~ScopedTimer((ScopedTimer *)v949);
*(double *)&_XMM0 = std::__ostream_insert<char,std::char_traits<char>>(
&std::cout,
"Soft clip scalar: checksum=",
27,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v716);
__asm { vcvtss2sd xmm0, xmm0, dword ptr [rbp-388h] }
v718 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0);
std::__ostream_insert<char,std::char_traits<char>>(
v718,
" time=",
7,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v719);
__asm { vmovsd xmm0, qword ptr [rbp-320h] }
v721 = std::ostream::_M_insert<double>(v718, *(double *)&_XMM0);
std::operator<<<std::char_traits<char>>(v721, " ms\n");
*(double *)&_XMM0 = std::__ostream_insert<char,std::char_traits<char>>(
&std::cout,
"Soft clip AVX : checksum=",
27,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v722);
__asm { vcvtss2sd xmm0, xmm0, dword ptr [rbp-380h] }
v724 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0);
std::__ostream_insert<char,std::char_traits<char>>(
v724,
" time=",
7,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v725);
__asm { vmovsd xmm0, qword ptr [rbp-318h] }
v727 = std::ostream::_M_insert<double>(v724, *(double *)&_XMM0);
std::operator<<<std::char_traits<char>>(v727, " ms\n");
std::__ostream_insert<char,std::char_traits<char>>(
&std::cout,
"Delta clip checksum (AVX - scalar): ",
36,
*(double *)&_XMM0,
*(double *)&_XMM1,
*(double *)&_XMM2,
*(double *)&_XMM3,
v728);
__asm
{
vmovss xmm3, dword ptr [rbp-380h]
vsubss xmm0, xmm3, dword ptr [rbp-388h]
}
__asm { vcvtss2sd xmm0, xmm0, xmm0 }
v732 = std::ostream::_M_insert<double>(&std::cout, *(double *)&_XMM0);
std::operator<<<std::char_traits<char>>(v732, "\n");
std::operator<<<std::char_traits<char>>(&std::cout, "\nDone.\n");
std::vector<float>::~vector(&v926);
std::vector<float>::~vector(&v924);
std::vector<float>::~vector(&v922);
std::vector<float>::~vector(v947);
std::vector<float>::~vector(v946);
std::vector<float>::~vector(v945);
std::vector<float>::~vector(v944);
std::vector<float>::~vector(v943);
std::vector<float>::~vector(v942);
std::vector<float>::~vector(v941);
std::vector<float>::~vector(v940);
std::vector<float>::~vector(&v939);
std::vector<float>::~vector(v938);
std::vector<float>::~vector(&v937);
std::vector<float>::~vector(&v935);
std::vector<float>::~vector(v934);
std::vector<float>::~vector(v932);
std::vector<float>::~vector(&v929);
std::vector<float>::~vector(v921);
std::vector<float>::~vector(&v919);
std::vector<float>::~vector(v918);
std::vector<float>::~vector(&v916);
std::vector<float>::~vector(&v914);
return 0;
}
// External data symbols emitted by the decompiler ahead of its rendering of main().
// g_400000 is only ever used by address (&g_400000) in the visible part of that main().
// NOTE(review): the g_<hex> names look like auto-generated labels derived from
// addresses (0x400000, 0x408040) — confirm against the binary.
extern unsigned long long g_400000;
// NOTE(review): declared with type `void`, which presumably means no type was
// recovered for this symbol; an object of type void is not valid C++, so treat this
// as an untyped placeholder. No use of g_408040 is visible in this part of the listing.
extern void g_408040;
int main()
{
void* v0; // [bp-0x490]
unsigned long v1; // [bp-0x488]
char *v2; // [bp-0x460]
char *v3; // [bp-0x458]
unsigned int v4; // [bp-0x450]
unsigned int v5; // [bp-0x44c]
char *v6; // [bp-0x448]
char *v7; // [bp-0x440]
unsigned int v8; // [bp-0x438], Other Possible Types: unsigned long
void* v9; // [sp-0x430], Other Possible Types: unsigned long long
unsigned int v10; // [bp-0x430]
unsigned long v11; // [sp-0x428], Other Possible Types: unsigned long long
unsigned long long v12; // [sp-0x420]
unsigned int v13; // [sp-0x418], Other Possible Types: unsigned long long
unsigned int v14; // [bp-0x418], Other Possible Types: unsigned long
unsigned long v15; // [sp-0x410], Other Possible Types: unsigned long long
uint128_t *v16; // [bp-0x408], Other Possible Types: unsigned int, unsigned long long
char *v17; // [sp-0x400], Other Possible Types: unsigned long
char *v18; // [bp-0x3f8]
char *v19; // [bp-0x3f0], Other Possible Types: unsigned long long
unsigned int v20; // [bp-0x3f0]
char *v21; // [bp-0x3e8], Other Possible Types: char, unsigned int
char *v22; // [bp-0x3e0], Other Possible Types: unsigned int
unsigned long long v23; // [bp-0x3e0]
char *v24; // [bp-0x3d8]
unsigned long v25; // [bp-0x3d0]
unsigned int v26; // [bp-0x3c8]
void* v27; // [bp-0x3c0], Other Possible Types: unsigned int *, unsigned long
uint256_t *v28; // [sp-0x3b8], Other Possible Types: void*, unsigned long
void* v29; // [bp-0x3b0], Other Possible Types: uint256_t *, unsigned int
unsigned int v30; // [bp-0x3a8], Other Possible Types: unsigned long
void* v31; // [bp-0x3a0]
unsigned int v32; // [bp-0x3a0]
unsigned long long v33; // [bp-0x398], Other Possible Types: unsigned int
unsigned int v34; // [bp-0x390]
int v35; // [sp-0x390], Other Possible Types: unsigned long long
unsigned int v36; // [bp-0x390]
unsigned int v37; // [bp-0x390]
void* v38; // [bp-0x380]
void* v39; // [bp-0x378]
void* v40; // [bp-0x370]
void* v41; // [bp-0x368]
void* v42; // [bp-0x360]
void* v43; // [bp-0x358]
void* v44; // [bp-0x350]
void* v45; // [bp-0x348]
void* v46; // [bp-0x340]
void* v47; // [bp-0x338]
void* v48; // [bp-0x330]
void* v49; // [bp-0x328]
unsigned int *v50; // [bp-0x320], Other Possible Types: char
char v51; // [bp-0x318]
void* v52; // [bp-0x300], Other Possible Types: char
char v53; // [bp-0x2f8]
char v54; // [bp-0x2e0]
unsigned int *v55; // [bp-0x2c0], Other Possible Types: char
char v56; // [bp-0x2b8]
char v57; // [bp-0x2a0]
unsigned int *v58; // [bp-0x280], Other Possible Types: char
char v59; // [bp-0x278]
void* v60; // [bp-0x260], Other Possible Types: char
char v61; // [bp-0x258]
unsigned int *v62; // [bp-0x240], Other Possible Types: char
char v63; // [bp-0x238]
unsigned long long v64; // [bp-0x220]
unsigned int v65; // [bp-0x21c]
void* v66; // [bp-0x218], Other Possible Types: char
char v67; // [bp-0x210]
unsigned long long v68; // [bp-0x200]
char v69; // [bp-0x1f8]
unsigned long long v70; // [bp-0x1e0]
char v71; // [bp-0x1d8]
void* v72; // [bp-0x1c0], Other Possible Types: char
char v73; // [bp-0x1b8]
char v74; // [bp-0x1a8]
char v75; // [bp-0x190]
char v76; // [bp-0x178]
char v77; // [bp-0x160]
char v78; // [bp-0x148]
char v79; // [bp-0x130]
char v80; // [bp-0x118]
char v81; // [bp-0x100]
char v82; // [bp-0xe8]
char v83; // [bp-0xd0]
char v84; // [bp-0xb8]
char v85; // [bp-0xa0]
char v86; // [bp-0x80]
unsigned int *v88; // rbx
unsigned long long v89; // rcx
unsigned int *v90; // rdx
unsigned int *v91; // rdx
int v92; // xmm0
int v93; // xmm1
void* v94; // rdx
unsigned long long v95; // rcx
int v96; // ymm3, Other Possible Types: uint256_t
unsigned int *v97; // r12
unsigned long long v98; // rcx
unsigned int *v99; // rdx
unsigned int *v100; // rdx
void* v101; // rax
unsigned long v102; // rdx
uint256_t v104; // ymm0
int v105; // ymm2
uint256_t v106; // ymm2
int v107; // ymm4
uint256_t v108; // ymm4
int v109; // ymm5
uint256_t v110; // ymm5
int v111; // ymm6
uint256_t v112; // ymm6
int v113; // ymm7
uint256_t v114; // ymm7
uint256_t v116; // ymm0
void* v117; // rax
uint256_t v118; // ymm0
void* v119; // rax
uint256_t v120; // ymm0
uint256_t v121; // ymm0
uint256_t v122; // ymm0
uint256_t v123; // ymm0
uint256_t v124; // ymm0
uint256_t v125; // ymm0
unsigned int v126[8]; // rax
unsigned long long v127; // rdx
int v128; // ymm1
uint256_t v129; // ymm0
unsigned int v130[8]; // rax
uint256_t v131; // ymm0
unsigned int v132[8]; // rax
uint256_t v133; // ymm0
uint256_t v134; // ymm0
uint256_t v135; // ymm0
uint256_t v136; // ymm0
uint256_t v137; // ymm0
uint256_t v138; // ymm0
uint256_t v139; // ymm0
uint256_t v140; // ymm1
uint256_t v141; // ymm5
uint256_t v142; // ymm6
uint256_t v143; // ymm7
uint256_t v144; // ymm4
void* v145; // rax
uint256_t v146; // ymm2
uint256_t v149; // ymm0
uint256_t v150; // ymm1
uint256_t v151; // ymm1
unsigned long long v152; // r12
uint256_t v153; // ymm2
void* v154; // rax
int v155; // ymm3
int v156; // ymm4
uint256_t v157; // ymm0
uint256_t v158; // ymm4
uint256_t v159; // ymm4
uint256_t v160; // ymm0
uint256_t v161; // ymm4
uint256_t v162; // ymm0
uint256_t v163; // ymm4
uint256_t v164; // ymm3
uint256_t v165; // ymm0
uint256_t v166; // ymm1
uint256_t v167; // ymm0
uint256_t v168; // ymm1
uint256_t v169; // ymm0
uint256_t v170; // ymm2
uint256_t v171; // ymm1
uint256_t v172; // ymm0
uint256_t v173; // ymm1
uint256_t v174; // ymm1
uint256_t v175; // ymm0
uint256_t v176; // ymm1
uint256_t v177; // ymm0
uint256_t v178; // ymm1
uint256_t v179; // ymm3
void* v180; // r15
unsigned int v181; // rcx
uint256_t v182; // ymm3
unsigned long v183; // xmm0lq
unsigned long v184; // xmm0hq
void* v185; // rax
uint256_t v187; // ymm3
unsigned long v188; // xmm0lq
unsigned long v189; // xmm0hq
void* v190; // rax
uint256_t v191; // ymm0
uint256_t v193; // ymm3
unsigned long v194; // xmm0lq
unsigned long v195; // xmm0hq
void* v196; // rax
uint256_t v198; // ymm3
unsigned long v199; // xmm0lq
unsigned long v200; // xmm0hq
void* v201; // rax
uint256_t v202; // ymm0
void* v204; // r13
unsigned long long v205; // rcx
int v206; // ymm1
void* v207; // rdx
void* v208; // rdx
uint256_t v209; // ymm0
unsigned long v210; // rsi
unsigned long long v211; // r14
unsigned long v212; // rbx
void* v213; // r12
unsigned long v214; // rsi
unsigned int v215; // ecx
unsigned long v216; // rax
int v217; // ymm0, Other Possible Types: uint256_t
unsigned int *v218; // rcx
unsigned long long v219; // r9
unsigned int v220; // esi
unsigned int *v221; // rax
unsigned long v223; // r8
uint256_t v224; // ymm1
unsigned int v225; // r11d
unsigned int v226; // r11d
void* v227; // rax
int v228; // ymm2
unsigned int *v229; // rsi
void* v230; // rdx
unsigned long long v231; // r12
unsigned long v232; // r14
int v233; // ymm4, Other Possible Types: uint256_t
uint256_t v234; // ymm5
int v235; // ymm6, Other Possible Types: uint256_t
unsigned long v236; // r8
unsigned long v237; // r9
void* v238; // rdi
int v239; // ymm0
uint256_t v240; // ymm4
uint256_t v241; // ymm6
uint256_t v242; // ymm5
uint256_t v243; // ymm4
uint256_t v244; // ymm0
uint256_t v245; // ymm4
uint256_t v246; // ymm4
uint256_t v247; // ymm5
uint256_t v248; // ymm4
uint256_t v249; // ymm4
uint256_t v250; // ymm5
unsigned int v251; // r9d
unsigned long v252; // rdi
unsigned int v253; // r10d
unsigned int v254; // edi
unsigned long long v255; // r8
unsigned long v256; // r11
uint256_t v257; // ymm0
uint256_t v258; // ymm0
uint256_t v259; // ymm0
uint256_t v260; // ymm0
uint256_t v261; // ymm0
uint256_t v262; // ymm4
uint256_t v263; // ymm5
uint256_t v264; // ymm4
uint256_t v265; // ymm4
unsigned long v266; // r8
unsigned long v267; // r9
unsigned int *v268; // rbx
uint256_t v269; // ymm0
unsigned int *v270; // r11
uint256_t v271; // ymm0
uint256_t v272; // ymm0
uint256_t v273; // ymm0
unsigned int v274; // edi
uint256_t v275; // ymm0
uint256_t v276; // ymm0
uint256_t v277; // ymm0
uint256_t v278; // ymm0
unsigned long v279; // rdi
uint256_t v280; // ymm0
uint256_t v281; // ymm0
uint256_t v282; // ymm0
uint256_t v283; // ymm0
unsigned long long v284; // rdi
uint256_t v285; // ymm0
uint256_t v286; // ymm0
uint256_t v287; // ymm0
uint256_t v288; // ymm0
unsigned long v289; // rsi
unsigned long long v290; // r15
unsigned int v291; // ecx
unsigned long v292; // rax
int v293; // ymm0, Other Possible Types: uint256_t
unsigned int v294; // ecx
unsigned int *v295; // rax
uint256_t v296; // ymm3
unsigned long v297; // rbx
int v298; // ymm2
uint256_t v299; // ymm4
uint256_t v300; // ymm5
uint256_t v301; // ymm7
unsigned long v302; // rbx
unsigned long long v303; // rdx
unsigned int *v304; // rax
unsigned long v305; // rbx
unsigned int *v306; // rcx
unsigned int *v307; // rdx
unsigned int *v308; // rsi
unsigned long long v309; // rdi
unsigned long v310; // r8
int v311; // ymm0
uint256_t v312; // ymm1
uint256_t v313; // ymm0
uint256_t v314; // ymm0
uint256_t v315; // ymm1
uint256_t v316; // ymm0
unsigned long long v317; // r11
unsigned long long v318; // rdi
unsigned int v319; // r10d
unsigned long long v320; // r9
uint128_t *v321; // r8
unsigned int v322; // edi
uint256_t v323; // ymm0
uint256_t v324; // ymm0
uint256_t v325; // ymm0
uint256_t v326; // ymm0
uint256_t v327; // ymm0
uint256_t v328; // ymm0
uint256_t v329; // ymm1
uint256_t v330; // ymm6
uint256_t v331; // ymm1
uint256_t v332; // ymm6
void* v333; // rdi
unsigned int v334; // r10d
unsigned int v335; // r9d
int v336; // ymm1, Other Possible Types: uint256_t
unsigned long long v337; // r10
unsigned long long v338; // r8
uint256_t v339; // ymm1
uint256_t v340; // ymm0
uint256_t v341; // ymm1
uint256_t v342; // ymm0
uint256_t v343; // ymm1
uint256_t v344; // ymm0
uint256_t v345; // ymm0
unsigned long long v346; // rdi
uint256_t v347; // ymm0
uint256_t v348; // ymm0
uint256_t v349; // ymm0
uint256_t v350; // ymm0
unsigned int *v351; // r8
unsigned long v352; // r9
uint256_t v353; // ymm0
uint256_t v354; // ymm0
uint256_t v355; // ymm0
uint256_t v356; // ymm0
unsigned long v357; // rdi
uint256_t v358; // ymm0
uint256_t v359; // ymm0
uint256_t v360; // ymm0
uint256_t v361; // ymm0
unsigned long v362; // r10
uint256_t v363; // ymm0
uint256_t v364; // ymm0
uint256_t v365; // ymm0
uint256_t v366; // ymm0
unsigned long v367; // rdi
uint256_t v368; // ymm0
uint256_t v369; // ymm0
uint256_t v370; // ymm0
uint256_t v371; // ymm0
unsigned long v372; // r9
uint256_t v373; // ymm0
uint256_t v374; // ymm0
uint256_t v375; // ymm0
uint256_t v376; // ymm0
unsigned long v377; // rdi
uint256_t v378; // ymm0
uint256_t v379; // ymm0
uint256_t v380; // ymm0
uint256_t v381; // ymm0
uint256_t v382; // ymm0
uint256_t v383; // ymm0
uint256_t v384; // ymm0
uint256_t v385; // ymm0
unsigned long v386; // rcx
uint256_t v387; // ymm0
unsigned long v388; // xmm0hq
void* v389; // rax
uint256_t v391; // ymm0
unsigned long v392; // xmm0hq
void* v393; // rax
uint256_t v395; // ymm0
uint256_t v396; // ymm0
void* v398; // rbx
uint256_t v399; // ymm1
uint256_t v400; // ymm0
uint256_t v401; // ymm0
int v402; // ymm2, Other Possible Types: uint256_t
uint256_t v403; // ymm0
uint256_t v404; // ymm0
uint256_t v405; // ymm0
uint256_t v406; // ymm0
void* v407; // rbx
unsigned long long v408; // r15
unsigned long long v409; // r13
void* v410; // rax
void* v411; // rdx
unsigned long v412; // r8
unsigned long v413; // r9
void* v414; // r8
int v415; // ymm2
int v416; // ymm4
int v417; // ymm5
uint256_t v418; // ymm0
uint256_t v419; // ymm5
uint256_t v420; // ymm4
uint256_t v421; // ymm0
uint256_t v422; // ymm5
uint256_t v423; // ymm1
uint256_t v424; // ymm0
uint256_t v425; // ymm0
uint256_t v426; // ymm4
uint256_t v427; // ymm0
uint256_t v428; // ymm0
uint256_t v429; // ymm4
uint256_t v430; // ymm0
uint256_t v431; // ymm4
unsigned long long v432; // r8
unsigned long long v433; // r9
void* v434; // r10
uint256_t v435; // ymm0
unsigned long long v436; // r9
uint256_t v437; // ymm0
uint128_t v438; // xmm3
uint256_t v439; // ymm0
uint256_t v440; // ymm0
void* v441; // r8
uint256_t v442; // ymm0
unsigned long long v443; // rax
unsigned long long v444; // rcx
int v445; // ymm5
unsigned long v446; // rdx
uint256_t v447; // ymm0
uint256_t v448; // ymm1
uint256_t v449; // ymm0
uint256_t v450; // ymm1
void* v451; // r15
void* v452; // rdx
uint128_t *v453; // r8
void* v454; // rdi
unsigned long v455; // rcx
unsigned long long v456; // r11
unsigned long long v457; // rdx
unsigned long long v458; // rcx
int v459; // ymm2
uint256_t v460; // ymm2
uint256_t v461; // ymm1
uint256_t v462; // ymm2
uint256_t v463; // ymm0
uint256_t v464; // ymm2
uint256_t v465; // ymm1
uint256_t v466; // ymm2
void* v467; // rdx
uint256_t v468; // ymm2
uint256_t v469; // ymm0
unsigned long long v470; // rdx
uint256_t v471; // ymm0
uint128_t v472; // xmm3
uint256_t v473; // ymm0
uint256_t v474; // ymm0
uint256_t v475; // ymm0
uint256_t v476; // ymm0
uint256_t v477; // ymm0
uint256_t v478; // ymm0
uint256_t v479; // ymm0
uint256_t v480; // ymm0
uint256_t v481; // ymm0
uint256_t v482; // ymm0
uint256_t v483; // ymm0
uint256_t v484; // ymm2
uint256_t v485; // ymm4
uint256_t v486; // ymm5
int v487; // ymm6, Other Possible Types: uint256_t
unsigned long long v488; // r15
unsigned long long v489; // r14
void* v490; // rcx
unsigned int *v491; // rdi
void* v492; // rax
unsigned long long v493; // rdi
unsigned int *v494; // r8
void* v495; // rcx
uint256_t v496; // ymm2
unsigned long long v497; // rax
unsigned long long v498; // rdx
unsigned long long v499; // rdx
unsigned long v500; // r10
int v501; // ymm1
uint256_t v503; // ymm1
uint256_t v504; // ymm0
uint256_t v505; // ymm1
uint256_t v506; // ymm1
uint256_t v507; // ymm2
uint256_t v508; // ymm1
uint256_t v509; // ymm2
uint256_t v510; // ymm0
unsigned long v511; // xmm0hq
void* v512; // rax
uint256_t v514; // ymm0
unsigned long v515; // xmm0hq
void* v516; // rax
uint256_t v518; // ymm0
unsigned long v519; // xmm0hq
void* v520; // rax
uint256_t v522; // ymm0
unsigned long v523; // xmm0hq
void* v524; // rax
uint256_t v526; // ymm0
uint256_t v527; // ymm0
uint256_t v529; // ymm0
int v530; // ymm0, Other Possible Types: uint256_t
void* v532; // rsi
unsigned long long v533; // rcx
unsigned long long v534; // rdi
unsigned long v535; // rdx
int v536; // ymm3
int v537; // ymm2
void* v538; // rax
int v539; // ymm0
void* v540; // rax
int v541; // ymm5
int v542; // ymm1
uint256_t v543; // ymm0
uint256_t v544; // ymm6
uint256_t v545; // ymm1
uint256_t v546; // ymm5
uint256_t v547; // ymm5
uint256_t v548; // ymm0
uint256_t v549; // ymm5
uint256_t v550; // ymm1
uint256_t v551; // ymm5
uint256_t v552; // ymm1
uint256_t v553; // ymm5
uint256_t v554; // ymm1
uint256_t v555; // ymm0
uint256_t v556; // ymm5
unsigned long long v557; // rcx
unsigned long long v558; // rax
uint128_t *v559; // rdx
uint128_t v560; // xmm3
uint256_t v561; // ymm0
int v562; // xmm2
uint128_t v563; // xmm4
uint256_t v564; // ymm0
uint256_t v565; // ymm0
unsigned long v566; // rax
uint256_t v567; // ymm0
uint256_t v568; // ymm0
uint256_t v569; // ymm0
unsigned int *v570; // rcx
unsigned long long v571; // rax
unsigned long long v572; // rsi
unsigned long long v573; // rax
int v574; // ymm3
int v575; // ymm2
unsigned long v576; // rdx
int v577; // ymm1
uint256_t v578; // ymm1
uint256_t v579; // ymm0
uint256_t v580; // ymm1
unsigned long long v581; // rdx
uint256_t *v582; // rdi
int v583; // ymm3
int v584; // ymm0
int v585; // ymm2
uint256_t v586; // ymm0
unsigned long long v587; // rdi
uint256_t v588; // ymm0
unsigned long long v589; // r8
unsigned long long v590; // rdx
uint128_t v591; // xmm3
uint256_t v592; // ymm0
uint128_t *v593; // rdi
uint128_t v594; // xmm2
uint128_t v595; // xmm4
uint256_t v596; // ymm0
uint256_t v597; // ymm0
unsigned int *v598; // rdx
uint256_t v599; // ymm0
uint256_t v600; // ymm0
uint256_t v601; // ymm0
uint256_t v602; // ymm0
unsigned long v603; // xmm0hq
void* v604; // rax
uint256_t v606; // ymm0
unsigned long v607; // xmm0hq
void* v608; // rax
uint256_t v610; // ymm0
v50.allocator<float> const&) (.constprop.0)(&g_400000);
v7 = &v52;
v52.allocator<float> const&) (.constprop.0)(&g_400000);
v6 = &v54;
v54.allocator<float> const&) (.constprop.0)(&g_400000);
v3 = &v55;
v55.allocator<float> const&) (.constprop.0)(&g_400000);
v2 = &v57;
v57.allocator<float> const&) (.constprop.0)(&g_400000);
v88 = v50;
v89 = *((long long *)&v51);
if (v89 != v88)
{
v90 = v88;
do
{
v96 = v96 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455;
v91 = v90 + 1;
v91[1] = (unsigned int)(MulV((v92 & 18446744073709551615 | ((uint128_t)(v96 >> 64) & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455, v93 & 340282366920938463463374607431768211455));
v90 = v91;
} while (v89 != v90);
}
v94 = v52;
v95 = *((long long *)&v53);
if (v95 != v94)
{
do
{
v96 = v96 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455;
v94 += 4;
*((unsigned int *)&v94[4]) = (unsigned int)(MulV((v92 & 18446744073709551615 | ((uint128_t)(v96 >> 64) & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455, v93 & 340282366920938463463374607431768211455));
} while (v94 != v95);
}
v97 = v55;
v98 = *((long long *)&v56);
if (v98 != v97)
{
v99 = v97;
do
{
v96 = v96 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455;
v100 = v99 + 1;
v100[1] = (unsigned int)(MulV((v92 & 18446744073709551615 | ((uint128_t)(v96 >> 64) & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455, v93 & 340282366920938463463374607431768211455));
v99 = v100;
} while (v98 != v99);
}
v2.allocator<float>> const&) (.isra.0)(v3);
g_408040.char_traits<char>>&, char const*) (.isra.0)("=== Workload 1: SAXPY + cosine similarity ===\n");
v38 = 0;
v39 = 0;
v6.allocator<float>> const&) (.isra.0)(v7);
v18 = &v85;
v85.allocator<char> const&) (.constprop.0)("saxpy_scalar");
v24 = &v86;
v86.ScopedTimer(&v85, &v38);
v85._M_dispose();
v101 = *((long long *)&v54);
v102 = 0;
if (v101 - (v88 + 1) > 24)
{
do
{
*((void*)((char *)v101 + v102)) = v92;
v102 += 32;
} while (v102 != 0x1000000);
v104 = v92 & 340282366920938463463374607431768211455;
v106 = v105 & 340282366920938463463374607431768211455;
v108 = v107 & 340282366920938463463374607431768211455;
v110 = v109 & 340282366920938463463374607431768211455;
v112 = v111 & 340282366920938463463374607431768211455;
v114 = v113 & 340282366920938463463374607431768211455;
}
else
{
do
{
*((unsigned int *)((char *)v101 + 4 * v102)) = (unsigned int)v92;
v102 += 1;
} while (v102 != 0x400000);
}
v116 = v104 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455;
v117 = v101;
do
{
v118 = (v116 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v116 & 340282366920938463463374607431768211455, *((int *)v117))) & 340282366920938463463374607431768211455;
v119 = v117 + 32;
v120 = (v118 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v118, (int)v119[28])) & 340282366920938463463374607431768211455;
v121 = (v120 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v120, (int)v119[24])) & 340282366920938463463374607431768211455;
v122 = (v121 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v121, (int)v119[20])) & 340282366920938463463374607431768211455;
v123 = (v122 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v122, (int)v119[16])) & 340282366920938463463374607431768211455;
v124 = (v123 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v123, (int)v119[12])) & 340282366920938463463374607431768211455;
v125 = (v124 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v124, (int)v119[8])) & 340282366920938463463374607431768211455;
v116 = (v125 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v125, (int)v119[4])) & 340282366920938463463374607431768211455;
v117 = v119;
} while (v117 != v101 + 0x1000000);
v6.allocator<float>> const&) (.isra.0)(v7);
v18.allocator<char> const&) (.constprop.0)("saxpy_avx");
v24.ScopedTimer(v18, &v39);
v18._M_dispose();
v126 = *((long long *)&v54);
v127 = 8;
v128 = [D] unsupported_<class 'pyvex.expr.Qop'>();
do
{
*((void*)&v126[1 + v127]) = v92;
v127 += 8;
} while (v127 != 4194312);
v129 = v92 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455;
v130 = v126;
do
{
v131 = (v129 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v129 & 340282366920938463463374607431768211455, v130[0])) & 340282366920938463463374607431768211455;
v132 = v130 + 1;
v133 = (v131 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v131, v132[7])) & 340282366920938463463374607431768211455;
v134 = (v133 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v133, v132[6])) & 340282366920938463463374607431768211455;
v135 = (v134 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v134, v132[5])) & 340282366920938463463374607431768211455;
v136 = (v135 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v135, v132[4])) & 340282366920938463463374607431768211455;
v137 = (v136 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v136, v132[3])) & 340282366920938463463374607431768211455;
v138 = (v137 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v137, v132[2])) & 340282366920938463463374607431768211455;
v129 = (v138 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v138, v132[1])) & 340282366920938463463374607431768211455;
v130 = v132;
} while (v126 + 0x80000 != v130);
v139 = v129 & 340282366920938463463374607431768211455;
v140 = v128 & 340282366920938463463374607431768211455;
v141 = v110 & 340282366920938463463374607431768211455;
v142 = v112 & 340282366920938463463374607431768211455;
v143 = v114 & 340282366920938463463374607431768211455;
v40 = 0;
v41 = 0;
v18.allocator<char> const&) (.constprop.0)("cosine_scalar");
v24.ScopedTimer(v18, &v40);
v18._M_dispose();
v144 = v108 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455;
v145 = 0;
v146 = (v106 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned long long)(v144 >> 64) CONCAT (unsigned long long)v144)) & 340282366920938463463374607431768211455;
do
{
v143 = v143 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455;
v140 = ((v140 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | *((int *)((char *)v88 + 0x4 * v145))) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | (v143 >> 64 & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455;
v139 = ((v139 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | *((int *)((char *)v97 + 0x4 * v145))) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | (v143 >> 64 & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455;
v145 += 1;
v146 = (v146 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | [D] unsupported_<class 'pyvex.expr.Qop'>()) & 340282366920938463463374607431768211455;
v144 = (v144 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | [D] unsupported_<class 'pyvex.expr.Qop'>()) & 340282366920938463463374607431768211455;
} while (v145 != 0x400000);
v149 = (v139 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v146, (uint128_t)v144)) & 340282366920938463463374607431768211455;
v150 = v140 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455;
if (((CmpF((unsigned long long)v150, (unsigned long long)v149) & 69 | (char)((CmpF((unsigned long long)v150, (unsigned long long)v149) & 69) >> 6)) & 1) == 1)
v149 = (v149 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | UnaryOp Sqrt) & 340282366920938463463374607431768211455;
else
sqrt((unsigned long long)v149);
v151 = v150 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455;
if (((char)((CmpF((unsigned long long)v149, (unsigned long long)v151) & 69) >> 2) & 1) || (v152 = 0, ((char)(CmpF((unsigned long long)v149, (unsigned long long)v151)) & 64)))
v152 = (unsigned long long)v96 | (unsigned long long)(v96 >> 64) * 0;
v18.allocator<char> const&) (.constprop.0)("cosine_avx");
v24.ScopedTimer(v18, &v41);
v18._M_dispose();
v153 = v146 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455;
v154 = 0;
v155 = v153;
v156 = v153;
while (true)
{
v154 += 8;
if (v154 == 4194312)
break;
v151 = *((int256_t *)(-32 + (char *)v88 + 0x4 * v154));
v149 = *((int256_t *)(*((long long *)&v57) + v154 * 4 - 32));
}
v157 = (v149 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v156 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455;
v158 = (v156 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v156 / 0x100000000000000000000000000000000 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455;
v159 = (v158 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v157 + v158) & 340282366920938463463374607431768211455;
v160 = (v157 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v159 >> 96) CONCAT (unsigned int)((uint128_t)v159 >> 96) CONCAT (unsigned int)((unsigned long long)v159 >> 32) CONCAT (unsigned int)((unsigned long long)v159 >> 32))) & 340282366920938463463374607431768211455;
v161 = (v159 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v159 + v160) & 340282366920938463463374607431768211455;
v162 = (v160 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned long long)(v160 >> 64) CONCAT (unsigned long long)(v161 >> 64))) & 340282366920938463463374607431768211455;
v163 = (v161 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v161, (uint128_t)v162)) & 340282366920938463463374607431768211455;
v164 = (v155 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v155 / 0x100000000000000000000000000000000 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455;
v165 = ((v162 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v155 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((v162 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v155 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455) + v164) & 340282366920938463463374607431768211455;
v166 = (v151 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v165 >> 96) CONCAT (unsigned int)((uint128_t)v165 >> 96) CONCAT (unsigned int)((unsigned long long)v165 >> 32) CONCAT (unsigned int)((unsigned long long)v165 >> 32))) & 340282366920938463463374607431768211455;
v167 = (v165 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v165 + v166) & 340282366920938463463374607431768211455;
v168 = (v166 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned long long)(v166 >> 64) CONCAT (unsigned long long)(v167 >> 64))) & 340282366920938463463374607431768211455;
v169 = (v167 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v167, (uint128_t)v168)) & 340282366920938463463374607431768211455;
v170 = v153 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455;
v171 = ((v168 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v153 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((v168 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v153 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455) + v170) & 340282366920938463463374607431768211455;
v172 = ((v169 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v169 & 18446744073709551615) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | ((v169 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v169 & 18446744073709551615) >> 64 & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455;
v173 = ...;
v174 = ((v173 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v173 & 18446744073709551615) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | ((v173 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v173 & 18446744073709551615) >> 64 & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455;
v175 = (v172 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v172, (uint128_t)v174)) & 340282366920938463463374607431768211455;
v176 = v174 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455;
if (((CmpF((unsigned long long)v176, (unsigned long long)v175) & 69 | (char)((CmpF((unsigned long long)v176, (unsigned long long)v175) & 69) >> 6)) & 1) == 1)
{
v177 = (v175 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | UnaryOp Sqrt) & 340282366920938463463374607431768211455;
v178 = v176 & 340282366920938463463374607431768211455;
v179 = v164 & 340282366920938463463374607431768211455;
v233 = v163 & 340282366920938463463374607431768211455;
v234 = v141 & 340282366920938463463374607431768211455;
v235 = v142 & 340282366920938463463374607431768211455;
v301 = v143 & 340282366920938463463374607431768211455;
}
else
{
*((uint128_t *)&v35) = v163;
v177 = v175 & 340282366920938463463374607431768211455;
v178 = v176 & 340282366920938463463374607431768211455;
v179 = v164 & 340282366920938463463374607431768211455;
v234 = v141 & 340282366920938463463374607431768211455;
v235 = v142 & 340282366920938463463374607431768211455;
v301 = v143 & 340282366920938463463374607431768211455;
sqrt((unsigned long long)v177);
v233 = (v163 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)v35) & 340282366920938463463374607431768211455;
}
v336 = v178 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455;
if (((char)((CmpF((unsigned long long)v177, (unsigned long long)v336) & 69) >> 2) & 1) || (v180 = 0, ((char)(CmpF((unsigned long long)v177, (unsigned long long)v336)) & 64)))
{
v233 = ((v233 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v233 & 18446744073709551615) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | ((v233 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v233 & 18446744073709551615) >> 64 & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455;
v177 = (v177 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | DivV((uint128_t)v233, (uint128_t)v177 & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455;
v179 = (v96 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | (v177 >> 64 & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455;
v180 = (unsigned long long)v179;
}
g_408040.char_traits<char>>("SAXPY scalar: checksum=", 0x18, v181);
v182 = (v179 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v116 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455;
v183 = v182;
v184 = v182 >> 64;
v185 = g_408040._M_insert<double>(v183 | v184 * 0);
v185.char_traits<char>>(" time=", 0x7, v181);
v185._M_insert<double>(v38).char_traits<char>>&, char const*) (.isra.0)(" ms\n");
g_408040.char_traits<char>>("SAXPY AVX : checksum=", 0x18, v181);
v187 = (v182 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v129 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455;
v188 = v187;
v189 = v187 >> 64;
v190 = g_408040._M_insert<double>(v188 | v189 * 0);
v190.char_traits<char>>(" time=", 0x7, v181);
v191 = ...;
v190._M_insert<double>((unsigned long long)v191).char_traits<char>>&, char const*) (.isra.0)(" ms\n");
g_408040.char_traits<char>>("Cosine scalar: value=", 0x15, v181);
v193 = (v187 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v152) & 340282366920938463463374607431768211455;
v194 = v193;
v195 = v193 >> 64;
v196 = g_408040._M_insert<double>(v194 | v195 * 0);
v196.char_traits<char>>(" time=", 0x7, v181);
v196._M_insert<double>(v40).char_traits<char>>&, char const*) (.isra.0)(" ms\n");
g_408040.char_traits<char>>("Cosine AVX : value=", 0x15, v181);
v198 = (v193 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)v180) & 340282366920938463463374607431768211455;
v199 = v198;
v200 = v198 >> 64;
v201 = g_408040._M_insert<double>(v199 | v200 * 0);
v201.char_traits<char>>(" time=", 0x7, v181);
v202 = ...;
v201._M_insert<double>((unsigned long long)v202).char_traits<char>>&, char const*) (.isra.0)(" ms\n");
g_408040.char_traits<char>>&, char const*) (.isra.0)("--------------------------------------------------------\n\n");
v64 = 4638564681600;
v66.allocator<float> const&) (.constprop.0)(0x1fa400);
v68 = 4638564681600;
v69.allocator<float> const&) (.constprop.0)(0x1fa400);
v70 = 4638564681600;
v71.allocator<float> const&) (.constprop.0)(0x1fa400);
v204 = v66;
v205 = *((long long *)&v67);
if (v204 != v205)
{
v336 = v206 & 340282366920938463463374607431768211455;
v207 = v204;
do
{
v198 = v198 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455;
v208 = v207 + 4;
v209 = (v92 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | (v198 >> 64 & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455;
*((unsigned int *)&v208[4]) = MulV((uint128_t)v209, (uint128_t)v336);
v207 = v208;
} while (v207 != v205);
}
g_408040.char_traits<char>>&, char const*) (.isra.0)("=== Workload 2: 2D 5-point blur on 1080p image ===\n");
v42 = 0;
v43 = 0;
v18.allocator<char> const&) (.constprop.0)("blur_scalar");
v24.ScopedTimer(v18, &v42);
v18._M_dispose();
v210 = (int)v64;
v211 = v210 * 4;
v30 = v210;
v28 = *((long long *)&v69);
memcpy(*((long long *)&v69), v204, v211);
v212 = v65 - 1;
v213 = v204 + v212 * v211;
v214 = (int)v68;
v23 = v214 * 4;
memcpy(v212 * v23 + v28, v213, v211);
if (v26 > 1)
{
if ((unsigned int)v214 == 1 && !((v215 = 1, v216 = v28 + 4, v32 != 1)))
{
do
{
v215 += 1;
v216 += 4;
*((unsigned int *)(v216 - 4)) = (unsigned int)v92;
*((unsigned int *)(v216 - 4)) = (unsigned int)v92;
} while (v26 != v215);
v9 = v204 + v211;
v218 = v23 + v28;
}
else
{
v219 = v23;
v220 = 1;
v9 = v204 + v211;
v218 = v28 + v219;
v221 = v218;
do
{
v220 += 1;
*(v221) = (unsigned int)v92;
v221[1 + v30] = (unsigned int)v92;
v221 = (char *)v221 + v219;
} while (v26 != v220);
}
v223 = v30;
v224 = v206 & 340282366920938463463374607431768211455;
v1 = v212;
v225 = v32 - 2;
v13 = v225;
v13 |= v14 & 0xffffffff00000000;
v226 = v225 & 0xfffffff8;
v12 = (v225 >> 3) * 32;
v227 = v9;
v228 = [D] unsupported_<class 'pyvex.expr.Qop'>();
v19 = 1 - v223;
v5 = v226 + 1;
v229 = v227 + v211;
v4 = v226;
v230 = v204;
v0 = v213;
v231 = v211;
v232 = v223;
do
{
v27 = v232;
v232 += v30;
if (v32 <= 2)
continue;
if (v21 > 2 && !((v236 = v218 + 4, v237 = v227 + 4, !((char)(char)(v236 - v237 <= 32 ^ 1) & (char)(char)(v236 - (v230 + 8) <= 24 ^ 1)) || v236 - (v229 + 8) <= 24)))
{
if (v21 > 6)
{
v17 = v230;
v238 = 0;
do
{
v239 = (*((int256_t *)(v237 + (char *)v238)) + *((int256_t *)(v227 + v238)) + *((int256_t *)(v227 + 8 + v238)) + *((int256_t *)(v230 + 4 + v238)) + *((int256_t *)(v229 + 1 + v238))) * v228;
v240 = (v107 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v107 & 340282366920938463463374607431768211455, (uint128_t)v239)) & 340282366920938463463374607431768211455;
v241 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)v239 >> 32) CONCAT (unsigned int)((unsigned long long)v239 >> 32) CONCAT (unsigned int)((unsigned long long)v239 >> 32) CONCAT (unsigned int)((unsigned long long)v239 >> 32))) & 340282366920938463463374607431768211455;
v242 = (*((int256_t *)(v237 + (char *)v238)) & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v239 >> 96) CONCAT (unsigned int)((uint128_t)v239 >> 96) CONCAT (unsigned int)((uint128_t)v239 >> 96) CONCAT (unsigned int)((uint128_t)v239 >> 96))) & 340282366920938463463374607431768211455;
*((void*)(v236 + (char *)v238)) = v239;
v238 += 32;
v243 = (v240 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v240, (uint128_t)v241)) & 340282366920938463463374607431768211455;
v235 = (v241 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v239 >> 96) CONCAT (unsigned int)((uint128_t)v239 >> 96) CONCAT (unsigned int)((uint128_t)v239 >> 64) CONCAT (unsigned int)((uint128_t)v239 >> 64))) & 340282366920938463463374607431768211455;
v244 = (v239 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v239 / 0x100000000000000000000000000000000 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455;
v245 = (v243 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v243, (uint128_t)v235)) & 340282366920938463463374607431768211455;
v246 = (v245 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v245, (uint128_t)v242)) & 340282366920938463463374607431768211455;
v247 = (v242 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)v244 >> 32) CONCAT (unsigned int)((unsigned long long)v244 >> 32) CONCAT (unsigned int)((unsigned long long)v244 >> 32) CONCAT (unsigned int)((unsigned long long)v244 >> 32))) & 340282366920938463463374607431768211455;
v248 = (v246 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v246, (uint128_t)v244)) & 340282366920938463463374607431768211455;
v249 = (v248 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v248, (uint128_t)v247)) & 340282366920938463463374607431768211455;
v250 = (v247 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v244 >> 96) CONCAT (unsigned int)((uint128_t)v244 >> 96) CONCAT (unsigned int)((uint128_t)v244 >> 64) CONCAT (unsigned int)((uint128_t)v244 >> 64))) & 340282366920938463463374607431768211455;
v234 = (v250 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV(AddV((uint128_t)v249, (uint128_t)v250) & 340282366920938463463374607431768211455, ((unsigned int)((uint128_t)v244 >> 96) CONCAT (unsigned int)((uint128_t)v244 >> 96) CONCAT (unsigned int)((uint128_t)v244 >> 96) CONCAT (unsigned int)((uint128_t)v244 >> 96)) & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455;
} while (v12 != v238);
v230 = v17;
if (!((char)v13 & 7))
continue;
v251 = v8;
if (v10 - 3 > 2)
{
v252 = v4;
v253 = v5;
}
else
{
v254 = v5;
LABEL_4021cc:
v266 = v254;
v267 = v266 * 4;
v268 = v227 + v267;
v17 = v266 + 1;
v15 = v267 + 4;
v269 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *(v268))) & 340282366920938463463374607431768211455;
v270 = v15 + v227;
v27 = v268;
v271 = (v269 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v269, *(v270))) & 340282366920938463463374607431768211455;
v272 = (v271 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v271, *((int *)((char *)v230 + v267)))) & 340282366920938463463374607431768211455;
v273 = (v272 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v272, *((int *)((char *)v204 + 4 * v232 + 4 * v266)))) & 340282366920938463463374607431768211455;
v217 = (v273 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v273, (uint128_t)v224)) & 340282366920938463463374607431768211455;
v233 = (v233 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v217, v33)) & 340282366920938463463374607431768211455;
*((unsigned int *)((char *)v28 + 4 * v266 + 4 * v35)) = v217;
if (v16 > v254 + 1)
{
v274 = v254 + 2;
v11 = v267 + 8;
v275 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *(v27))) & 340282366920938463463374607431768211455;
v276 = (v275 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v275, *((int *)(v11 + (char *)v227)))) & 340282366920938463463374607431768211455;
v277 = (v276 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v276, *((int *)((char *)v230 + v15)))) & 340282366920938463463374607431768211455;
v278 = (v277 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v277, *((int *)((char *)v204 + 4 * v232 + 4 * v17)))) & 340282366920938463463374607431768211455;
v217 = (v278 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v278, (uint128_t)v224)) & 340282366920938463463374607431768211455;
v234 = (v234 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v217, (uint128_t)v233)) & 340282366920938463463374607431768211455;
*((unsigned int *)((char *)v28 + 4 * v35 + 4 * v17)) = v217;
if (v16 > v274)
{
v279 = v274;
v280 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *(v270))) & 340282366920938463463374607431768211455;
v281 = (v280 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v280, *((int *)(12 + (char *)v227 + v267)))) & 340282366920938463463374607431768211455;
v282 = (v281 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v281, *((int *)((char *)v230 + v11)))) & 340282366920938463463374607431768211455;
v283 = (v282 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v282, *((int *)((char *)v204 + 4 * v232 + 4 * v279)))) & 340282366920938463463374607431768211455;
v217 = (v283 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v283, (uint128_t)v224)) & 340282366920938463463374607431768211455;
v233 = (v233 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v217, (uint128_t)v234)) & 340282366920938463463374607431768211455;
*((unsigned int *)((char *)v28 + 4 * v279 + 4 * v35)) = v217;
goto LABEL_402330;
}
}
}
}
else
{
v251 = v14;
v252 = 0;
v253 = 1;
}
v255 = v27 + v252;
v256 = v255 + 1;
v257 = (v217 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (*((int128_t *)(-4 + (char *)v204 + 4 * v256)) & 340282366920938463463374607431768211455) + *((int128_t *)((char *)v204 + 4 * v256))) & 340282366920938463463374607431768211455;
v258 = (v257 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint128_t)v257 + *((int128_t *)(4 + (char *)v204 + 4 * v256))) & 340282366920938463463374607431768211455;
v259 = (v258 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint128_t)v258 + *((int128_t *)((char *)v204 + 4 * v255 + 4 * v19))) & 340282366920938463463374607431768211455;
v260 = (v259 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint128_t)v259 + *((int128_t *)(4 + (char *)v204 + 4 * v232 + 4 * v252))) & 340282366920938463463374607431768211455;
v261 = (v260 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v260 * ((v198 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)v224 CONCAT (unsigned int)v224 CONCAT (unsigned int)v224 CONCAT (unsigned int)v224)) & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455;
v262 = (v107 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v107 & 340282366920938463463374607431768211455, (uint128_t)v261)) & 340282366920938463463374607431768211455;
v263 = (v234 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)v261 >> 32) CONCAT (unsigned int)((unsigned long long)v261 >> 32) CONCAT (unsigned int)((unsigned long long)v261 >> 32) CONCAT (unsigned int)((unsigned long long)v261 >> 32))) & 340282366920938463463374607431768211455;
*((uint128_t *)(4 + (char *)v28 + 4 * v35 + 4 * v252)) = v261;
v254 = (v251 & 0xfffffffc) + v253;
v264 = (v262 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v262, (uint128_t)v263)) & 340282366920938463463374607431768211455;
v234 = (v263 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v261 >> 96) CONCAT (unsigned int)((uint128_t)v261 >> 96) CONCAT (unsigned int)((uint128_t)v261 >> 64) CONCAT (unsigned int)((uint128_t)v261 >> 64))) & 340282366920938463463374607431768211455;
v217 = (v261 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v261 >> 96) CONCAT (unsigned int)((uint128_t)v261 >> 96) CONCAT (unsigned int)((uint128_t)v261 >> 96) CONCAT (unsigned int)((uint128_t)v261 >> 96))) & 340282366920938463463374607431768211455;
v265 = (v264 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v264, (uint128_t)v234)) & 340282366920938463463374607431768211455;
v233 = (v265 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v265, (uint128_t)v217)) & 340282366920938463463374607431768211455;
if (!(v251 & 3))
continue;
goto LABEL_4021cc;
}
else
{
v284 = 1;
do
{
v285 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *((int *)(-4 + (char *)v227 + 4 * v284)))) & 340282366920938463463374607431768211455;
v286 = (v285 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v285, *((int *)(4 + (char *)v227 + 4 * v284)))) & 340282366920938463463374607431768211455;
v287 = (v286 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v286, *((int *)((char *)v230 + 4 * v284)))) & 340282366920938463463374607431768211455;
v288 = (v287 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v287, v229[v284])) & 340282366920938463463374607431768211455;
v217 = (v288 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v288, (uint128_t)v224)) & 340282366920938463463374607431768211455;
v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v217, v33)) & 340282366920938463463374607431768211455;
v218[v284] = v217;
v284 += 1;
} while (v284 != v32 - 1);
}
LABEL_402330:
v227 += v231;
v230 += v231;
v229 = (char *)v229 + v231;
v218 = (char *)v218 + v23;
} while (v26 != v29);
v211 = v231;
v212 = v1;
v213 = v0;
v202 = v217 & 340282366920938463463374607431768211455;
v336 = v224 & 340282366920938463463374607431768211455;
v233 &= 340282366920938463463374607431768211455;
v234 &= 340282366920938463463374607431768211455;
v235 &= 340282366920938463463374607431768211455;
v301 &= 340282366920938463463374607431768211455;
}
v18.allocator<char> const&) (.constprop.0)("blur_avx");
v24.ScopedTimer(v18, &v43);
v18._M_dispose();
v27 = *((long long *)&v71);
memcpy(*((long long *)&v71), v204, v211);
v289 = (int)v70;
v290 = v289 * 4;
v8 = v289;
memcpy(v27 + v290 * v212, v213, v211);
if (v26 > 1)
{
v291 = v36;
if (v291 == 1 && !((v292 = v27 + 4, v32 != 1)))
{
do
{
v291 += 1;
v292 += 4;
*((unsigned int *)(v292 - 4)) = (unsigned int)v92;
*((unsigned int *)(v292 - 4)) = (unsigned int)v92;
} while (v26 != v291);
v9 = v204 + v211;
}
else
{
v294 = 1;
v9 = v204 + v211;
v295 = v27 + v290;
do
{
v294 += 1;
*(v295) = (unsigned int)v92;
v295[1 + v30] = (unsigned int)v92;
v295 = (char *)v295 + v290;
} while (v26 != v294);
}
v29 = 0;
v296 = v96 & 340282366920938463463374607431768211455;
v297 = v30;
v298 = [D] unsupported_<class 'pyvex.expr.Qop'>();
v299 = (v233 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)v296 CONCAT (unsigned int)v296 CONCAT (unsigned int)v296 CONCAT (unsigned int)v296)) & 340282366920938463463374607431768211455;
v300 = (v234 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | 4489188110467124429) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 & 340282366920938463463374607431768211455;
v28 = v8;
do
{
v302 = v297;
v303 = v302 * 4;
v304 = v204 + v303;
v305 = v302 + v30;
v306 = v204 + v29 * 4;
v307 = v303 + v9;
v308 = v27 + v28 * 4;
if (v32 > 8)
{
v309 = 9;
while (true)
{
v301 = *((int256_t *)&v304[8 + v309]);
v310 = v309 + 8;
v311 = (v301 + *((int256_t *)&v304[9 + v309]) + *((int256_t *)&v304[7 + v309]) + *((int256_t *)&v306[8 + v309]) + *((int256_t *)&v307[8 + v309])) * v298;
v312 = (v336 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v311 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455;
*((void*)&v308[8 + v309]) = v311;
v313 = (v311 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v311 / 0x100000000000000000000000000000000 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455;
v314 = (v313 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v312 + v313) & 340282366920938463463374607431768211455;
v315 = (v312 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v314 >> 96) CONCAT (unsigned int)((uint128_t)v314 >> 96) CONCAT (unsigned int)((unsigned long long)v314 >> 32) CONCAT (unsigned int)((unsigned long long)v314 >> 32))) & 340282366920938463463374607431768211455;
v316 = (v314 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v314 + v315) & 340282366920938463463374607431768211455;
v336 = (v315 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned long long)(v315 >> 64) CONCAT (unsigned long long)(v316 >> 64))) & 340282366920938463463374607431768211455;
v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV(AddV((uint128_t)v316, (uint128_t)v336) & 340282366920938463463374607431768211455, v37)) & 340282366920938463463374607431768211455;
if (v310 == (v32 - 9 >> 3) * 8 + 17)
break;
v309 = v310;
}
v317 = v309 & 4294967295;
}
else
{
v317 = 1;
}
if ((unsigned int)v317 >= v20)
continue;
v318 = v317 & 4294967295;
v319 = v32 - v317;
if (v319 != 2)
{
v13 = v302 + v318;
v320 = v13 * 4;
v15 = v318 + v28;
v321 = v27 + v15 * 4;
v16 = v204 + v320;
v12 = v318 + v29;
v11 = v305 + v318;
v25 = v11 * 4 + 4;
v17 = v12 * 4 + 4;
if (!((v321 - (v204 + v25) <= 8 ^ 1) & v21) || v321 - v16 <= 16)
goto LABEL_404430;
v322 = v319 - 1;
if (v319 - 2 > 2)
{
v323 = (v293 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | *(v16)) & 340282366920938463463374607431768211455;
v324 = (v323 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint128_t)v323 + *((int128_t *)(-4 + (char *)v204 + v320))) & 340282366920938463463374607431768211455;
v325 = (v324 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint128_t)v324 + *((int128_t *)(4 + (char *)v204 + v320))) & 340282366920938463463374607431768211455;
v326 = (v325 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint128_t)v325 + *((int128_t *)(-4 + (char *)v204 + v17))) & 340282366920938463463374607431768211455;
v327 = (v326 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint128_t)v326 + *((int128_t *)(-4 + (char *)v204 + v25))) & 340282366920938463463374607431768211455;
v328 = (v327 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v327 * v299) & 340282366920938463463374607431768211455;
v329 = (v206 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v206 & 340282366920938463463374607431768211455, (uint128_t)v328)) & 340282366920938463463374607431768211455;
v330 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)v328 >> 32) CONCAT (unsigned int)((unsigned long long)v328 >> 32) CONCAT (unsigned int)((unsigned long long)v328 >> 32) CONCAT (unsigned int)((unsigned long long)v328 >> 32))) & 340282366920938463463374607431768211455;
*(v321) = v328;
v331 = (v329 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v329, (uint128_t)v330)) & 340282366920938463463374607431768211455;
v332 = (v330 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v328 >> 96) CONCAT (unsigned int)((uint128_t)v328 >> 96) CONCAT (unsigned int)((uint128_t)v328 >> 64) CONCAT (unsigned int)((uint128_t)v328 >> 64))) & 340282366920938463463374607431768211455;
v293 = (v328 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v328 >> 96) CONCAT (unsigned int)((uint128_t)v328 >> 96) CONCAT (unsigned int)((uint128_t)v328 >> 96) CONCAT (unsigned int)((uint128_t)v328 >> 96))) & 340282366920938463463374607431768211455;
v336 = (v331 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v331, (uint128_t)v332)) & 340282366920938463463374607431768211455;
v235 = (v332 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v336, (uint128_t)v293)) & 340282366920938463463374607431768211455;
if (!((char)v322 & 3))
continue;
v333 = v322 & 0xfffffffc;
v334 = (unsigned int)(v319 - v333);
v317 += v333;
v335 = v334 - 1;
if (v334 == 2)
goto LABEL_4027ea;
}
else
{
v335 = v322;
v333 = 0;
}
v337 = v13 + v333;
v338 = v337 * 4;
v339 = (v336 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | *((long long *)((char *)v204 + 4 * v337))) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 & 340282366920938463463374607431768211455;
v340 = (v293 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | *((long long *)(-4 + (char *)v204 + v338))) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 & 340282366920938463463374607431768211455;
v341 = (v339 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | *((long long *)(4 + (char *)v204 + v338))) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 & 340282366920938463463374607431768211455;
v342 = ((v340 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v340 + v339) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((v340 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v340 + v339) & 340282366920938463463374607431768211455) + v341) & 340282366920938463463374607431768211455;
v343 = (v341 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | *((long long *)((char *)v204 + 4 * v12 + 0x4 * v333))) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 & 340282366920938463463374607431768211455;
v344 = ((v342 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v342 + v343) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((uint128_t)v342 + (uint128_t)v343 & 340282366920938463463374607431768211455) + (((uint128_t)v343 & 0xffffffffffffffff0000000000000000 | *((long long *)((char *)v204 + 4 * v11 + 0x4 * v333))) & 18446744073709551615 & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455;
v345 = (v344 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v344 * v300) & 340282366920938463463374607431768211455;
v336 = (v206 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v206 & 340282366920938463463374607431768211455, (uint128_t)v345)) & 340282366920938463463374607431768211455;
*((unsigned long long *)((char *)v27 + 4 * v15 + 0x4 * v333)) = v345;
v293 = (v345 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v345 >> 96) CONCAT (unsigned int)((uint128_t)v345 >> 96) CONCAT (unsigned int)((unsigned long long)v345 >> 32) CONCAT (unsigned int)((unsigned long long)v345 >> 32))) & 340282366920938463463374607431768211455;
v301 = (v301 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v293, (uint128_t)v336)) & 340282366920938463463374607431768211455;
if (!((char)v335 & 1))
continue;
v317 += v335 & 4294967294;
LABEL_4027ea:
v346 = v317 & 4294967295;
v347 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, v304[v346])) & 340282366920938463463374607431768211455;
v348 = (v347 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v347, v304[1 + v346])) & 340282366920938463463374607431768211455;
v349 = (v348 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v348, v306[v346])) & 340282366920938463463374607431768211455;
v350 = (v349 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v349, v307[v346])) & 340282366920938463463374607431768211455;
v293 = (v350 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v350, (uint128_t)v296)) & 340282366920938463463374607431768211455;
v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v293, v37)) & 340282366920938463463374607431768211455;
v308[v346] = v293;
continue;
}
else
{
LABEL_404430:
v351 = v318 * 4;
v352 = v304 + v351 + 4;
v353 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *((int *)(v304 + v351)))) & 340282366920938463463374607431768211455;
v354 = (v353 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v353, *((int *)v352))) & 340282366920938463463374607431768211455;
v355 = (v354 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v354, v306[v318])) & 340282366920938463463374607431768211455;
v356 = (v355 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v355, v307[v318])) & 340282366920938463463374607431768211455;
v293 = (v356 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v356, (uint128_t)v296)) & 340282366920938463463374607431768211455;
v301 = (v301 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v293, v37)) & 340282366920938463463374607431768211455;
v308[v318] = v293;
if (v20 > (unsigned int)v317 + 1)
{
v357 = v304 + v351 + 8;
v358 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *((int *)v352))) & 340282366920938463463374607431768211455;
v359 = (v358 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v358, *((int *)v357))) & 340282366920938463463374607431768211455;
v360 = (v359 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v359, *((int *)(v306 + v351 + 4)))) & 340282366920938463463374607431768211455;
v361 = (v360 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v360, *((int *)(v307 + v351 + 4)))) & 340282366920938463463374607431768211455;
v293 = (v361 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v361, (uint128_t)v296)) & 340282366920938463463374607431768211455;
v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v293, (uint128_t)v301)) & 340282366920938463463374607431768211455;
*((unsigned int *)(v308 + v351 + 4)) = v293;
if ((unsigned int)v317 + 2 < v20)
{
v362 = v304 + v351 + 12;
v363 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *((int *)v357))) & 340282366920938463463374607431768211455;
v364 = (v363 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v363, *((int *)v362))) & 340282366920938463463374607431768211455;
v365 = (v364 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v364, *((int *)(v306 + v351 + 8)))) & 340282366920938463463374607431768211455;
v366 = (v365 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v365, *((int *)(v307 + v351 + 8)))) & 340282366920938463463374607431768211455;
v293 = (v366 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v366, (uint128_t)v296)) & 340282366920938463463374607431768211455;
v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v293, (uint128_t)v235)) & 340282366920938463463374607431768211455;
*((unsigned int *)(v308 + v351 + 8)) = v293;
if (v20 > (unsigned int)v317 + 3)
{
v367 = v304 + v351 + 16;
v368 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *((int *)v362))) & 340282366920938463463374607431768211455;
v369 = (v368 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v368, *((int *)v367))) & 340282366920938463463374607431768211455;
v370 = (v369 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v369, *((int *)(v306 + v351 + 12)))) & 340282366920938463463374607431768211455;
v371 = (v370 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v370, *((int *)(v307 + v351 + 12)))) & 340282366920938463463374607431768211455;
v293 = (v371 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v371, (uint128_t)v296)) & 340282366920938463463374607431768211455;
v301 = (v301 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v293, (uint128_t)v235)) & 340282366920938463463374607431768211455;
*((unsigned int *)(v308 + v351 + 12)) = v293;
if (v20 > (unsigned int)v317 + 4)
{
v372 = v304 + v351 + 20;
v373 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *((int *)v367))) & 340282366920938463463374607431768211455;
v374 = (v373 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v373, *((int *)v372))) & 340282366920938463463374607431768211455;
v375 = (v374 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v374, *((int *)(v306 + v351 + 16)))) & 340282366920938463463374607431768211455;
v376 = (v375 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v375, *((int *)(v307 + v351 + 16)))) & 340282366920938463463374607431768211455;
v293 = (v376 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v376, (uint128_t)v296)) & 340282366920938463463374607431768211455;
v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v293, (uint128_t)v301)) & 340282366920938463463374607431768211455;
*((unsigned int *)(v308 + v351 + 16)) = v293;
if (v20 > (unsigned int)v317 + 5)
{
v377 = v304 + v351 + 24;
v378 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *((int *)v372))) & 340282366920938463463374607431768211455;
v379 = (v378 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v378, *((int *)v377))) & 340282366920938463463374607431768211455;
v380 = (v379 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v379, *((int *)(v306 + v351 + 20)))) & 340282366920938463463374607431768211455;
v381 = (v380 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v380, *((int *)(v307 + v351 + 20)))) & 340282366920938463463374607431768211455;
v293 = (v381 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v381, (uint128_t)v296)) & 340282366920938463463374607431768211455;
v301 = (v301 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v293, (uint128_t)v235)) & 340282366920938463463374607431768211455;
*((unsigned int *)(v308 + v351 + 20)) = v293;
if (v20 > (unsigned int)v317 + 6)
{
v382 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, *((int *)v377))) & 340282366920938463463374607431768211455;
v383 = (v382 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v382, *((int *)(v304 + v351 + 28)))) & 340282366920938463463374607431768211455;
v384 = (v383 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v383, *((int *)(v306 + v351 + 24)))) & 340282366920938463463374607431768211455;
v385 = (v384 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v384, *((int *)(v307 + v351 + 24)))) & 340282366920938463463374607431768211455;
v293 = (v385 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v385, (uint128_t)v296)) & 340282366920938463463374607431768211455;
v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v293, (uint128_t)v301)) & 340282366920938463463374607431768211455;
*((unsigned int *)(v308 + v351 + 24)) = v293;
}
}
}
}
}
}
}
v28 += v8;
v29 += v30;
v386 = v22;
v297 = v305;
} while (v26 != (unsigned int)v386);
v202 = v293 & 340282366920938463463374607431768211455;
v336 &= 340282366920938463463374607431768211455;
v233 = v299 & 340282366920938463463374607431768211455;
v234 = v300 & 340282366920938463463374607431768211455;
v235 &= 340282366920938463463374607431768211455;
}
g_408040.char_traits<char>>("Blur scalar: checksum=", 0x16, v386);
v387 = v202 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v33;
v388 = v387 >> 64;
v389 = g_408040._M_insert<double>((unsigned long long)v387 | v388 * 0);
v389.char_traits<char>>(" time=", 0x7, v181);
v389._M_insert<double>(v42).char_traits<char>>&, char const*) (.isra.0)(" ms\n");
g_408040.char_traits<char>>("Blur AVX : checksum=", 0x16, v181);
v391 = ((v387 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | v388 * 0x10000000000000000) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | (uint256_t)v42) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v37;
v392 = v391 >> 64;
v393 = g_408040._M_insert<double>((unsigned long long)v391 | v392 * 0);
v393.char_traits<char>>(" time=", 0x7, v181);
v393._M_insert<double>(v43).char_traits<char>>&, char const*) (.isra.0)(" ms\n");
g_408040.char_traits<char>>("Checksum delta (AVX - scalar): ", 0x1f, v181);
v395 = (((v391 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | v392 * 0x10000000000000000) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | (uint256_t)v43) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(SubV(v96 & 340282366920938463463374607431768211455, v33))) & 340282366920938463463374607431768211455;
v396 = ((v395 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v395 & 18446744073709551615) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | ((v395 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v395 & 18446744073709551615) >> 64 & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455;
g_408040._M_insert<double>((unsigned long long)v396).char_traits<char>>&, char const*) (.isra.0)("\n");
g_408040.char_traits<char>>&, char const*) (.isra.0)("--------------------------------------------------------\n\n");
v72.ComplexSoA(0x40000);
v75.ComplexSoA(0x40000);
v21 = &v77;
v77.ComplexSoA(0);
v19 = &v79;
v79.ComplexSoA(0);
v22 = &v81;
v81.ComplexSoA(0);
v17 = &v83;
v83.ComplexSoA(0);
fill_complex(&v72, 305441741);
fill_complex(&v75, 2557935324);
v58.allocator<float> const&) (.constprop.0)(16);
v398 = 0;
v399 = v336 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455;
while (true)
{
v400 = (v396 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | SubV((uint128_t)v399 & 340282366920938463463374607431768211455, 0x40f00000)) & 340282366920938463463374607431768211455;
v401 = (v400 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v400, 0x3e000000)) & 340282366920938463463374607431768211455;
v402 = v105 & 340282366920938463463374607431768211455;
v403 = (v401 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV(((uint128_t)v401 ^ 0x80000000) & 340282366920938463463374607431768211455, (uint128_t)v401)) & 340282366920938463463374607431768211455;
expf((unsigned int)v403);
v404 = (v403 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v403, v34)) & 340282366920938463463374607431768211455;
*((unsigned int *)((char *)v58 + 0x4 * v398)) = v404;
v398 += 1;
if (v398 == 16)
break;
v405 = (v404 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(AddV((v206 & 18446744073709551615 | ((uint128_t)((v96 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455) >> 64) & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455, 0x3f000000))) & 340282366920938463463374607431768211455;
v406 = (v405 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v405, 1078530011)) & 340282366920938463463374607431768211455;
v396 = (v406 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v406, 0x3d800000)) & 340282366920938463463374607431768211455;
cosf((unsigned int)v396);
v399 = v206 & 340282366920938463463374607431768211455;
}
v27 = v58;
g_408040.char_traits<char>>&, char const*) (.isra.0)("=== Workload 3: Complex multiply + FIR convolution ===\n");
v44 = 0;
v45 = 0;
v18.allocator<char> const&) (.constprop.0)("complex_mul_scalar");
v24.ScopedTimer(v18, &v44);
v18._M_dispose();
v407 = v72;
v11 = *((long long *)&v73);
v408 = *((long long *)&v73) - v407;
v33 = v408;
v409 = v408 >> 2;
v21.resize(v409);
v78.resize(v409);
if (v408)
{
v33 = v408;
v410 = *((long long *)&v77);
v411 = *((long long *)&v78);
if (...)
{
v413 = (v33 ? v409 : 1);
if (v33 > 28)
{
v414 = 0;
v415 = [D] unsupported_<class 'pyvex.expr.Qop'>();
do
{
v416 = *((int256_t *)(*((long long *)&v74) + (char *)v414));
v417 = *((int256_t *)(v407 + v414));
*((void*)(v410 + v414)) = v206;
*((void*)(v411 + v414)) = v92;
v414 += 32;
v418 = (v92 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v92 & 340282366920938463463374607431768211455, (uint128_t)v206)) & 340282366920938463463374607431768211455;
v419 = (v417 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)v206 >> 32) CONCAT (unsigned int)((unsigned long long)v206 >> 32) CONCAT (unsigned int)((unsigned long long)v206 >> 32) CONCAT (unsigned int)((unsigned long long)v206 >> 32))) & 340282366920938463463374607431768211455;
v420 = (v416 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v206 >> 96) CONCAT (unsigned int)((uint128_t)v206 >> 96) CONCAT (unsigned int)((uint128_t)v206 >> 96) CONCAT (unsigned int)((uint128_t)v206 >> 96))) & 340282366920938463463374607431768211455;
v421 = (v418 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v418, (uint128_t)v419)) & 340282366920938463463374607431768211455;
v424 = (v421 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v421, ((unsigned int)((uint128_t)v206 >> 96) CONCAT (unsigned int)((uint128_t)v206 >> 96) CONCAT (unsigned int)((uint128_t)v206 >> 64) CONCAT (unsigned int)((uint128_t)v206 >> 64)) & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455;
v425 = (v424 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v424, (uint128_t)v420)) & 340282366920938463463374607431768211455;
v426 = (v420 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)(v206 / 0x100000000000000000000000000000000) >> 32) CONCAT (unsigned int)((unsigned long long)(v206 / 0x100000000000000000000000000000000) >> 32) CONCAT (unsigned int)((unsigned long long)(v206 / 0x100000000000000000000000000000000) >> 32) CONCAT (unsigned int)((unsigned long long)(v206 / 0x100000000000000000000000000000000) >> 32))) & 340282366920938463463374607431768211455;
} while ((v427 = (v425 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(AddV((uint128_t)v425, (uint128_t)v423))) & 340282366920938463463374607431768211455, v428 = (v427 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(AddV((uint128_t)v427, (uint128_t)v426))) & 340282366920938463463374607431768211455, v429 = (v426 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)((unsigned int)((uint128_t)v423 >> 96) CONCAT (unsigned int)((uint128_t)v423 >> 96) CONCAT (unsigned int)((uint128_t)v423 >> 64) CONCAT (unsigned int)((uint128_t)v423 >> 64))) & 340282366920938463463374607431768211455, v430 = (v428 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(AddV((uint128_t)v428, (uint128_t)v429))) & 340282366920938463463374607431768211455, v431 = (v429 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(AddV((uint128_t)v430, ((unsigned int)((uint128_t)v423 >> 96) CONCAT (unsigned int)((uint128_t)v423 >> 96) CONCAT (unsigned int)((uint128_t)v423 >> 96) CONCAT (unsigned int)((uint128_t)v423 >> 96)) & 340282366920938463463374607431768211455))) & 340282366920938463463374607431768211455, v414 != (v413 >> 3) * 32));
v432 = v413 & 18446744073709551608;
if (((char)v413 & 7))
{
v402 = v415 & 340282366920938463463374607431768211455;
v233 = v431 & 340282366920938463463374607431768211455;
v234 = v422 & 340282366920938463463374607431768211455;
v235 &= 340282366920938463463374607431768211455;
LABEL_402e23:
v433 = v413 - v432;
if (v433 - 1 > 2)
{
v434 = v432 * 4;
*((void*)(v410 + v434)) = v92 & 340282366920938463463374607431768211455;
*((void*)(v411 + v434)) = v206 & 340282366920938463463374607431768211455;
v432 += v433 & 18446744073709551612;
v435 = v92 & 340282366920938463463374607431768211455;
v402 = ((((v402 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (0x3e800000 CONCAT 0x3e800000 CONCAT 0x3e800000 CONCAT 0x3e800000)) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (0x3f000000 CONCAT 0x3f000000 CONCAT 0x3f000000 CONCAT 0x3f000000)) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)v435 >> 32) CONCAT (unsigned int)((unsigned long long)v435 >> 32) CONCAT (unsigned int)((unsigned long long)v435 >> 32) CONCAT (unsigned int)((unsigned long long)v435 >> 32))) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v435 >> 96) CONCAT (unsigned int)((uint128_t)v435 >> 96) CONCAT (unsigned int)((uint128_t)v435 >> 64) CONCAT (unsigned int)((uint128_t)v435 >> 64))) & 340282366920938463463374607431768211455;
v404 = (v435 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v435 >> 96) CONCAT (unsigned int)((uint128_t)v435 >> 96) CONCAT (unsigned int)((uint128_t)v435 >> 96) CONCAT (unsigned int)((uint128_t)v435 >> 96))) & 340282366920938463463374607431768211455;
if (!((char)v433 & 3))
goto LABEL_40308b;
}
v233 = v107 & 340282366920938463463374607431768211455;
v436 = v432 * 4;
v402 = (v105 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v105 & 340282366920938463463374607431768211455, (uint128_t)v233)) & 340282366920938463463374607431768211455;
*((unsigned int *)((char *)v410 + 4 * v432)) = (unsigned int)v206;
v437 = v92 & 340282366920938463463374607431768211455;
v438 = v96 & 340282366920938463463374607431768211455;
*((unsigned int *)((char *)v411 + 4 * v432)) = v437;
v404 = (v437 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v437, v438)) & 340282366920938463463374607431768211455;
v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v206 & 340282366920938463463374607431768211455, v30)) & 340282366920938463463374607431768211455;
if (v432 + 1 < v409)
{
v234 = v109 & 340282366920938463463374607431768211455;
v233 = v107 & 340282366920938463463374607431768211455;
v402 = (v105 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v105 & 340282366920938463463374607431768211455, (uint128_t)v234)) & 340282366920938463463374607431768211455;
v439 = v92 & 340282366920938463463374607431768211455;
*((unsigned int *)(4 + (char *)v410 + v436)) = (unsigned int)v206;
*((unsigned int *)(4 + (char *)v411 + v436)) = v439;
v404 = (v439 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v439, v438)) & 340282366920938463463374607431768211455;
v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v206 & 340282366920938463463374607431768211455, (uint128_t)v235)) & 340282366920938463463374607431768211455;
if (v432 + 2 < v409)
{
v234 = v109 & 340282366920938463463374607431768211455;
v233 = v107 & 340282366920938463463374607431768211455;
v402 = (v105 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v105 & 340282366920938463463374607431768211455, (uint128_t)v234)) & 340282366920938463463374607431768211455;
v440 = v92 & 340282366920938463463374607431768211455;
*((unsigned int *)(8 + (char *)v410 + v436)) = (unsigned int)v206;
*((unsigned int *)(8 + (char *)v411 + v436)) = v440;
v404 = (v440 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v440, v438)) & 340282366920938463463374607431768211455;
}
}
}
else
{
v404 = v430 & 340282366920938463463374607431768211455;
v402 = v415 & 340282366920938463463374607431768211455;
v233 = v431 & 340282366920938463463374607431768211455;
v234 = v422 & 340282366920938463463374607431768211455;
v235 &= 340282366920938463463374607431768211455;
}
}
else
{
v432 = 0;
goto LABEL_402e23;
}
}
else
{
v234 = v109 & 340282366920938463463374607431768211455;
v441 = 0;
do
{
v402 = (v105 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v105 & 340282366920938463463374607431768211455, (uint128_t)v107 & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455;
v442 = v92 & 340282366920938463463374607431768211455;
*((unsigned int *)((char *)v410 + 0x4 * v441)) = (unsigned int)v206;
*((unsigned int *)((char *)v411 + 0x4 * v441)) = v442;
v404 = (v442 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v442, (uint128_t)v234)) & 340282366920938463463374607431768211455;
v441 += 1;
} while (v441 < v409);
}
}
LABEL_40308b:
v18.allocator<char> const&) (.constprop.0)("complex_mul_avx");
v24.ScopedTimer(v18, &v45);
v18._M_dispose();
v19.resize(v409);
v80.resize(v409);
if (v33 > 28)
{
v443 = 8;
v444 = *((long long *)&v80);
v402 = [D] unsupported_<class 'pyvex.expr.Qop'>();
while (true)
{
v445 = *((int256_t *)(*((long long *)&v74) + v443 * 4 - 32));
v446 = v443 + 8;
v233 = *((int256_t *)(-32 + (char *)v407 + 4 * v443));
*((void*)(*((long long *)&v79) + v443 * 4 - 32)) = v92;
*((void*)(v444 + v443 * 4 - 32)) = v206;
v447 = (v92 * v402 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v206 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455;
v448 = (v206 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v206 / 0x100000000000000000000000000000000 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455;
v449 = (v447 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v447 + v448) & 340282366920938463463374607431768211455;
v450 = (v448 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v449 >> 96) CONCAT (unsigned int)((uint128_t)v449 >> 96) CONCAT (unsigned int)((unsigned long long)v449 >> 32) CONCAT (unsigned int)((unsigned long long)v449 >> 32))) & 340282366920938463463374607431768211455;
v234 = (v445 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV(AddV((uint128_t)v449 + (uint128_t)v450 & 340282366920938463463374607431768211455, ((unsigned long long)(v450 >> 64) CONCAT (unsigned long long)(((v449 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v449 + v450) & 340282366920938463463374607431768211455) >> 64)) & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455, v34)) & 340282366920938463463374607431768211455;
if (v409 < v446)
break;
v443 = v446;
}
}
else
{
v443 = 0;
}
if (v443 < v409)
{
v451 = *((long long *)&v74);
v452 = v443 * 4;
v453 = *((long long *)&v75);
v31 = *((long long *)&v80);
v454 = *((long long *)&v79);
v16 = v409 - v443;
v8 = v16 - 1;
if (v8 > 2)
{
v452 = v443 * 4;
v455 = v452 + 4;
v28 = v454 + v452;
v29 = v31 + v452;
v9 = *((long long *)&v76) + v455;
if (!(((char *)v29 - v9 <= 24 ^ 1) & (char *)v28 - v9 > 24 & v28 - ((char *)v453 + v455) > 24 & v28 - (v451 + v455) > 24 & v29 - (v407 + v455) > 24 & v28 - (v407 + v455) > 24 & v29 - (v451 + v455) > 24 & v29 - ((char *)v453 + v455) > 24) || v29 - (v455 + v454) <= 24)
goto LABEL_404661;
if (v8 > 6)
{
v233 = *((int256_t *)(v451 + v452));
v456 = v16;
v457 = v456 & 18446744073709551608;
v458 = v443 + v457;
v443 = v458;
*((void*)v28) = v92;
v459 = [D] unsupported_<class 'pyvex.expr.Qop'>();
*((void*)v29) = v206;
v460 = (v105 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v105 & 340282366920938463463374607431768211455, (uint128_t)v92)) & 340282366920938463463374607431768211455;
v461 = (v206 * v459 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v92 >> 96) CONCAT (unsigned int)((uint128_t)v92 >> 96) CONCAT (unsigned int)((uint128_t)v92 >> 96) CONCAT (unsigned int)((uint128_t)v92 >> 96))) & 340282366920938463463374607431768211455;
v462 = (v460 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v92 >> 96) CONCAT (unsigned int)((uint128_t)v92 >> 96) CONCAT (unsigned int)((uint128_t)v92 >> 64) CONCAT (unsigned int)((uint128_t)v92 >> 64))) & 340282366920938463463374607431768211455;
v463 = (v92 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v92 / 0x100000000000000000000000000000000 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455;
v464 = (v462 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v462, AddV(((unsigned int)((unsigned long long)v92 >> 32) CONCAT (unsigned int)((unsigned long long)v92 >> 32) CONCAT (unsigned int)((unsigned long long)v92 >> 32) CONCAT (unsigned int)((unsigned long long)v92 >> 32)) & 340282366920938463463374607431768211455, (uint128_t)v460) & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455;
v465 = (v461 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v461, (uint128_t)v464)) & 340282366920938463463374607431768211455;
v466 = (v464 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)v463 >> 32) CONCAT (unsigned int)((unsigned long long)v463 >> 32) CONCAT (unsigned int)((unsigned long long)v463 >> 32) CONCAT (unsigned int)((unsigned long long)v463 >> 32))) & 340282366920938463463374607431768211455;
v402 = (v466 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v466, AddV((uint128_t)v463, (uint128_t)v465) & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455;
v404 = (v463 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v463 >> 96) CONCAT (unsigned int)((uint128_t)v463 >> 96) CONCAT (unsigned int)((uint128_t)v463 >> 96) CONCAT (unsigned int)((uint128_t)v463 >> 96))) & 340282366920938463463374607431768211455;
if (!((char)v456 & 7))
goto LABEL_403608;
v16 = v456 - v457;
if (v16 - 1 <= 2)
goto LABEL_4034e4;
}
else
{
v458 = v443;
}
v233 = (v233 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | *((int128_t *)((char *)v451 + 4 * v458))) & 340282366920938463463374607431768211455;
v467 = v458 * 4;
v468 = (v402 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | *((int128_t *)((char *)v453 + 4 * v458))) & 340282366920938463463374607431768211455;
*((void*)(v454 + v467)) = v92 & 340282366920938463463374607431768211455;
*((void*)(v31 + v467)) = v206 & 340282366920938463463374607431768211455;
v443 += v16 & 18446744073709551612;
v469 = v92 & 340282366920938463463374607431768211455;
v402 = (((((v468 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v468 * v233) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (0x3e800000 CONCAT 0x3e800000 CONCAT 0x3e800000 CONCAT 0x3e800000)) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (0x3f000000 CONCAT 0x3f000000 CONCAT 0x3f000000 CONCAT 0x3f000000)) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)v469 >> 32) CONCAT (unsigned int)((unsigned long long)v469 >> 32) CONCAT (unsigned int)((unsigned long long)v469 >> 32) CONCAT (unsigned int)((unsigned long long)v469 >> 32))) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v469 >> 96) CONCAT (unsigned int)((uint128_t)v469 >> 96) CONCAT (unsigned int)((uint128_t)v469 >> 64) CONCAT (unsigned int)((uint128_t)v469 >> 64))) & 340282366920938463463374607431768211455;
v404 = (v469 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v469 >> 96) CONCAT (unsigned int)((uint128_t)v469 >> 96) CONCAT (unsigned int)((uint128_t)v469 >> 96) CONCAT (unsigned int)((uint128_t)v469 >> 96))) & 340282366920938463463374607431768211455;
if (((char)v16 & 3))
{
LABEL_4034e4:
v233 = v107 & 340282366920938463463374607431768211455;
v470 = v443 * 4;
v402 = (v105 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v105 & 340282366920938463463374607431768211455, (uint128_t)v233)) & 340282366920938463463374607431768211455;
*((unsigned int *)((char *)v454 + 4 * v443)) = (unsigned int)v206;
v471 = v92 & 340282366920938463463374607431768211455;
v472 = v96 & 340282366920938463463374607431768211455;
*((unsigned int *)((char *)v31 + 4 * v443)) = v471;
v404 = (v471 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v471, v472)) & 340282366920938463463374607431768211455;
v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v206 & 340282366920938463463374607431768211455, v34)) & 340282366920938463463374607431768211455;
if (v443 + 1 < v409)
{
v233 = v107 & 340282366920938463463374607431768211455;
v234 = v109 & 340282366920938463463374607431768211455;
v402 = (v105 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v105 & 340282366920938463463374607431768211455, (uint128_t)v234)) & 340282366920938463463374607431768211455;
v473 = v92 & 340282366920938463463374607431768211455;
*((unsigned int *)(4 + (char *)v454 + v470)) = (unsigned int)v206;
*((unsigned int *)(4 + (char *)v31 + v470)) = v473;
v404 = (v473 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v473, v472)) & 340282366920938463463374607431768211455;
if (v443 + 2 < v409)
{
v234 = v109 & 340282366920938463463374607431768211455;
v233 = v107 & 340282366920938463463374607431768211455;
v402 = (v105 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v105 & 340282366920938463463374607431768211455, (uint128_t)v234)) & 340282366920938463463374607431768211455;
*((unsigned int *)(8 + (char *)v454 + v470)) = (unsigned int)v206;
v474 = v92 & 340282366920938463463374607431768211455;
*((unsigned int *)(8 + (char *)v31 + v470)) = v474;
v404 = (v474 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v474, v472)) & 340282366920938463463374607431768211455;
}
}
}
}
else
{
LABEL_404661:
v233 = v107 & 340282366920938463463374607431768211455;
v475 = v92 & 340282366920938463463374607431768211455;
v402 = v105 & 340282366920938463463374607431768211455;
*((unsigned int *)((char *)v454 + 4 * v443)) = (unsigned int)v206;
*((unsigned int *)((char *)v31 + 4 * v443)) = v475;
v404 = (v475 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v475, (uint128_t)v402)) & 340282366920938463463374607431768211455;
v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v206 & 340282366920938463463374607431768211455, v34)) & 340282366920938463463374607431768211455;
if (v443 + 1 < v409)
{
v233 = v107 & 340282366920938463463374607431768211455;
v234 = v109 & 340282366920938463463374607431768211455;
v476 = v92 & 340282366920938463463374607431768211455;
*((unsigned int *)(v454 + v452 + 4)) = (unsigned int)v206;
*((unsigned int *)(v31 + v452 + 4)) = v476;
v404 = (v476 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v476, (uint128_t)v402)) & 340282366920938463463374607431768211455;
v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v206 & 340282366920938463463374607431768211455, (uint128_t)v235)) & 340282366920938463463374607431768211455;
if (v443 + 2 < v409)
{
v234 = v109 & 340282366920938463463374607431768211455;
v233 = v107 & 340282366920938463463374607431768211455;
*((unsigned int *)(v454 + v452 + 8)) = (unsigned int)v206;
v477 = v92 & 340282366920938463463374607431768211455;
*((unsigned int *)(v31 + v452 + 8)) = v477;
v404 = (v477 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v477, (uint128_t)v402)) & 340282366920938463463374607431768211455;
v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v206 & 340282366920938463463374607431768211455, (uint128_t)v235)) & 340282366920938463463374607431768211455;
if (v443 + 3 < v409)
{
v233 = v107 & 340282366920938463463374607431768211455;
v234 = v109 & 340282366920938463463374607431768211455;
v478 = v92 & 340282366920938463463374607431768211455;
*((unsigned int *)(v454 + v452 + 12)) = (unsigned int)v206;
*((unsigned int *)(v31 + v452 + 12)) = v478;
v404 = (v478 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v478, (uint128_t)v402)) & 340282366920938463463374607431768211455;
if (v443 + 4 < v409)
{
v233 = v107 & 340282366920938463463374607431768211455;
v234 = v109 & 340282366920938463463374607431768211455;
v479 = v92 & 340282366920938463463374607431768211455;
*((unsigned int *)(v454 + v452 + 16)) = (unsigned int)v206;
*((unsigned int *)(v31 + v452 + 16)) = v479;
v404 = (v479 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v479, (uint128_t)v402)) & 340282366920938463463374607431768211455;
if (v443 + 5 < v409)
{
v233 = v107 & 340282366920938463463374607431768211455;
v234 = v109 & 340282366920938463463374607431768211455;
v480 = v92 & 340282366920938463463374607431768211455;
*((unsigned int *)(v454 + v452 + 20)) = (unsigned int)v206;
*((unsigned int *)(v31 + v452 + 20)) = v480;
v404 = (v480 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v480, (uint128_t)v402)) & 340282366920938463463374607431768211455;
v235 = (v235 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v206 & 340282366920938463463374607431768211455, AddV((uint128_t)v206 & 340282366920938463463374607431768211455, AddV((uint128_t)v206 & 340282366920938463463374607431768211455, (uint128_t)v235) & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455;
if (v443 + 6 < v409)
{
v233 = v107 & 340282366920938463463374607431768211455;
v234 = v109 & 340282366920938463463374607431768211455;
v481 = v92 & 340282366920938463463374607431768211455;
*((unsigned int *)(v454 + v452 + 24)) = (unsigned int)v206;
*((unsigned int *)(v31 + v452 + 24)) = v481;
v404 = (v481 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v481, (uint128_t)v402)) & 340282366920938463463374607431768211455;
if (v443 + 7 < v409)
{
v234 = v109 & 340282366920938463463374607431768211455;
v233 = v107 & 340282366920938463463374607431768211455;
*((unsigned int *)(v454 + v452 + 28)) = (unsigned int)v206;
v482 = v92 & 340282366920938463463374607431768211455;
*((unsigned int *)(v31 + v452 + 28)) = v482;
v404 = (v482 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MulV((uint128_t)v482, (uint128_t)v402)) & 340282366920938463463374607431768211455;
}
}
}
}
}
}
}
}
}
LABEL_403608:
v483 = v404 & 340282366920938463463374607431768211455;
v484 = v402 & 340282366920938463463374607431768211455;
v485 = v233 & 340282366920938463463374607431768211455;
v486 = v234 & 340282366920938463463374607431768211455;
v487 = v235 & 340282366920938463463374607431768211455;
v46 = 0;
v47 = 0;
v18.allocator<char> const&) (.constprop.0)("complex_fir_scalar");
v24.ScopedTimer(v18, &v46);
v18._M_dispose();
v488 = *((long long *)&v59) - (char *)v27;
v489 = v488 >> 2;
v22.resize(v409);
v82.resize(v409);
if (v33)
{
v490 = 0;
v491 = *((long long *)&v82);
v485 = v107 & 340282366920938463463374607431768211455;
do
{
if (v488)
{
v492 = 0;
do
{
v492 += 1;
v484 = v105 & 340282366920938463463374607431768211455;
} while (v490 >= v492 && v492 < v489);
v483 = v92 & 340282366920938463463374607431768211455;
}
else
{
v483 = v483 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455;
v484 = (v484 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v483 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455;
}
v486 = (v486 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v483 & 340282366920938463463374607431768211455, v33)) & 340282366920938463463374607431768211455;
*((unsigned int *)(*((long long *)&v81) + v490 * 4)) = (unsigned int)v105;
*((unsigned int *)((char *)v491 + 0x4 * v490)) = (unsigned int)v206;
v490 += 1;
} while (v490 < v409);
}
v18.allocator<char> const&) (.constprop.0)("complex_fir_avx");
v24.ScopedTimer(v18, &v47);
v18._M_dispose();
v17.resize(v409);
v84.resize(v409);
if (v11 != v407)
{
v493 = 1;
v494 = *((long long *)&v84);
v495 = 0;
do
{
if (v488 > 28 && v493 > 7)
{
v496 = v484 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455;
v497 = 8;
v498 = v493;
while (true)
{
v499 = v498;
if (v409 >= v499)
v487 = *((int256_t *)(-32 + (char *)v407 + 4 * v499));
v500 = v497 + 8;
v501 = v496;
if (v489 < v500 || !((v501 = v496, v493 >= v500)))
break;
v497 = v500;
v498 = v499 - 8;
}
}
else
{
v496 = v484 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455;
v497 = 0;
v501 = v496;
}
v503 = (v501 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v501 / 0x100000000000000000000000000000000 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455;
v504 = ((v483 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v501 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((v483 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v501 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455) + v503) & 340282366920938463463374607431768211455;
v505 = (v503 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v504 >> 96) CONCAT (unsigned int)((uint128_t)v504 >> 96) CONCAT (unsigned int)((unsigned long long)v504 >> 32) CONCAT (unsigned int)((unsigned long long)v504 >> 32))) & 340282366920938463463374607431768211455;
v506 = ((v505 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned long long)(v505 >> 64) CONCAT (unsigned long long)(((v504 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v504 + v505) & 340282366920938463463374607431768211455) >> 64))) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint128_t)v496 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455;
v507 = v496 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 340282366920938463463374607431768211455;
v508 = (v506 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v506 + v507) & 340282366920938463463374607431768211455;
v509 = (v507 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v508 >> 96) CONCAT (unsigned int)((uint128_t)v508 >> 96) CONCAT (unsigned int)((unsigned long long)v508 >> 32) CONCAT (unsigned int)((unsigned long long)v508 >> 32))) & 340282366920938463463374607431768211455;
v484 = (v509 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned long long)(v509 >> 64) CONCAT (unsigned long long)(((v508 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v508 + v509) & 340282366920938463463374607431768211455) >> 64))) & 340282366920938463463374607431768211455;
if (v495 >= v497 && v497 < v489)
{
do
{
v484 = v105 & 340282366920938463463374607431768211455;
v497 += 1;
} while (v495 >= v497 && v497 < v489);
}
*((unsigned int *)(*((long long *)&v83) + v495 * 4)) = (unsigned int)v92;
v493 += 1;
*((unsigned int *)((char *)v494 + 0x4 * v495)) = (unsigned int)v206;
v495 += 1;
v483 = v92 & 340282366920938463463374607431768211455;
v487 = (v487 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v483, v32)) & 340282366920938463463374607431768211455;
} while (v495 < v409);
v483 &= 340282366920938463463374607431768211455;
v484 &= 340282366920938463463374607431768211455;
v485 = v107 & 340282366920938463463374607431768211455;
v486 &= 340282366920938463463374607431768211455;
v487 &= 340282366920938463463374607431768211455;
}
g_408040.char_traits<char>>("Complex mul scalar: checksum=", 0x1d, v495);
v510 = v483 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v30;
v511 = v510 >> 64;
v512 = g_408040._M_insert<double>((unsigned long long)v510 | v511 * 0);
v512.char_traits<char>>(" time=", 0x7, v181);
v512._M_insert<double>(v44).char_traits<char>>&, char const*) (.isra.0)(" ms\n");
g_408040.char_traits<char>>("Complex mul AVX : checksum=", 0x1d, v181);
v514 = ((v510 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | v511 * 0x10000000000000000) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | (uint256_t)v44) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v34;
v515 = v514 >> 64;
v516 = g_408040._M_insert<double>((unsigned long long)v514 | v515 * 0);
v516.char_traits<char>>(" time=", 0x7, v181);
v516._M_insert<double>(v45).char_traits<char>>&, char const*) (.isra.0)(" ms\n");
g_408040.char_traits<char>>("FIR scalar : checksum=", 0x1d, v181);
v518 = ((v514 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | v515 * 0x10000000000000000) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | (uint256_t)v45) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v33;
v519 = v518 >> 64;
v520 = g_408040._M_insert<double>((unsigned long long)v518 | v519 * 0);
v520.char_traits<char>>(" time=", 0x7, v181);
v520._M_insert<double>(v46).char_traits<char>>&, char const*) (.isra.0)(" ms\n");
g_408040.char_traits<char>>("FIR AVX : checksum=", 0x1d, v181);
v522 = ((v518 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | v519 * 0x10000000000000000) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | (uint256_t)v46) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v32;
v523 = v522 >> 64;
v524 = g_408040._M_insert<double>((unsigned long long)v522 | v523 * 0);
v524.char_traits<char>>(" time=", 0x7, v181);
v524._M_insert<double>(v47).char_traits<char>>&, char const*) (.isra.0)(" ms\n");
g_408040.char_traits<char>>("Delta cmul checksum (AVX - scalar): ", 0x24, v181);
v526 = (((v522 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | v523 * 0x10000000000000000) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | (uint256_t)v47) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(SubV(v96 & 340282366920938463463374607431768211455, v30))) & 340282366920938463463374607431768211455;
v527 = v526 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v526 & 18446744073709551615;
g_408040._M_insert<double>((unsigned long long)v527 | (unsigned long long)(v527 >> 64) * 0).char_traits<char>>&, char const*) (.isra.0)("\n");
g_408040.char_traits<char>>("Delta FIR checksum (AVX - scalar): ", 0x24, v181);
v529 = ((v527 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | (v527 >> 64 & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | SubV((uint128_t)v96 & 340282366920938463463374607431768211455, v33)) & 340282366920938463463374607431768211455;
v530 = ((v529 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v529 & 18446744073709551615) & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | ((v529 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v529 & 18446744073709551615) >> 64 & 18446744073709551615) * 0x10000000000000000) & 340282366920938463463374607431768211455;
g_408040._M_insert<double>((unsigned long long)v530).char_traits<char>>&, char const*) (.isra.0)("\n");
g_408040.char_traits<char>>&, char const*) (.isra.0)("--------------------------------------------------------\n\n");
g_408040.char_traits<char>>&, char const*) (.isra.0)("=== Workload 4: Soft clip / limiter on FIR output ===\n");
v48 = 0;
v49 = 0;
v60.vector(v22);
v62.vector(&v60);
v18.allocator<char> const&) (.constprop.0)("soft_clip_scalar");
v24.ScopedTimer(v18, &v48);
v18._M_dispose();
v532 = v60;
v533 = *((long long *)&v61) - v532;
v534 = v533 >> 2;
if (*((long long *)&v61) != v532)
{
v535 = (v533 ? v534 : 1);
if (v533 > 28)
{
v536 = [D] unsupported_<class 'pyvex.expr.Qop'>();
v537 = [D] unsupported_<class 'pyvex.expr.Qop'>();
v538 = v532;
do
{
v539 = *((int256_t *)v538);
v540 = v538 + 32;
v541 = CmpLTV(UnaryOp unpack, UnaryOp unpack) CONCAT CmpLTV(UnaryOp unpack, UnaryOp unpack);
v542 = (v541 | (CmpLTV(UnaryOp unpack, UnaryOp unpack) CONCAT CmpLTV(UnaryOp unpack, UnaryOp unpack))) ^ CmpEQV(v485, v485);
v543 = v537 & (SarNV(UnaryOp unpack, 31) CONCAT SarNV(UnaryOp unpack, 31)) | (v539 & (SarNV(UnaryOp unpack, 31) CONCAT SarNV(UnaryOp unpack, 31)) | v536 & ~(SarNV(UnaryOp unpack, 31) CONCAT SarNV(UnaryOp unpack, 31))) & ~(SarNV(UnaryOp unpack, 31) CONCAT SarNV(UnaryOp unpack, 31));
v544 = (v487 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)v543 >> 32) CONCAT (unsigned int)((unsigned long long)v543 >> 32) CONCAT (unsigned int)((unsigned long long)v543 >> 32) CONCAT (unsigned int)((unsigned long long)v543 >> 32))) & 340282366920938463463374607431768211455;
v545 = (v542 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v543 >> 96) CONCAT (unsigned int)((uint128_t)v543 >> 96) CONCAT (unsigned int)((uint128_t)v543 >> 96) CONCAT (unsigned int)((uint128_t)v543 >> 96))) & 340282366920938463463374607431768211455;
*((uint256_t *)&v540[32]) = v543;
v546 = (v109 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v109 & 340282366920938463463374607431768211455, (uint128_t)v543)) & 340282366920938463463374607431768211455;
v487 = (v544 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v544, (uint128_t)v546)) & 340282366920938463463374607431768211455;
v547 = (v546 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v543 >> 96) CONCAT (unsigned int)((uint128_t)v543 >> 96) CONCAT (unsigned int)((uint128_t)v543 >> 64) CONCAT (unsigned int)((uint128_t)v543 >> 64))) & 340282366920938463463374607431768211455;
v548 = (v543 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v543 / 0x100000000000000000000000000000000 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455;
v549 = (v547 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v547, (uint128_t)v487)) & 340282366920938463463374607431768211455;
v550 = (v545 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v545, (uint128_t)v549)) & 340282366920938463463374607431768211455;
v551 = (v549 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((unsigned long long)v548 >> 32) CONCAT (unsigned int)((unsigned long long)v548 >> 32) CONCAT (unsigned int)((unsigned long long)v548 >> 32) CONCAT (unsigned int)((unsigned long long)v548 >> 32))) & 340282366920938463463374607431768211455;
v552 = (v550 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v548, (uint128_t)v550)) & 340282366920938463463374607431768211455;
v553 = (v551 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v551, (uint128_t)v552)) & 340282366920938463463374607431768211455;
v554 = (v552 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v548 >> 96) CONCAT (unsigned int)((uint128_t)v548 >> 96) CONCAT (unsigned int)((uint128_t)v548 >> 64) CONCAT (unsigned int)((uint128_t)v548 >> 64))) & 340282366920938463463374607431768211455;
v556 = (v553 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV(AddV((uint128_t)v554, (uint128_t)v553) & 340282366920938463463374607431768211455, ((unsigned int)((uint128_t)v548 >> 96) CONCAT (unsigned int)((uint128_t)v548 >> 96) CONCAT (unsigned int)((uint128_t)v548 >> 96) CONCAT (unsigned int)((uint128_t)v548 >> 96)) & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455;
v538 = v540;
} while (v538 != (v535 >> 3) * 32 + v532);
v557 = v535 & 18446744073709551608;
if (((char)v535 & 7))
{
v530 = v555 & 340282366920938463463374607431768211455;
v484 = v537 & 340282366920938463463374607431768211455;
v486 = v556 & 340282366920938463463374607431768211455;
LABEL_403d41:
v558 = v535 - v557;
if (v558 - 1 > 2)
{
v559 = v532 + v557 * 4;
v486 = (v486 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455;
v560 = (1061997773 CONCAT 1061997773 CONCAT 1061997773 CONCAT 1061997773) & 340282366920938463463374607431768211455;
v561 = (v530 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (3209481421 CONCAT 3209481421 CONCAT 3209481421 CONCAT 3209481421)) & 340282366920938463463374607431768211455;
v562 = *(v559) & 340282366920938463463374607431768211455;
v563 = CmpLTV(v560, v562) & 340282366920938463463374607431768211455;
v564 = (v561 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(v562 & SarNV(((v563 | CmpLTV(v562, (uint128_t)v561) & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455 ^ (uint128_t)v486) & 340282366920938463463374607431768211455, 31) | (uint128_t)v561 & ~(SarNV(((v563 | CmpLTV(v562, (uint128_t)v561) & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455 ^ (uint128_t)v486) & 340282366920938463463374607431768211455, 31)))) & 340282366920938463463374607431768211455;
v565 = (v564 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v560 & SarNV(v563, 31) | (uint128_t)v564 & ~(SarNV(v563, 31))) & 340282366920938463463374607431768211455;
*(v559) = v565;
v557 += v558 & 18446744073709551612;
v530 = (v565 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v565 >> 96) CONCAT (unsigned int)((uint128_t)v565 >> 96) CONCAT (unsigned int)((uint128_t)v565 >> 96) CONCAT (unsigned int)((uint128_t)v565 >> 96))) & 340282366920938463463374607431768211455;
if (!((char)v558 & 3))
goto LABEL_403e8f;
}
v566 = v557 * 4;
v567 = v92 & 340282366920938463463374607431768211455;
if (((CmpF((unsigned long long)v567, 1061997773) & 69 | (char)((CmpF((unsigned long long)v567, 1061997773) & 69) >> 6)) & 1) == 1)
v530 = (v567 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MaxV((uint128_t)v206 & 340282366920938463463374607431768211455, (uint128_t)v567)) & 340282366920938463463374607431768211455;
else
v530 = v92 & 340282366920938463463374607431768211455;
*((unsigned int *)((char *)v532 + v566)) = (unsigned int)v92;
if (v557 + 1 < v534)
{
v568 = v92 & 340282366920938463463374607431768211455;
if (((CmpF((unsigned long long)v568, 1061997773) & 69 | (char)((CmpF((unsigned long long)v568, 1061997773) & 69) >> 6)) & 1) == 1)
v530 = (v568 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MaxV((uint128_t)v206 & 340282366920938463463374607431768211455, (uint128_t)v568)) & 340282366920938463463374607431768211455;
else
v530 = v92 & 340282366920938463463374607431768211455;
*((unsigned int *)(4 + (char *)v532 + v566)) = (unsigned int)v92;
if (v557 + 2 < v534)
{
v569 = v92 & 340282366920938463463374607431768211455;
if (((CmpF((unsigned long long)v569, 1061997773) & 69 | (char)((CmpF((unsigned long long)v569, 1061997773) & 69) >> 6)) & 1) != 1)
v530 = v92 & 340282366920938463463374607431768211455;
else
v530 = (v569 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | MaxV((uint128_t)v206 & 340282366920938463463374607431768211455, (uint128_t)v569)) & 340282366920938463463374607431768211455;
*((unsigned int *)(8 + (char *)v532 + v566)) = (unsigned int)v92;
}
}
}
else
{
v530 = v555 & 340282366920938463463374607431768211455;
v486 = v556 & 340282366920938463463374607431768211455;
}
}
else
{
v557 = 0;
goto LABEL_403d41;
}
}
LABEL_403e8f:
v18.allocator<char> const&) (.constprop.0)("soft_clip_avx");
v24.ScopedTimer(v18, &v49);
v18._M_dispose();
v570 = v62;
v571 = *((long long *)&v63) - (char *)v570;
v572 = v571 >> 2;
if (v571 > 28)
{
v573 = 8;
v574 = [D] unsupported_<class 'pyvex.expr.Qop'>();
v575 = [D] unsupported_<class 'pyvex.expr.Qop'>();
while (true)
{
v576 = v573 + 8;
v577 = MaxV(MinV(*((int256_t *)&v570[8 + v573]), v575), v574);
*((void*)&v570[8 + v573]) = v577;
v578 = (v577 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v577 / 0x100000000000000000000000000000000 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455;
v579 = ((v530 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v577 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((v530 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v577 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455) + v578) & 340282366920938463463374607431768211455;
v580 = (v578 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v579 >> 96) CONCAT (unsigned int)((uint128_t)v579 >> 96) CONCAT (unsigned int)((unsigned long long)v579 >> 32) CONCAT (unsigned int)((unsigned long long)v579 >> 32))) & 340282366920938463463374607431768211455;
v530 = ((v579 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v579 + v580) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | AddV((uint128_t)v579 + (uint128_t)v580 & 340282366920938463463374607431768211455, ((unsigned long long)(v580 >> 64) CONCAT (unsigned long long)(((v579 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v579 + v580) & 340282366920938463463374607431768211455) >> 64)) & 340282366920938463463374607431768211455)) & 340282366920938463463374607431768211455;
if (v572 < v576)
break;
v573 = v576;
}
}
else
{
v573 = 0;
}
if (v573 < v572)
{
v581 = v572 - v573;
if (v581 - 1 > 6)
{
v582 = &v570[v573];
v583 = [D] unsupported_<class 'pyvex.expr.Qop'>();
v584 = [D] unsupported_<class 'pyvex.expr.Qop'>();
v585 = *(v582);
v586 = v583 & (SarNV(UnaryOp unpack, 31) CONCAT SarNV(UnaryOp unpack, 31)) | (v585 & (SarNV(UnaryOp unpack, 31) CONCAT SarNV(UnaryOp unpack, 31)) | v584 & ~(SarNV(UnaryOp unpack, 31) CONCAT SarNV(UnaryOp unpack, 31))) & ~(SarNV(UnaryOp unpack, 31) CONCAT SarNV(UnaryOp unpack, 31));
*(v582) = v586;
v587 = v581 & 18446744073709551608;
v588 = (v586 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v586 / 0x100000000000000000000000000000000 & 340282366920938463463374607431768211455) & 340282366920938463463374607431768211455;
v589 = v573 + v587;
v530 = (v588 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v588 >> 96) CONCAT (unsigned int)((uint128_t)v588 >> 96) CONCAT (unsigned int)((uint128_t)v588 >> 96) CONCAT (unsigned int)((uint128_t)v588 >> 96))) & 340282366920938463463374607431768211455;
if (!((char)v581 & 7))
goto LABEL_404178;
}
else
{
v589 = v573;
v587 = 0;
}
v590 = v581 - v587;
if (v590 - 1 > 2)
{
v591 = (1061997773 CONCAT 1061997773 CONCAT 1061997773 CONCAT 1061997773) & 340282366920938463463374607431768211455;
v592 = (v530 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (3209481421 CONCAT 3209481421 CONCAT 3209481421 CONCAT 3209481421)) & 340282366920938463463374607431768211455;
v593 = &v570[v587 + v573];
v594 = *(v593) & 340282366920938463463374607431768211455;
v589 += v590 & 18446744073709551612;
v595 = CmpLTV(v591, v594) & 340282366920938463463374607431768211455;
v596 = ...;
v597 = (v596 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | v591 & SarNV(v595, 31) | (uint128_t)v596 & ~(SarNV(v595, 31))) & 340282366920938463463374607431768211455;
*(v593) = v597;
v530 = (v597 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | ((unsigned int)((uint128_t)v597 >> 96) CONCAT (unsigned int)((uint128_t)v597 >> 96) CONCAT (unsigned int)((uint128_t)v597 >> 96) CONCAT (unsigned int)((uint128_t)v597 >> 96))) & 340282366920938463463374607431768211455;
if (!((unsigned int)v590 & 3))
goto LABEL_404178;
}
v598 = v589 * 4;
v599 = v92 & 340282366920938463463374607431768211455;
v530 = (((CmpF((unsigned long long)v599, 1061997773) & 69 | (CmpF((unsigned long long)v599, 1061997773) & 69) >> 6) & 1) == 1 ? (v599 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(MaxV(v206 & 340282366920938463463374607431768211455, (uint128_t)v599))) & 340282366920938463463374607431768211455 : v92 & 340282366920938463463374607431768211455);
*((unsigned int *)(v570 + v598)) = (unsigned int)v92;
if (v589 + 1 < v572)
{
v600 = v92 & 340282366920938463463374607431768211455;
v530 = (((CmpF((unsigned long long)v600, 1061997773) & 69 | (CmpF((unsigned long long)v600, 1061997773) & 69) >> 6) & 1) == 1 ? (v600 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(MaxV(v206 & 340282366920938463463374607431768211455, (uint128_t)v600))) & 340282366920938463463374607431768211455 : v92 & 340282366920938463463374607431768211455);
*((unsigned int *)(v570 + v598 + 4)) = (unsigned int)v92;
if (v589 + 2 < v572)
{
v601 = v92 & 340282366920938463463374607431768211455;
v530 = (((CmpF((unsigned long long)v601, 1061997773) & 69 | (CmpF((unsigned long long)v601, 1061997773) & 69) >> 6) & 1) == 1 ? (v601 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(MaxV(v206 & 340282366920938463463374607431768211455, (uint128_t)v601))) & 340282366920938463463374607431768211455 : v92 & 340282366920938463463374607431768211455);
*((unsigned int *)(v570 + v598 + 8)) = (unsigned int)v92;
}
}
}
LABEL_404178:
g_408040.char_traits<char>>("Soft clip scalar: checksum=", 0x1b, v570);
v602 = v530 & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v33;
v603 = v602 >> 64;
v604 = g_408040._M_insert<double>((unsigned long long)v602 | v603 * 0);
v604.char_traits<char>>(" time=", 0x7, v181);
v604._M_insert<double>(v48).char_traits<char>>&, char const*) (.isra.0)(" ms\n");
g_408040.char_traits<char>>("Soft clip AVX : checksum=", 0x1b, v181);
v606 = ((v602 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | v603 * 0x10000000000000000) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | (uint256_t)v48) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v34;
v607 = v606 >> 64;
v608 = g_408040._M_insert<double>((unsigned long long)v606 | v607 * 0);
v608.char_traits<char>>(" time=", 0x7, v181);
v608._M_insert<double>(v49).char_traits<char>>&, char const*) (.isra.0)(" ms\n");
g_408040.char_traits<char>>("Delta clip checksum (AVX - scalar): ", 0x24, v181);
v610 = (((v606 & 115792089237316195423570985008687907852929702298719625576012656144555070980095 | v607 * 0x10000000000000000) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | (uint256_t)v49) & 340282366920938463463374607431768211455 & 0xffffffffffffffffffffffffffffffff00000000000000000000000000000000 | (uint256_t)(SubV(v96 & 340282366920938463463374607431768211455, v33))) & 340282366920938463463374607431768211455;
g_408040._M_insert<double>((unsigned long long)v610 | (unsigned long long)((v610 & 0xffffffffffffffffffffffffffffffffffffffffffffffff0000000000000000 | v610 & 18446744073709551615) >> 64) * 0).char_traits<char>>&, char const*) (.isra.0)("\n");
g_408040.char_traits<char>>&, char const*) (.isra.0)("\nDone.\n");
return 0;
}
#include <immintrin.h>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <complex>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <random>
#include <string>
#include <vector>
// Short fixed-width aliases used throughout this file.
using f32 = float;
using f64 = double;
using u32 = std::uint32_t;
using u64 = std::uint64_t;
// Problem sizes for the workloads below.
// Element count of the SAXPY / cosine-similarity vectors allocated in main().
constexpr std::size_t N_SAXPY = 0x400000u; // 4,194,304 elements
// Image dimensions passed to init_image() and the blur routines.
constexpr int WIDTH = 1920;
constexpr int HEIGHT = 1080;
// Total pixel count; WIDTH is widened first so the product is computed in std::size_t.
constexpr std::size_t N_PIXELS = static_cast<std::size_t>(WIDTH) * HEIGHT; // 2,073,600
// Element count of every ComplexSoA buffer allocated in main().
constexpr std::size_t N_COMPLEX = 0x40000u; // 262,144 elements
// Number of coefficients produced by make_fir_kernel().
constexpr int FIR_TAPS = 16;
// -----------------------------------------------------------------------------
// ScopedTimer: RAII timing helper writing elapsed time [ms] to a referenced slot
// -----------------------------------------------------------------------------
// RAII stopwatch: captures the time at construction and, on destruction,
// writes the elapsed time in milliseconds into the double it was given.
//
// The timer keeps a reference to the output slot, so that slot must outlive
// the timer. Copying and moving are disabled: a duplicate would write the same
// slot a second time from its own destructor.
struct ScopedTimer
{
    using clock = std::chrono::high_resolution_clock;

    std::string label;    // caller-supplied name; stored, not read by the timer itself
    double &out_ms;       // destination for the elapsed time [ms]
    clock::time_point t0; // time captured at construction

    // name:   label kept alongside the measurement.
    // target: receives the elapsed milliseconds when the timer is destroyed.
    ScopedTimer(const std::string &name, double &target)
        : label(name), out_ms(target), t0(clock::now())
    {
    }

    ScopedTimer(const ScopedTimer &) = delete;
    ScopedTimer &operator=(const ScopedTimer &) = delete;
    ScopedTimer(ScopedTimer &&) = delete;
    ScopedTimer &operator=(ScopedTimer &&) = delete;

    // Publishes the elapsed time as a fractional millisecond count.
    ~ScopedTimer()
    {
        const auto t1 = clock::now();
        out_ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
    }
};
// -----------------------------------------------------------------------------
// ComplexSoA: separate real/imaginary storage for AVX-friendly layout
// -----------------------------------------------------------------------------
// Structure-of-arrays complex buffer: real and imaginary parts are kept in two
// separate float vectors, which the constructor sizes identically.
struct ComplexSoA
{
    std::vector<f32> re; // real components
    std::vector<f32> im; // imaginary components

    // Creates n value-initialised elements in both arrays (none by default).
    explicit ComplexSoA(std::size_t n = 0)
        : re(n), im(n)
    {
    }

    // Element count, taken from the real array.
    std::size_t size() const
    {
        return re.size();
    }
};
// -----------------------------------------------------------------------------
// Checksums
// -----------------------------------------------------------------------------
static double checksum_real(const std::vector<f32> &v)
{
double sum = 0.0;
for (f32 x : v)
sum += static_cast<double>(x);
return sum;
}
// Sums re[i] + im[i] over all c.size() elements, accumulating in double
// precision. Each pair is added together first and then folded into the total.
static double checksum_complex(const ComplexSoA &c)
{
    const std::size_t count = c.size();
    double total = 0.0;
    std::size_t pos = 0;
    while (pos < count)
    {
        const double real_part = static_cast<double>(c.re[pos]);
        const double imag_part = static_cast<double>(c.im[pos]);
        total += real_part + imag_part;
        ++pos;
    }
    return total;
}
// -----------------------------------------------------------------------------
// Initialisation helpers
// -----------------------------------------------------------------------------
static void init_saxpy_vectors(std::vector<f32> &x,
std::vector<f32> &y)
{
const std::size_t n = x.size();
std::mt19937 rng(0x1234abcdU);
std::uniform_real_distribution<f32> dist(-1.0f, 1.0f);
for (std::size_t i = 0; i < n; ++i)
{
x[i] = dist(rng);
y[i] = dist(rng);
}
}
static void init_image(std::vector<f32> &img,
int width,
int height)
{
const std::size_t n = static_cast<std::size_t>(width) * height;
std::mt19937 rng(0x9876fedcU);
std::uniform_real_distribution<f32> dist(0.0f, 1.0f);
for (std::size_t i = 0; i < n; ++i)
img[i] = dist(rng);
}
// Fills every element of c with values drawn from [-1, 1), using a Mersenne
// Twister seeded with `seed`. For each index the real part is drawn first,
// then the imaginary part.
static void fill_complex(ComplexSoA &c, u32 seed)
{
    std::mt19937 engine(seed);
    std::uniform_real_distribution<f32> uniform(-1.0f, 1.0f);

    const std::size_t count = c.size();
    std::size_t pos = 0;
    while (pos != count)
    {
        c.re[pos] = uniform(engine);
        c.im[pos] = uniform(engine);
        ++pos;
    }
}
// FIR kernel: 16-tap Gaussian-shaped low-pass, normalised to sum=1
static std::vector<f32> make_fir_kernel()
{
std::vector<f32> h(FIR_TAPS);
const float center = 7.5f; // matches (i - 7.5) in decompiled constants
const float scale = 0.125f; // 0x3e000000
const float pi = 3.1415927410f; // 0x40490fdb
for (int i = 0; i < FIR_TAPS; ++i)
{
const float x = (static_cast<float>(i) - center) * scale;
const float g = std::exp(-x * x);
h[i] = g;
// Extra trigonometric path used only to exercise exp/cos-like work;
// result not stored.
const float phase = (x + 0.5f) * pi * 0.0625f; // 0x3f000000 * pi * 0x3d800000
(void)std::cos(phase);
}
float sum = 0.0f;
for (float v : h)
sum += v;
if (sum != 0.0f)
{
const float inv = 1.0f / sum;
for (float &v : h)
v *= inv;
}
return h;
}
// -----------------------------------------------------------------------------
// Workload 1: SAXPY + cosine similarity
// -----------------------------------------------------------------------------
static double saxpy_scalar(std::vector<f32> &y,
const std::vector<f32> &x,
f32 a)
{
const std::size_t n = y.size();
double sum = 0.0;
for (std::size_t i = 0; i < n; ++i)
{
const float yi = a * x[i] + y[i];
y[i] = yi;
sum += static_cast<double>(yi);
}
return sum;
}
// AVX SAXPY: updates y[i] = a * x[i] + y[i] in place, eight floats at a time,
// and returns the sum of the updated values.
// Full 8-wide blocks are accumulated per lane in single precision; the eight
// lane totals and any leftover elements are then added in double precision.
// The element count comes from y, so x must hold at least y.size() elements.
static double saxpy_avx(std::vector<f32> &y,
                        const std::vector<f32> &x,
                        f32 a)
{
    const std::size_t count = y.size();
    const float *src = x.data();
    float *dst = y.data();

    const __m256 scale = _mm256_set1_ps(a);
    __m256 lane_sums = _mm256_setzero_ps();

    // Vector body: whole blocks of eight elements.
    std::size_t pos = 0;
    while (pos + 8 <= count)
    {
        const __m256 xs = _mm256_loadu_ps(src + pos);
        const __m256 ys = _mm256_loadu_ps(dst + pos);
        const __m256 updated = _mm256_add_ps(_mm256_mul_ps(scale, xs), ys);
        _mm256_storeu_ps(dst + pos, updated);
        lane_sums = _mm256_add_ps(lane_sums, updated);
        pos += 8;
    }

    // Spill the per-lane totals and reduce them in lane order.
    alignas(32) float lanes[8];
    _mm256_storeu_ps(lanes, lane_sums);
    double total = 0.0;
    for (const float lane : lanes)
        total += static_cast<double>(lane);

    // Scalar remainder (fewer than eight elements).
    while (pos < count)
    {
        const float updated = a * src[pos] + dst[pos];
        dst[pos] = updated;
        total += static_cast<double>(updated);
        ++pos;
    }
    return total;
}
// Scalar cosine similarity of x and y: dot(x, y) / (|x| * |y|), with all
// accumulation done in double precision.
// Returns 0.0 when the product of the norms is exactly zero.
// The element count comes from x, so y must hold at least x.size() elements.
static double cosine_scalar(const std::vector<f32> &x,
                            const std::vector<f32> &y)
{
    double dot = 0.0;
    double x_sq = 0.0;
    double y_sq = 0.0;

    const std::size_t count = x.size();
    for (std::size_t pos = 0; pos != count; ++pos)
    {
        const double xv = x[pos];
        const double yv = y[pos];
        dot += xv * yv;
        x_sq += xv * xv;
        y_sq += yv * yv;
    }

    const double denom = std::sqrt(x_sq) * std::sqrt(y_sq);
    return (denom == 0.0) ? 0.0 : dot / denom;
}
// AVX cosine similarity of x and y: dot(x, y) / (|x| * |y|).
// Full 8-wide blocks accumulate the dot product and both squared norms per
// lane in single precision; lane totals and leftover elements are then
// combined in double precision.
// Returns 0.0 when the product of the norms is exactly zero.
// The element count comes from x, so y must hold at least x.size() elements.
static double cosine_avx(const std::vector<f32> &x,
                         const std::vector<f32> &y)
{
    const std::size_t count = x.size();
    const float *xs = x.data();
    const float *ys = y.data();

    __m256 dot_lanes = _mm256_setzero_ps();
    __m256 xx_lanes = _mm256_setzero_ps();
    __m256 yy_lanes = _mm256_setzero_ps();

    // Vector body: whole blocks of eight elements.
    std::size_t pos = 0;
    while (pos + 8 <= count)
    {
        const __m256 vx = _mm256_loadu_ps(xs + pos);
        const __m256 vy = _mm256_loadu_ps(ys + pos);
        dot_lanes = _mm256_add_ps(dot_lanes, _mm256_mul_ps(vx, vy));
        xx_lanes = _mm256_add_ps(xx_lanes, _mm256_mul_ps(vx, vx));
        yy_lanes = _mm256_add_ps(yy_lanes, _mm256_mul_ps(vy, vy));
        pos += 8;
    }

    // Spill the three accumulators and reduce each one in lane order.
    alignas(32) float dot_buf[8];
    alignas(32) float xx_buf[8];
    alignas(32) float yy_buf[8];
    _mm256_storeu_ps(dot_buf, dot_lanes);
    _mm256_storeu_ps(xx_buf, xx_lanes);
    _mm256_storeu_ps(yy_buf, yy_lanes);

    double dot = 0.0;
    double x_sq = 0.0;
    double y_sq = 0.0;
    for (int lane = 0; lane != 8; ++lane)
    {
        dot += static_cast<double>(dot_buf[lane]);
        x_sq += static_cast<double>(xx_buf[lane]);
        y_sq += static_cast<double>(yy_buf[lane]);
    }

    // Scalar remainder (fewer than eight elements), in double precision.
    while (pos < count)
    {
        const double xv = xs[pos];
        const double yv = ys[pos];
        dot += xv * yv;
        x_sq += xv * xv;
        y_sq += yv * yv;
        ++pos;
    }

    const double denom = std::sqrt(x_sq) * std::sqrt(y_sq);
    return (denom == 0.0) ? 0.0 : dot / denom;
}
// -----------------------------------------------------------------------------
// Workload 2: 2D 5-point blur on 1080p image
// -----------------------------------------------------------------------------
// Scalar 5-point blur of a row-major width x height image.
// Border pixels are copied from src unchanged; every interior pixel becomes
// 0.2 * (centre + up + down + left + right).
//
// src, dst: must each hold at least width * height elements.
// Returns checksum_real(dst), i.e. the sum over the whole destination vector.
// A non-positive width or height describes an empty image: nothing is written
// and the checksum of the untouched dst is returned.
static double blur5_scalar(const std::vector<f32> &src,
                           std::vector<f32> &dst,
                           int width,
                           int height)
{
    // Without this guard (h - 1) * w and (w - 1) wrap around in unsigned
    // arithmetic and the border copy indexes far outside the buffers.
    if (width <= 0 || height <= 0)
        return checksum_real(dst);

    const std::size_t w = static_cast<std::size_t>(width);
    const std::size_t h = static_cast<std::size_t>(height);

    // Copy borders: first and last row ...
    const std::size_t last_row = (h - 1) * w;
    for (std::size_t x = 0; x < w; ++x)
    {
        dst[x] = src[x];
        dst[last_row + x] = src[last_row + x];
    }
    // ... then first and last column of the rows in between.
    for (std::size_t y = 1; y + 1 < h; ++y)
    {
        dst[y * w] = src[y * w];
        dst[y * w + (w - 1)] = src[y * w + (w - 1)];
    }

    // Interior 5-point stencil
    const float scale = 0.2f;
    for (std::size_t y = 1; y + 1 < h; ++y)
    {
        for (std::size_t x = 1; x + 1 < w; ++x)
        {
            const std::size_t idx = y * w + x;
            const float c = src[idx];
            const float up = src[idx - w];
            const float dn = src[idx + w];
            const float lf = src[idx - 1];
            const float rt = src[idx + 1];
            dst[idx] = scale * (c + up + dn + lf + rt);
        }
    }
    return checksum_real(dst);
}
// AVX 5-point blur of a row-major width x height image.
// Border pixels are copied from src unchanged; every interior pixel becomes
// 0.2 * (centre + up + down + left + right). Interior pixels are processed
// eight at a time, with a scalar loop for the remainder of each row.
//
// src, dst: must each hold at least width * height elements.
// Returns checksum_real(dst), i.e. the sum over the whole destination vector.
// A non-positive width or height describes an empty image: nothing is written
// and the checksum of the untouched dst is returned.
static double blur5_avx(const std::vector<f32> &src,
                        std::vector<f32> &dst,
                        int width,
                        int height)
{
    // Without this guard (h - 1) * w and (w - 1) wrap around in unsigned
    // arithmetic: the border copy and the vector loop bound would both run
    // far outside the buffers.
    if (width <= 0 || height <= 0)
        return checksum_real(dst);

    const std::size_t w = static_cast<std::size_t>(width);
    const std::size_t h = static_cast<std::size_t>(height);
    const float *ps = src.data();
    float *pd = dst.data();

    // Copy borders (scalar): first and last row ...
    const std::size_t last_row = (h - 1) * w;
    for (std::size_t x = 0; x < w; ++x)
    {
        pd[x] = ps[x];
        pd[last_row + x] = ps[last_row + x];
    }
    // ... then first and last column of the rows in between.
    for (std::size_t y = 1; y + 1 < h; ++y)
    {
        pd[y * w] = ps[y * w];
        pd[y * w + (w - 1)] = ps[y * w + (w - 1)];
    }

    const __m256 vscale = _mm256_set1_ps(0.2f);
    for (std::size_t y = 1; y + 1 < h; ++y)
    {
        std::size_t x = 1;
        // The bound keeps all eight lanes (x .. x+7) inside the interior, so
        // the right-neighbour load never reads past the last column.
        for (; x + 7 < w - 1; x += 8)
        {
            const std::size_t idx = y * w + x;
            __m256 c = _mm256_loadu_ps(ps + idx);
            __m256 up = _mm256_loadu_ps(ps + idx - w);
            __m256 dn = _mm256_loadu_ps(ps + idx + w);
            __m256 lf = _mm256_loadu_ps(ps + idx - 1);
            __m256 rt = _mm256_loadu_ps(ps + idx + 1);
            // Summed as ((c + up) + (dn + lf)) + rt; the scalar tail below
            // adds strictly left to right, so the two paths may round
            // differently in the last bits.
            __m256 sum1 = _mm256_add_ps(c, up);
            __m256 sum2 = _mm256_add_ps(dn, lf);
            __m256 sum = _mm256_add_ps(_mm256_add_ps(sum1, sum2), rt);
            __m256 out = _mm256_mul_ps(sum, vscale);
            _mm256_storeu_ps(pd + idx, out);
        }
        // Tail
        for (; x + 1 < w; ++x)
        {
            const std::size_t idx = y * w + x;
            const float c = ps[idx];
            const float up = ps[idx - w];
            const float dn = ps[idx + w];
            const float lf = ps[idx - 1];
            const float rt = ps[idx + 1];
            pd[idx] = 0.2f * (c + up + dn + lf + rt);
        }
    }
    return checksum_real(dst);
}
// -----------------------------------------------------------------------------
// Workload 3: Complex multiply + FIR convolution
// -----------------------------------------------------------------------------
// Scalar element-wise complex product out[i] = a[i] * b[i] on
// structure-of-arrays buffers.
// Returns the double-precision sum of (real + imaginary) over all results.
// The element count comes from a, so b and out must hold at least a.size()
// elements.
static double complex_mul_scalar(const ComplexSoA &a,
                                 const ComplexSoA &b,
                                 ComplexSoA &out)
{
    double total = 0.0;
    const std::size_t count = a.size();
    for (std::size_t pos = 0; pos != count; ++pos)
    {
        const float a_re = a.re[pos];
        const float a_im = a.im[pos];
        const float b_re = b.re[pos];
        const float b_im = b.im[pos];

        // (a_re + i a_im)(b_re + i b_im)
        const float prod_re = a_re * b_re - a_im * b_im;
        const float prod_im = a_re * b_im + a_im * b_re;

        out.re[pos] = prod_re;
        out.im[pos] = prod_im;
        total += static_cast<double>(prod_re) + static_cast<double>(prod_im);
    }
    return total;
}
// AVX element-wise complex product out[i] = a[i] * b[i] on
// structure-of-arrays buffers, eight elements at a time.
// Real and imaginary results of full blocks are accumulated per lane in single
// precision; lane totals and leftover elements are then summed in double.
// Returns the sum of (real + imaginary) over all results.
// The element count comes from a, so b and out must hold at least a.size()
// elements.
static double complex_mul_avx(const ComplexSoA &a,
                              const ComplexSoA &b,
                              ComplexSoA &out)
{
    const std::size_t count = a.size();
    const float *a_re = a.re.data();
    const float *a_im = a.im.data();
    const float *b_re = b.re.data();
    const float *b_im = b.im.data();
    float *out_re = out.re.data();
    float *out_im = out.im.data();

    __m256 re_lanes = _mm256_setzero_ps();
    __m256 im_lanes = _mm256_setzero_ps();

    // Vector body: whole blocks of eight elements.
    std::size_t pos = 0;
    while (pos + 8 <= count)
    {
        const __m256 var = _mm256_loadu_ps(a_re + pos);
        const __m256 vai = _mm256_loadu_ps(a_im + pos);
        const __m256 vbr = _mm256_loadu_ps(b_re + pos);
        const __m256 vbi = _mm256_loadu_ps(b_im + pos);

        // real = ar*br - ai*bi, imag = ar*bi + ai*br
        const __m256 prod_re = _mm256_sub_ps(_mm256_mul_ps(var, vbr),
                                             _mm256_mul_ps(vai, vbi));
        const __m256 prod_im = _mm256_add_ps(_mm256_mul_ps(var, vbi),
                                             _mm256_mul_ps(vai, vbr));

        _mm256_storeu_ps(out_re + pos, prod_re);
        _mm256_storeu_ps(out_im + pos, prod_im);
        re_lanes = _mm256_add_ps(re_lanes, prod_re);
        im_lanes = _mm256_add_ps(im_lanes, prod_im);
        pos += 8;
    }

    // Spill both accumulators and reduce them lane by lane.
    alignas(32) float re_buf[8];
    alignas(32) float im_buf[8];
    _mm256_storeu_ps(re_buf, re_lanes);
    _mm256_storeu_ps(im_buf, im_lanes);
    double total = 0.0;
    for (int lane = 0; lane != 8; ++lane)
        total += static_cast<double>(re_buf[lane]) + static_cast<double>(im_buf[lane]);

    // Scalar remainder (fewer than eight elements).
    while (pos < count)
    {
        const float ar_s = a_re[pos];
        const float ai_s = a_im[pos];
        const float br_s = b_re[pos];
        const float bi_s = b_im[pos];
        const float prod_re = ar_s * br_s - ai_s * bi_s;
        const float prod_im = ar_s * bi_s + ai_s * br_s;
        out_re[pos] = prod_re;
        out_im[pos] = prod_im;
        total += static_cast<double>(prod_re) + static_cast<double>(prod_im);
        ++pos;
    }
    return total;
}
static double complex_fir_scalar(const ComplexSoA &in,
const std::vector<f32> &h,
ComplexSoA &out)
{
const std::size_t n = in.size();
const int taps = static_cast<int>(h.size());
double sum = 0.0;
for (std::size_t i = 0; i < n; ++i)
{
float acc_re = 0.0f;
float acc_im = 0.0f;
const std::size_t limit = (i + 1 < static_cast<std::size_t>(taps))
? (i + 1)
: static_cast<std::size_t>(taps);
for (std::size_t k = 0; k < limit; ++k)
{
const float coeff = h[static_cast<int>(k)];
const std::size_t idx = i - k;
acc_re += coeff * in.re[idx];
acc_im += coeff * in.im[idx];
}
out.re[i] = acc_re;
out.im[i] = acc_im;
sum += static_cast<double>(acc_re) + static_cast<double>(acc_im);
}
return sum;
}
static double complex_fir_avx(const ComplexSoA &in,
const std::vector<f32> &h,
ComplexSoA &out)
{
const std::size_t n = in.size();
const int taps = static_cast<int>(h.size());
const float *re = in.re.data();
const float *im = in.im.data();
float *ore = out.re.data();
float *oim = out.im.data();
double sum = 0.0;
// Head region: scalar (insufficient history for a full vector block)
const std::size_t start = static_cast<std::size_t>(taps - 1);
for (std::size_t i = 0; i < std::min(start, n); ++i)
{
float acc_re = 0.0f;
float acc_im = 0.0f;
const std::size_t limit = (i + 1 < static_cast<std::size_t>(taps))
? (i + 1)
: static_cast<std::size_t>(taps);
for (std::size_t k = 0; k < limit; ++k)
{
const float coeff = h[static_cast<int>(k)];
const std::size_t idx = i - k;
acc_re += coeff * re[idx];
acc_im += coeff * im[idx];
}
ore[i] = acc_re;
oim[i] = acc_im;
sum += static_cast<double>(acc_re) + static_cast<double>(acc_im);
}
// Vectorised interior
std::size_t i = start;
alignas(32) float tmp_re[8];
alignas(32) float tmp_im[8];
for (; i + 8 <= n; i += 8)
{
__m256 acc_re = _mm256_setzero_ps();
__m256 acc_im = _mm256_setzero_ps();
for (int k = 0; k < taps; ++k)
{
const float coeff = h[k];
const float *pre = re + i - k;
const float *pim = im + i - k;
__m256 vcoeff = _mm256_set1_ps(coeff);
__m256 vre = _mm256_loadu_ps(pre);
__m256 vim = _mm256_loadu_ps(pim);
acc_re = _mm256_add_ps(acc_re, _mm256_mul_ps(vcoeff, vre));
acc_im = _mm256_add_ps(acc_im, _mm256_mul_ps(vcoeff, vim));
}
_mm256_storeu_ps(ore + i, acc_re);
_mm256_storeu_ps(oim + i, acc_im);
_mm256_storeu_ps(tmp_re, acc_re);
_mm256_storeu_ps(tmp_im, acc_im);
for (int j = 0; j < 8; ++j)
sum += static_cast<double>(tmp_re[j]) + static_cast<double>(tmp_im[j]);
}
// Tail region: scalar
for (; i < n; ++i)
{
float acc_re = 0.0f;
float acc_im = 0.0f;
for (int k = 0; k < taps; ++k)
{
if (i < static_cast<std::size_t>(k))
break;
const std::size_t idx = i - static_cast<std::size_t>(k);
const float coeff = h[k];
acc_re += coeff * re[idx];
acc_im += coeff * im[idx];
}
ore[i] = acc_re;
oim[i] = acc_im;
sum += static_cast<double>(acc_re) + static_cast<double>(acc_im);
}
return sum;
}
// -----------------------------------------------------------------------------
// Workload 4: Soft clip / limiter on FIR output
// -----------------------------------------------------------------------------
// Soft-clips one sample against +/- threshold.
//   x <= -threshold  -> -threshold
//   x >=  threshold  ->  threshold
//   otherwise        ->  x - x^3 / threshold^2
// The lower bound is tested first.
static inline float soft_clip_scalar_sample(float x, float threshold)
{
    const float lower = -threshold;
    if (x <= lower)
        return lower;
    if (x >= threshold)
        return threshold;

    const float cube = (x * x) * x;
    return x - cube / (threshold * threshold);
}
// Applies soft_clip_scalar_sample to every element of `in`, writes the results
// to `out`, and returns their double-precision sum.
// The element count comes from in, so out must hold at least in.size()
// elements.
static double soft_clip_scalar(const std::vector<f32> &in,
                               std::vector<f32> &out,
                               float threshold)
{
    double total = 0.0;
    const std::size_t count = in.size();
    std::size_t pos = 0;
    while (pos != count)
    {
        const float clipped = soft_clip_scalar_sample(in[pos], threshold);
        out[pos] = clipped;
        total += static_cast<double>(clipped);
        ++pos;
    }
    return total;
}
// AVX soft clip: applies the same per-sample rule as soft_clip_scalar_sample
// to every element of `in`, eight at a time, writes the results to `out`, and
// returns their sum.
//   x <= -threshold  -> -threshold
//   x >=  threshold  ->  threshold
//   otherwise        ->  x - x^3 / threshold^2
// Full blocks are accumulated per lane in single precision; lane totals and
// leftover elements are then summed in double precision.
// The element count comes from in, so out must hold at least in.size()
// elements.
static double soft_clip_avx(const std::vector<f32> &in,
                            std::vector<f32> &out,
                            float threshold)
{
    const std::size_t n = in.size();
    const float *pin = in.data();
    float *pout = out.data();

    const __m256 vth = _mm256_set1_ps(threshold);
    const __m256 vneg = _mm256_set1_ps(-threshold);
    const __m256 vt2 = _mm256_set1_ps(threshold * threshold);
    __m256 vacc = _mm256_setzero_ps();

    std::size_t i = 0;
    for (; i + 8 <= n; i += 8)
    {
        const __m256 x = _mm256_loadu_ps(pin + i);

        // Cubic branch, evaluated on every lane.
        const __m256 x2 = _mm256_mul_ps(x, x);
        const __m256 x3 = _mm256_mul_ps(x2, x);
        __m256 y = _mm256_sub_ps(x, _mm256_div_ps(x3, vt2));

        // Saturated lanes must yield +/- threshold, exactly like the scalar
        // helper used for the tail below. Evaluating the cubic on a clamped
        // input instead would give t - t^3/t^2 = 0 on those lanes.
        // Ordered compares are false for NaN, matching the scalar `<=`/`>=`.
        const __m256 at_or_above = _mm256_cmp_ps(x, vth, _CMP_GE_OQ);
        const __m256 at_or_below = _mm256_cmp_ps(x, vneg, _CMP_LE_OQ);
        y = _mm256_blendv_ps(y, vth, at_or_above);
        // Applied last so the lower bound wins, as in the scalar helper,
        // which tests it first.
        y = _mm256_blendv_ps(y, vneg, at_or_below);

        _mm256_storeu_ps(pout + i, y);
        vacc = _mm256_add_ps(vacc, y);
    }

    // Spill the per-lane totals and reduce them in lane order.
    alignas(32) float tmp[8];
    _mm256_storeu_ps(tmp, vacc);
    double sum = 0.0;
    for (int j = 0; j < 8; ++j)
        sum += static_cast<double>(tmp[j]);

    // Scalar remainder (fewer than eight elements).
    for (; i < n; ++i)
    {
        const float y = soft_clip_scalar_sample(pin[i], threshold);
        pout[i] = y;
        sum += static_cast<double>(y);
    }
    return sum;
}
// -----------------------------------------------------------------------------
// main()
// -----------------------------------------------------------------------------
int main()
{
// -------------------------------------------------------------------------
// Workload 1: SAXPY + cosine similarity
// -------------------------------------------------------------------------
std::vector<f32> x(N_SAXPY);
std::vector<f32> y_scalar(N_SAXPY);
std::vector<f32> y_avx(N_SAXPY);
init_saxpy_vectors(x, y_scalar);
y_avx = y_scalar;
const float alpha = 0.5f;
double t_saxpy_scalar = 0.0;
double t_saxpy_avx = 0.0;
double t_cos_scalar = 0.0;
double t_cos_avx = 0.0;
double saxpy_scalar_sum = 0.0;
double saxpy_avx_sum = 0.0;
double cos_scalar_val = 0.0;
double cos_avx_val = 0.0;
std::cout << "=== Workload 1: SAXPY + cosine similarity ===\n";
{
ScopedTimer timer("saxpy_scalar", t_saxpy_scalar);
saxpy_scalar_sum = saxpy_scalar(y_scalar, x, alpha);
}
{
ScopedTimer timer("saxpy_avx", t_saxpy_avx);
saxpy_avx_sum = saxpy_avx(y_avx, x, alpha);
}
{
ScopedTimer timer("cosine_scalar", t_cos_scalar);
cos_scalar_val = cosine_scalar(x, y_scalar);
}
{
ScopedTimer timer("cosine_avx", t_cos_avx);
cos_avx_val = cosine_avx(x, y_avx);
}
std::cout << "SAXPY scalar: checksum=" << saxpy_scalar_sum
<< " time=" << t_saxpy_scalar << " ms\n";
std::cout << "SAXPY AVX : checksum=" << saxpy_avx_sum
<< " time=" << t_saxpy_avx << " ms\n";
std::cout << "Cosine scalar: value=" << cos_scalar_val
<< " time=" << t_cos_scalar << " ms\n";
std::cout << "Cosine AVX : value=" << cos_avx_val
<< " time=" << t_cos_avx << " ms\n";
std::cout << "--------------------------------------------------------\n\n";
// -------------------------------------------------------------------------
// Workload 2: 2D 5-point blur on 1080p image
// -------------------------------------------------------------------------
std::vector<f32> img_src(N_PIXELS);
std::vector<f32> img_blur_scalar(N_PIXELS);
std::vector<f32> img_blur_avx(N_PIXELS);
init_image(img_src, WIDTH, HEIGHT);
double t_blur_scalar = 0.0;
double t_blur_avx = 0.0;
double blur_scalar_sum = 0.0;
double blur_avx_sum = 0.0;
std::cout << "=== Workload 2: 2D 5-point blur on 1080p image ===\n";
{
ScopedTimer timer("blur_scalar", t_blur_scalar);
blur_scalar_sum = blur5_scalar(img_src, img_blur_scalar, WIDTH, HEIGHT);
}
{
ScopedTimer timer("blur_avx", t_blur_avx);
blur_avx_sum = blur5_avx(img_src, img_blur_avx, WIDTH, HEIGHT);
}
std::cout << "Blur scalar: checksum=" << blur_scalar_sum
<< " time=" << t_blur_scalar << " ms\n";
std::cout << "Blur AVX : checksum=" << blur_avx_sum
<< " time=" << t_blur_avx << " ms\n";
const double blur_delta = blur_avx_sum - blur_scalar_sum;
std::cout << "Checksum delta (AVX - scalar): " << blur_delta << "\n";
std::cout << "--------------------------------------------------------\n\n";
// -------------------------------------------------------------------------
// Workload 3: Complex multiply + FIR convolution
// -------------------------------------------------------------------------
ComplexSoA a(N_COMPLEX);
ComplexSoA b(N_COMPLEX);
ComplexSoA cmul_scalar(N_COMPLEX);
ComplexSoA cmul_avx(N_COMPLEX);
ComplexSoA fir_scalar(N_COMPLEX);
ComplexSoA fir_avx(N_COMPLEX);
fill_complex(a, 0x1234abcdU);
fill_complex(b, 0x9876fedcU);
const std::vector<f32> fir_kernel = make_fir_kernel();
double t_cmul_scalar = 0.0;
double t_cmul_avx = 0.0;
double t_fir_scalar = 0.0;
double t_fir_avx = 0.0;
double cmul_scalar_sum = 0.0;
double cmul_avx_sum = 0.0;
double fir_scalar_sum = 0.0;
double fir_avx_sum = 0.0;
std::cout << "=== Workload 3: Complex multiply + FIR convolution ===\n";
{
ScopedTimer timer("complex_mul_scalar", t_cmul_scalar);
cmul_scalar_sum = complex_mul_scalar(a, b, cmul_scalar);
}
{
ScopedTimer timer("complex_mul_avx", t_cmul_avx);
cmul_avx_sum = complex_mul_avx(a, b, cmul_avx);
}
{
ScopedTimer timer("complex_fir_scalar", t_fir_scalar);
fir_scalar_sum = complex_fir_scalar(cmul_scalar, fir_kernel, fir_scalar);
}
{
ScopedTimer timer("complex_fir_avx", t_fir_avx);
fir_avx_sum = complex_fir_avx(cmul_avx, fir_kernel, fir_avx);
}
std::cout << "Complex mul scalar: checksum=" << cmul_scalar_sum
<< " time=" << t_cmul_scalar << " ms\n";
std::cout << "Complex mul AVX : checksum=" << cmul_avx_sum
<< " time=" << t_cmul_avx << " ms\n";
std::cout << "FIR scalar : checksum=" << fir_scalar_sum
<< " time=" << t_fir_scalar << " ms\n";
std::cout << "FIR AVX : checksum=" << fir_avx_sum
<< " time=" << t_fir_avx << " ms\n";
const double cmul_delta = cmul_avx_sum - cmul_scalar_sum;
const double fir_delta = fir_avx_sum - fir_scalar_sum;
std::cout << "Delta cmul checksum (AVX - scalar): " << cmul_delta << "\n";
std::cout << "Delta FIR checksum (AVX - scalar): " << fir_delta << "\n";
std::cout << "--------------------------------------------------------\n\n";
// -------------------------------------------------------------------------
// Workload 4: Soft clip / limiter on FIR output
// -------------------------------------------------------------------------
std::vector<f32> clip_in_scalar(N_COMPLEX);
std::vector<f32> clip_in_avx(N_COMPLEX);
std::vector<f32> clip_out_scalar(N_COMPLEX);
std::vector<f32> clip_out_avx(N_COMPLEX);
// Use magnitude of FIR output as input to limiter
for (std::size_t i = 0; i < N_COMPLEX; ++i)
{
const float rs = fir_scalar.re[i];
const float is = fir_scalar.im[i];
const float ra = fir_avx.re[i];
const float ia = fir_avx.im[i];
clip_in_scalar[i] = std::sqrt(rs * rs + is * is);
clip_in_avx[i] = std::sqrt(ra * ra + ia * ia);
}
const float clip_threshold = 1.0f;
double t_clip_scalar = 0.0;
double t_clip_avx = 0.0;
double clip_scalar_sum = 0.0;
double clip_avx_sum = 0.0;
std::cout << "=== Workload 4: Soft clip / limiter on FIR output ===\n";
{
ScopedTimer timer("soft_clip_scalar", t_clip_scalar);
clip_scalar_sum = soft_clip_scalar(clip_in_scalar, clip_out_scalar, clip_threshold);
}
{
ScopedTimer timer("soft_clip_avx", t_clip_avx);
clip_avx_sum = soft_clip_avx(clip_in_avx, clip_out_avx, clip_threshold);
}
std::cout << "Soft clip scalar: checksum=" << clip_scalar_sum
<< " time=" << t_clip_scalar << " ms\n";
std::cout << "Soft clip AVX : checksum=" << clip_avx_sum
<< " time=" << t_clip_avx << " ms\n";
const double clip_delta = clip_avx_sum - clip_scalar_sum;
std::cout << "Delta clip checksum (AVX - scalar): " << clip_delta << "\n";
std::cout << "\nDone.\n";
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment