Skip to content

Instantly share code, notes, and snippets.

@realdomdom
Last active December 29, 2025 20:51
Show Gist options
  • Select an option

  • Save realdomdom/11f32058326ad6ece35b19aa76b68dfa to your computer and use it in GitHub Desktop.

Select an option

Save realdomdom/11f32058326ad6ece35b19aa76b68dfa to your computer and use it in GitHub Desktop.
ENI vibe coded AVX2 emulation
#include "avx2emu.h"
#include <Windows.h>
#include <intrin.h>
#include <immintrin.h>
#include <cstdint>
#include <cstdio>
#include <Zydis/Zydis.h>
// ============================================
// Globals
// ============================================
// Opaque handle slot, initially null. NOTE(review): presumably holds the registered
// exception-handler handle; the code that assigns it is outside this view.
static PVOID g_handlerHandle = nullptr;
// Zydis decoder instance shared by the file. NOTE(review): initialisation is not visible here.
static ZydisDecoder g_decoder;
// Starts false. NOTE(review): presumably set once setup has completed; confirm at the writer.
static bool g_initialized = false;
// Log stream used by Log(); opened lazily on the first Log() call and never closed in this view.
static FILE* g_logFile = nullptr;
// Statistics counters, both starting at zero. NOTE(review): the updating code is outside
// this view; the names suggest "instructions emulated" and "instructions not handled".
static volatile LONG64 g_emulatedCount = 0;
static volatile LONG64 g_missedCount = 0;
// ============================================
// Logging
// ============================================
// printf-style logger.
// On first use (or after a failed open) tries to create/truncate "avx2emu.log";
// if the file still cannot be opened the message is silently dropped.
// Every message is flushed immediately so the log survives a crash.
static void Log(const char* fmt, ...) {
    if (g_logFile == nullptr) {
        fopen_s(&g_logFile, "avx2emu.log", "w");
    }
    if (g_logFile == nullptr) {
        return;
    }

    va_list argList;
    va_start(argList, fmt);
    vfprintf(g_logFile, fmt, argList);
    va_end(argList);

    fflush(g_logFile);
}
// ============================================
// YMM Register Access
// ============================================
// The FX 6300 supports AVX1, so YMM registers exist in CONTEXT
// We need CONTEXT_XSTATE to access them properly
// Fallback definition for SDK headers that do not provide XSTATE_MASK_AVX:
// a 64-bit mask with only bit 2 set.
#ifndef XSTATE_MASK_AVX
#define XSTATE_MASK_AVX (1ULL << 2)
#endif
// Two 128-bit halves laid out back to back (32 bytes total, 16-byte aligned).
// NOTE(review): Microsoft's XState sample treats the XSTATE_AVX area as an array of
// 16-byte M128A entries (upper halves only); confirm before using this 32-byte struct
// as the element stride when indexing that area.
typedef struct DECLSPEC_ALIGN(16) _M256 {
__m128 lo;
__m128 hi;
} M256;
// Read the full 256-bit value of YMM<reg> out of a thread CONTEXT.
//
// The low 128 bits (the XMM part) live in the legacy FXSAVE image
// (ctx->FltSave.XmmRegisters); the high 128 bits live in the XSTATE AVX
// component, which is an array of sixteen 16-byte entries, one per register.
//
// Parameters:
//   ctx - context captured with CONTEXT_XSTATE; must not be null.
//   reg - register number, 0..15.
//   out - receives the combined value; must not be null.
// Returns false (leaving *out untouched) when an argument is invalid or the
// context carries no AVX state area.
static bool GetYmmRegister(CONTEXT* ctx, int reg, __m256i* out) {
    if (!ctx || !out || reg < 0 || reg > 15) return false;

    DWORD featureLength = 0;
    const M128A* ymmHigh =
        (const M128A*)LocateXStateFeature(ctx, XSTATE_AVX, &featureLength);
    if (!ymmHigh || featureLength < (DWORD)((reg + 1) * sizeof(M128A))) {
        // No XSTATE in this context (or an unexpectedly short area).
        return false;
    }

    const __m128i lo =
        _mm_loadu_si128((const __m128i*)&ctx->FltSave.XmmRegisters[reg]);

    // If the AVX bit is clear in the feature mask the component is in its
    // initial (all-zero) state and the bytes in the save area are stale.
    __m128i hi = _mm_setzero_si128();
    DWORD64 featureMask = 0;
    const bool avxInInitState =
        GetXStateFeaturesMask(ctx, &featureMask) &&
        (featureMask & XSTATE_MASK_AVX) == 0;
    if (!avxInInitState) {
        // 16-byte stride: entry <reg> is the upper half of YMM<reg>.
        hi = _mm_loadu_si128((const __m128i*)&ymmHigh[reg]);
    }

    *out = _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
    return true;
}
// Write a full 256-bit value into YMM<reg> of a thread CONTEXT.
//
// The low 128 bits go to the legacy FXSAVE image (ctx->FltSave.XmmRegisters),
// the high 128 bits go to entry <reg> of the XSTATE AVX component (sixteen
// 16-byte entries). The AVX bit is then set in the context's feature mask so
// the upper halves are restored when the context is resumed.
//
// Parameters:
//   ctx - context captured with CONTEXT_XSTATE; must not be null.
//   reg - register number, 0..15.
//   val - value to store.
// Returns false when an argument is invalid, the context carries no AVX
// state area, or the feature mask could not be updated.
static bool SetYmmRegister(CONTEXT* ctx, int reg, __m256i val) {
    if (!ctx || reg < 0 || reg > 15) return false;

    DWORD featureLength = 0;
    M128A* ymmHigh = (M128A*)LocateXStateFeature(ctx, XSTATE_AVX, &featureLength);
    if (!ymmHigh || featureLength < (DWORD)((reg + 1) * sizeof(M128A))) {
        return false;
    }

    // If AVX state was in its initial configuration the area holds stale
    // bytes; clear it first so enabling the bit below does not resurrect
    // garbage in the other fifteen registers.
    DWORD64 featureMask = 0;
    const bool haveMask = GetXStateFeaturesMask(ctx, &featureMask) != FALSE;
    if (haveMask && (featureMask & XSTATE_MASK_AVX) == 0) {
        ZeroMemory(ymmHigh, featureLength);
    }

    _mm_storeu_si128((__m128i*)&ctx->FltSave.XmmRegisters[reg],
                     _mm256_castsi256_si128(val));
    // 16-byte stride: entry <reg> is the upper half of YMM<reg>.
    _mm_storeu_si128((__m128i*)&ymmHigh[reg], _mm256_extractf128_si256(val, 1));

    // Keep whatever features were already present and add AVX.
    return SetXStateFeaturesMask(ctx, featureMask | XSTATE_MASK_AVX) != FALSE;
}
// Map a Zydis XMM0..XMM15 or YMM0..YMM15 register id to its number (0..15).
// Any other register yields -1.
static int ZydisRegToIndex(ZydisRegister reg) {
    const bool isYmm = (reg >= ZYDIS_REGISTER_YMM0) && (reg <= ZYDIS_REGISTER_YMM15);
    const bool isXmm = (reg >= ZYDIS_REGISTER_XMM0) && (reg <= ZYDIS_REGISTER_XMM15);

    if (isYmm) {
        return reg - ZYDIS_REGISTER_YMM0;
    }
    return isXmm ? (reg - ZYDIS_REGISTER_XMM0) : -1;
}
// ============================================
// GPR Access
// ============================================
// Read the value of a general-purpose register from a thread CONTEXT.
//
// Accepts the 64-, 32-, 16- and low-8-bit names of RAX..R15. The result is
// zero-extended from the register's own width, so e.g. EAX yields only the
// low 32 bits of RAX (this matters for 32-bit addressing, where the upper
// half of the 64-bit register must not leak into the effective address).
// Registers that are not listed (including AH/BH/CH/DH, RIP, vector and
// segment registers) yield 0.
static uint64_t GetGPR(CONTEXT* ctx, ZydisRegister reg) {
    uint64_t full = 0;
    switch (reg) {
    case ZYDIS_REGISTER_RAX: case ZYDIS_REGISTER_EAX: case ZYDIS_REGISTER_AX: case ZYDIS_REGISTER_AL: full = ctx->Rax; break;
    case ZYDIS_REGISTER_RBX: case ZYDIS_REGISTER_EBX: case ZYDIS_REGISTER_BX: case ZYDIS_REGISTER_BL: full = ctx->Rbx; break;
    case ZYDIS_REGISTER_RCX: case ZYDIS_REGISTER_ECX: case ZYDIS_REGISTER_CX: case ZYDIS_REGISTER_CL: full = ctx->Rcx; break;
    case ZYDIS_REGISTER_RDX: case ZYDIS_REGISTER_EDX: case ZYDIS_REGISTER_DX: case ZYDIS_REGISTER_DL: full = ctx->Rdx; break;
    case ZYDIS_REGISTER_RSI: case ZYDIS_REGISTER_ESI: case ZYDIS_REGISTER_SI: case ZYDIS_REGISTER_SIL: full = ctx->Rsi; break;
    case ZYDIS_REGISTER_RDI: case ZYDIS_REGISTER_EDI: case ZYDIS_REGISTER_DI: case ZYDIS_REGISTER_DIL: full = ctx->Rdi; break;
    case ZYDIS_REGISTER_RBP: case ZYDIS_REGISTER_EBP: case ZYDIS_REGISTER_BP: case ZYDIS_REGISTER_BPL: full = ctx->Rbp; break;
    case ZYDIS_REGISTER_RSP: case ZYDIS_REGISTER_ESP: case ZYDIS_REGISTER_SP: case ZYDIS_REGISTER_SPL: full = ctx->Rsp; break;
    case ZYDIS_REGISTER_R8: case ZYDIS_REGISTER_R8D: case ZYDIS_REGISTER_R8W: case ZYDIS_REGISTER_R8B: full = ctx->R8; break;
    case ZYDIS_REGISTER_R9: case ZYDIS_REGISTER_R9D: case ZYDIS_REGISTER_R9W: case ZYDIS_REGISTER_R9B: full = ctx->R9; break;
    case ZYDIS_REGISTER_R10: case ZYDIS_REGISTER_R10D: case ZYDIS_REGISTER_R10W: case ZYDIS_REGISTER_R10B: full = ctx->R10; break;
    case ZYDIS_REGISTER_R11: case ZYDIS_REGISTER_R11D: case ZYDIS_REGISTER_R11W: case ZYDIS_REGISTER_R11B: full = ctx->R11; break;
    case ZYDIS_REGISTER_R12: case ZYDIS_REGISTER_R12D: case ZYDIS_REGISTER_R12W: case ZYDIS_REGISTER_R12B: full = ctx->R12; break;
    case ZYDIS_REGISTER_R13: case ZYDIS_REGISTER_R13D: case ZYDIS_REGISTER_R13W: case ZYDIS_REGISTER_R13B: full = ctx->R13; break;
    case ZYDIS_REGISTER_R14: case ZYDIS_REGISTER_R14D: case ZYDIS_REGISTER_R14W: case ZYDIS_REGISTER_R14B: full = ctx->R14; break;
    case ZYDIS_REGISTER_R15: case ZYDIS_REGISTER_R15D: case ZYDIS_REGISTER_R15W: case ZYDIS_REGISTER_R15B: full = ctx->R15; break;
    default: return 0;
    }

    // Truncate to the width of the register name that was actually requested.
    switch (ZydisRegisterGetClass(reg)) {
    case ZYDIS_REGCLASS_GPR8:  return full & 0xFFull;
    case ZYDIS_REGCLASS_GPR16: return full & 0xFFFFull;
    case ZYDIS_REGCLASS_GPR32: return full & 0xFFFFFFFFull;
    default:                   return full;
    }
}
// ============================================
// Memory Operand Resolution
// ============================================
// Compute the address referenced by a decoded memory operand:
//   base + index * scale + displacement
// A RIP base is taken as the address of the next instruction
// (ctx->Rip + instr->length). Base/index registers are read through GetGPR,
// which returns 0 for anything that is not a general-purpose register, so a
// vector index register contributes nothing here.
// The result is returned as a raw pointer; nothing is dereferenced.
static void* ResolveMemory(CONTEXT* ctx, const ZydisDecodedOperand* op,
                           const ZydisDecodedInstruction* instr) {
    const ZydisRegister baseReg = op->mem.base;
    const ZydisRegister indexReg = op->mem.index;

    uint64_t effective = 0;
    if (baseReg == ZYDIS_REGISTER_RIP) {
        effective = ctx->Rip + instr->length;
    } else if (baseReg != ZYDIS_REGISTER_NONE) {
        effective = GetGPR(ctx, baseReg);
    }

    if (indexReg != ZYDIS_REGISTER_NONE) {
        const uint64_t indexValue = GetGPR(ctx, indexReg);
        effective += indexValue * op->mem.scale;
    }

    if (op->mem.disp.has_displacement) {
        effective += op->mem.disp.value;
    }

    return (void*)effective;
}
// ============================================
// AVX2 Instruction Emulation
// Adapted from sw-simd by Anna Henningsen
// ============================================
// Helpers for running a 256-bit integer operation as two 128-bit operations.
//
// SPLIT_256(v, lo, hi): declares two new __m128i locals named `lo` and `hi`
// holding bits 0-127 and bits 128-255 of `v`. It expands to two declarations,
// so it must be used as a statement in a scope where both names are unused,
// and `v` is expanded twice.
#define SPLIT_256(v, lo, hi) \
__m128i lo = _mm256_castsi256_si128(v); \
__m128i hi = _mm256_extractf128_si256(v, 1)
// MERGE_256(lo, hi): expression yielding a __m256i with `lo` in bits 0-127
// and `hi` in bits 128-255.
#define MERGE_256(lo, hi) \
_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1)
// --- VPADDB/W/D/Q 256-bit ---
// 256-bit VPADDB: per-byte add, performed separately on each 128-bit half.
static __m256i emu_vpaddb(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_add_epi8(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_add_epi8(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPADDW: per-16-bit-lane add, performed separately on each 128-bit half.
static __m256i emu_vpaddw(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_add_epi16(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_add_epi16(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPADDD: per-32-bit-lane add, performed separately on each 128-bit half.
static __m256i emu_vpaddd(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPADDQ: per-64-bit-lane add, performed separately on each 128-bit half.
static __m256i emu_vpaddq(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_add_epi64(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_add_epi64(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// --- VPSUBB/W/D/Q 256-bit ---
// 256-bit VPSUBB: per-byte a - b, performed separately on each 128-bit half.
static __m256i emu_vpsubb(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_sub_epi8(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_sub_epi8(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPSUBW: per-16-bit-lane a - b, performed separately on each 128-bit half.
static __m256i emu_vpsubw(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_sub_epi16(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_sub_epi16(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPSUBD: per-32-bit-lane a - b, performed separately on each 128-bit half.
static __m256i emu_vpsubd(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_sub_epi32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPSUBQ: per-64-bit-lane a - b, performed separately on each 128-bit half.
static __m256i emu_vpsubq(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_sub_epi64(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_sub_epi64(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// --- Bitwise 256-bit ---
// 256-bit VPAND: bitwise a & b, computed as two 128-bit ANDs.
static __m256i emu_vpand(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_and_si128(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_and_si128(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPOR: bitwise a | b, computed as two 128-bit ORs.
static __m256i emu_vpor(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_or_si128(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_or_si128(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPXOR: bitwise a ^ b, computed as two 128-bit XORs.
static __m256i emu_vpxor(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_xor_si128(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_xor_si128(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPANDN: (~a) & b, computed as two 128-bit and-not operations.
// Note the operand order: the FIRST argument is the one that gets inverted.
static __m256i emu_vpandn(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_andnot_si128(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_andnot_si128(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// --- Compare 256-bit ---
// 256-bit VPCMPEQB: each byte lane becomes all-ones where a == b, else zero.
static __m256i emu_vpcmpeqb(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_cmpeq_epi8(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_cmpeq_epi8(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPCMPEQW: each 16-bit lane becomes all-ones where a == b, else zero.
static __m256i emu_vpcmpeqw(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_cmpeq_epi16(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_cmpeq_epi16(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPCMPEQD: each 32-bit lane becomes all-ones where a == b, else zero.
static __m256i emu_vpcmpeqd(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_cmpeq_epi32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPCMPEQQ: each 64-bit lane becomes all-ones where a == b, else zero.
static __m256i emu_vpcmpeqq(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_cmpeq_epi64(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_cmpeq_epi64(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPCMPGTB: each byte lane becomes all-ones where a > b (signed), else zero.
static __m256i emu_vpcmpgtb(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_cmpgt_epi8(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_cmpgt_epi8(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPCMPGTW: each 16-bit lane becomes all-ones where a > b (signed), else zero.
static __m256i emu_vpcmpgtw(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_cmpgt_epi16(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_cmpgt_epi16(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPCMPGTD: each 32-bit lane becomes all-ones where a > b (signed), else zero.
static __m256i emu_vpcmpgtd(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_cmpgt_epi32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPCMPGTQ: each 64-bit lane becomes all-ones where a > b (signed), else zero.
static __m256i emu_vpcmpgtq(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_cmpgt_epi64(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_cmpgt_epi64(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// --- Shifts (uniform count) 256-bit ---
// 256-bit VPSLLW: logical left shift of every 16-bit lane by the single count in `count`.
static __m256i emu_vpsllw(__m256i a, __m128i count) {
    const __m128i lowHalf = _mm_sll_epi16(_mm256_castsi256_si128(a), count);
    const __m128i highHalf = _mm_sll_epi16(_mm256_extractf128_si256(a, 1), count);
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPSLLD: logical left shift of every 32-bit lane by the single count in `count`.
static __m256i emu_vpslld(__m256i a, __m128i count) {
    const __m128i lowHalf = _mm_sll_epi32(_mm256_castsi256_si128(a), count);
    const __m128i highHalf = _mm_sll_epi32(_mm256_extractf128_si256(a, 1), count);
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPSLLQ: logical left shift of every 64-bit lane by the single count in `count`.
static __m256i emu_vpsllq(__m256i a, __m128i count) {
    const __m128i lowHalf = _mm_sll_epi64(_mm256_castsi256_si128(a), count);
    const __m128i highHalf = _mm_sll_epi64(_mm256_extractf128_si256(a, 1), count);
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPSRLW: logical right shift of every 16-bit lane by the single count in `count`.
static __m256i emu_vpsrlw(__m256i a, __m128i count) {
    const __m128i lowHalf = _mm_srl_epi16(_mm256_castsi256_si128(a), count);
    const __m128i highHalf = _mm_srl_epi16(_mm256_extractf128_si256(a, 1), count);
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPSRLD: logical right shift of every 32-bit lane by the single count in `count`.
static __m256i emu_vpsrld(__m256i a, __m128i count) {
    const __m128i lowHalf = _mm_srl_epi32(_mm256_castsi256_si128(a), count);
    const __m128i highHalf = _mm_srl_epi32(_mm256_extractf128_si256(a, 1), count);
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPSRLQ: logical right shift of every 64-bit lane by the single count in `count`.
static __m256i emu_vpsrlq(__m256i a, __m128i count) {
    const __m128i lowHalf = _mm_srl_epi64(_mm256_castsi256_si128(a), count);
    const __m128i highHalf = _mm_srl_epi64(_mm256_extractf128_si256(a, 1), count);
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPSRAW: arithmetic right shift of every 16-bit lane by the single count in `count`.
static __m256i emu_vpsraw(__m256i a, __m128i count) {
    const __m128i lowHalf = _mm_sra_epi16(_mm256_castsi256_si128(a), count);
    const __m128i highHalf = _mm_sra_epi16(_mm256_extractf128_si256(a, 1), count);
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPSRAD: arithmetic right shift of every 32-bit lane by the single count in `count`.
static __m256i emu_vpsrad(__m256i a, __m128i count) {
    const __m128i lowHalf = _mm_sra_epi32(_mm256_castsi256_si128(a), count);
    const __m128i highHalf = _mm_sra_epi32(_mm256_extractf128_si256(a, 1), count);
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// --- Variable shifts (AVX2-only) ---
static __m256i emu_vpsllvd(__m256i a, __m256i count) {
uint32_t* s = (uint32_t*)&a;
uint32_t* c = (uint32_t*)&count;
uint32_t result[8];
for (int i = 0; i < 8; i++) {
result[i] = (c[i] < 32) ? (s[i] << c[i]) : 0;
}
return _mm256_loadu_si256((__m256i*)result);
}
static __m256i emu_vpsllvq(__m256i a, __m256i count) {
uint64_t* s = (uint64_t*)&a;
uint64_t* c = (uint64_t*)&count;
uint64_t result[4];
for (int i = 0; i < 4; i++) {
result[i] = (c[i] < 64) ? (s[i] << c[i]) : 0;
}
return _mm256_loadu_si256((__m256i*)result);
}
static __m256i emu_vpsrlvd(__m256i a, __m256i count) {
uint32_t* s = (uint32_t*)&a;
uint32_t* c = (uint32_t*)&count;
uint32_t result[8];
for (int i = 0; i < 8; i++) {
result[i] = (c[i] < 32) ? (s[i] >> c[i]) : 0;
}
return _mm256_loadu_si256((__m256i*)result);
}
static __m256i emu_vpsrlvq(__m256i a, __m256i count) {
uint64_t* s = (uint64_t*)&a;
uint64_t* c = (uint64_t*)&count;
uint64_t result[4];
for (int i = 0; i < 4; i++) {
result[i] = (c[i] < 64) ? (s[i] >> c[i]) : 0;
}
return _mm256_loadu_si256((__m256i*)result);
}
static __m256i emu_vpsravd(__m256i a, __m256i count) {
int32_t* s = (int32_t*)&a;
uint32_t* c = (uint32_t*)&count;
int32_t result[8];
for (int i = 0; i < 8; i++) {
result[i] = (c[i] < 32) ? (s[i] >> c[i]) : (s[i] >> 31);
}
return _mm256_loadu_si256((__m256i*)result);
}
// --- Permute (AVX2-only) ---
static __m256i emu_vpermd(__m256i idx, __m256i src) {
int32_t* s = (int32_t*)&src;
int32_t* i = (int32_t*)&idx;
int32_t result[8];
for (int j = 0; j < 8; j++) {
result[j] = s[i[j] & 7];
}
return _mm256_loadu_si256((__m256i*)result);
}
static __m256i emu_vpermq(__m256i src, uint8_t imm) {
int64_t* s = (int64_t*)&src;
int64_t result[4];
result[0] = s[(imm >> 0) & 3];
result[1] = s[(imm >> 2) & 3];
result[2] = s[(imm >> 4) & 3];
result[3] = s[(imm >> 6) & 3];
return _mm256_loadu_si256((__m256i*)result);
}
static __m256 emu_vpermps(__m256i idx, __m256 src) {
float* s = (float*)&src;
int32_t* i = (int32_t*)&idx;
float result[8];
for (int j = 0; j < 8; j++) {
result[j] = s[i[j] & 7];
}
return _mm256_loadu_ps(result);
}
static __m256d emu_vpermpd(__m256d src, uint8_t imm) {
double* s = (double*)&src;
double result[4];
result[0] = s[(imm >> 0) & 3];
result[1] = s[(imm >> 2) & 3];
result[2] = s[(imm >> 4) & 3];
result[3] = s[(imm >> 6) & 3];
return _mm256_loadu_pd(result);
}
// --- Broadcast (AVX2-only) ---
static __m256i emu_vpbroadcastb(int8_t val) {
int8_t result[32];
for (int i = 0; i < 32; i++) result[i] = val;
return _mm256_loadu_si256((__m256i*)result);
}
static __m256i emu_vpbroadcastw(int16_t val) {
int16_t result[16];
for (int i = 0; i < 16; i++) result[i] = val;
return _mm256_loadu_si256((__m256i*)result);
}
static __m256i emu_vpbroadcastd(int32_t val) {
int32_t result[8];
for (int i = 0; i < 8; i++) result[i] = val;
return _mm256_loadu_si256((__m256i*)result);
}
static __m256i emu_vpbroadcastq(int64_t val) {
int64_t result[4];
for (int i = 0; i < 4; i++) result[i] = val;
return _mm256_loadu_si256((__m256i*)result);
}
// --- Gather (AVX2-only) ---
static __m256i emu_vpgatherdd(void* base, __m256i indices, __m256i mask, int scale) {
int32_t* idx = (int32_t*)&indices;
int32_t* m = (int32_t*)&mask;
int32_t result[8] = {0};
for (int i = 0; i < 8; i++) {
if (m[i] & 0x80000000) {
int32_t* ptr = (int32_t*)((uint8_t*)base + (int64_t)idx[i] * scale);
result[i] = *ptr;
}
}
return _mm256_loadu_si256((__m256i*)result);
}
static __m256i emu_vpgatherdq(void* base, __m128i indices, __m256i mask, int scale) {
int32_t* idx = (int32_t*)&indices;
int64_t* m = (int64_t*)&mask;
int64_t result[4] = {0};
for (int i = 0; i < 4; i++) {
if (m[i] & 0x8000000000000000ULL) {
int64_t* ptr = (int64_t*)((uint8_t*)base + (int64_t)idx[i] * scale);
result[i] = *ptr;
}
}
return _mm256_loadu_si256((__m256i*)result);
}
static __m256i emu_vpgatherqd(void* base, __m256i indices, __m128i mask, int scale) {
int64_t* idx = (int64_t*)&indices;
int32_t* m = (int32_t*)&mask;
int32_t result[4] = {0};
for (int i = 0; i < 4; i++) {
if (m[i] & 0x80000000) {
int32_t* ptr = (int32_t*)((uint8_t*)base + idx[i] * scale);
result[i] = *ptr;
}
}
return _mm_loadu_si128((__m128i*)result);
}
static __m256i emu_vpgatherqq(void* base, __m256i indices, __m256i mask, int scale) {
int64_t* idx = (int64_t*)&indices;
int64_t* m = (int64_t*)&mask;
int64_t result[4] = {0};
for (int i = 0; i < 4; i++) {
if (m[i] & 0x8000000000000000ULL) {
int64_t* ptr = (int64_t*)((uint8_t*)base + idx[i] * scale);
result[i] = *ptr;
}
}
return _mm256_loadu_si256((__m256i*)result);
}
// --- Float gathers ---
static __m256 emu_vgatherdps(void* base, __m256i indices, __m256 mask, int scale) {
int32_t* idx = (int32_t*)&indices;
int32_t* m = (int32_t*)&mask;
float result[8] = {0};
for (int i = 0; i < 8; i++) {
if (m[i] & 0x80000000) {
float* ptr = (float*)((uint8_t*)base + (int64_t)idx[i] * scale);
result[i] = *ptr;
}
}
return _mm256_loadu_ps(result);
}
static __m256d emu_vgatherdpd(void* base, __m128i indices, __m256d mask, int scale) {
int32_t* idx = (int32_t*)&indices;
int64_t* m = (int64_t*)&mask;
double result[4] = {0};
for (int i = 0; i < 4; i++) {
if (m[i] & 0x8000000000000000ULL) {
double* ptr = (double*)((uint8_t*)base + (int64_t)idx[i] * scale);
result[i] = *ptr;
}
}
return _mm256_loadu_pd(result);
}
// --- Blend (VPBLENDD is AVX2) ---
static __m256i emu_vpblendd(__m256i a, __m256i b, uint8_t imm) {
int32_t* as = (int32_t*)&a;
int32_t* bs = (int32_t*)&b;
int32_t result[8];
for (int i = 0; i < 8; i++) {
result[i] = (imm & (1 << i)) ? bs[i] : as[i];
}
return _mm256_loadu_si256((__m256i*)result);
}
// --- Pack/Unpack 256-bit ---
// 256-bit VPUNPCKLBW: interleave the low bytes of a and b within each 128-bit half.
static __m256i emu_vpunpcklbw(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_unpacklo_epi8(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_unpacklo_epi8(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPUNPCKHBW: interleave the high bytes of a and b within each 128-bit half.
static __m256i emu_vpunpckhbw(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_unpackhi_epi8(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_unpackhi_epi8(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPUNPCKLWD: interleave the low 16-bit lanes of a and b within each 128-bit half.
static __m256i emu_vpunpcklwd(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_unpacklo_epi16(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_unpacklo_epi16(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPUNPCKHWD: interleave the high 16-bit lanes of a and b within each 128-bit half.
static __m256i emu_vpunpckhwd(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_unpackhi_epi16(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_unpackhi_epi16(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPUNPCKLDQ: interleave the low 32-bit lanes of a and b within each 128-bit half.
static __m256i emu_vpunpckldq(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_unpacklo_epi32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_unpacklo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPUNPCKHDQ: interleave the high 32-bit lanes of a and b within each 128-bit half.
static __m256i emu_vpunpckhdq(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_unpackhi_epi32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_unpackhi_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPUNPCKLQDQ: pair the low 64-bit lanes of a and b within each 128-bit half.
static __m256i emu_vpunpcklqdq(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_unpacklo_epi64(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_unpacklo_epi64(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPUNPCKHQDQ: pair the high 64-bit lanes of a and b within each 128-bit half.
static __m256i emu_vpunpckhqdq(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_unpackhi_epi64(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_unpackhi_epi64(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// --- Multiply ---
// 256-bit VPMULLD: per-32-bit-lane multiply keeping the low 32 bits of each product.
static __m256i emu_vpmulld(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_mullo_epi32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPMULLW: per-16-bit-lane multiply keeping the low 16 bits of each product.
static __m256i emu_vpmullw(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_mullo_epi16(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_mullo_epi16(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPMULHW: per-16-bit-lane signed multiply keeping the high 16 bits of each product.
static __m256i emu_vpmulhw(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_mulhi_epi16(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_mulhi_epi16(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPMULUDQ: unsigned multiply of the low 32 bits of each 64-bit lane,
// producing a full 64-bit product per lane.
static __m256i emu_vpmuludq(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_mul_epu32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_mul_epu32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// --- Insert/Extract 128 (VINSERTI128/VEXTRACTI128 are AVX2) ---
static __m256i emu_vinserti128(__m256i a, __m128i b, uint8_t imm) {
if (imm & 1) {
return _mm256_insertf128_si256(a, b, 1);
} else {
return _mm256_insertf128_si256(a, b, 0);
}
}
static __m128i emu_vextracti128(__m256i a, uint8_t imm) {
if (imm & 1) {
return _mm256_extractf128_si256(a, 1);
} else {
return _mm256_castsi256_si128(a);
}
}
// --- Min/Max 256-bit ---
// 256-bit VPMINSB: per-byte signed minimum, computed on each 128-bit half.
static __m256i emu_vpminsb(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_min_epi8(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_min_epi8(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPMINUB: per-byte unsigned minimum, computed on each 128-bit half.
static __m256i emu_vpminub(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_min_epu8(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_min_epu8(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPMINSW: per-16-bit-lane signed minimum, computed on each 128-bit half.
static __m256i emu_vpminsw(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_min_epi16(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_min_epi16(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPMINUW: per-16-bit-lane unsigned minimum, computed on each 128-bit half.
static __m256i emu_vpminuw(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_min_epu16(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_min_epu16(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPMINSD: per-32-bit-lane signed minimum, computed on each 128-bit half.
static __m256i emu_vpminsd(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_min_epi32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_min_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPMINUD: per-32-bit-lane unsigned minimum, computed on each 128-bit half.
static __m256i emu_vpminud(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_min_epu32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_min_epu32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPMAXSB: per-byte signed maximum, computed on each 128-bit half.
static __m256i emu_vpmaxsb(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_max_epi8(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_max_epi8(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPMAXUB: per-byte unsigned maximum, computed on each 128-bit half.
static __m256i emu_vpmaxub(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_max_epu8(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_max_epu8(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPMAXSW: per-16-bit-lane signed maximum, computed on each 128-bit half.
static __m256i emu_vpmaxsw(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_max_epi16(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_max_epi16(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPMAXUW: per-16-bit-lane unsigned maximum, computed on each 128-bit half.
static __m256i emu_vpmaxuw(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_max_epu16(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_max_epu16(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPMAXSD: per-32-bit-lane signed maximum, computed on each 128-bit half.
static __m256i emu_vpmaxsd(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_max_epi32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_max_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPMAXUD: per-32-bit-lane unsigned maximum, computed on each 128-bit half.
static __m256i emu_vpmaxud(__m256i a, __m256i b) {
    const __m128i lowHalf = _mm_max_epu32(_mm256_castsi256_si128(a), _mm256_castsi256_si128(b));
    const __m128i highHalf = _mm_max_epu32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// --- Shuffle ---
// 256-bit VPSHUFB: byte shuffle applied independently to each 128-bit half,
// each half of `a` being indexed by the corresponding half of `mask`.
static __m256i emu_vpshufb(__m256i a, __m256i mask) {
    const __m128i lowHalf = _mm_shuffle_epi8(_mm256_castsi256_si128(a), _mm256_castsi256_si128(mask));
    const __m128i highHalf = _mm_shuffle_epi8(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(mask, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPSHUFD: shuffle the four dwords inside each 128-bit half using the
// same 8-bit immediate for both halves. Output slot k of a half takes the
// dword selected by bits [2k+1:2k] of `imm` from that same half.
// Done in scalar code because the intrinsic needs a compile-time immediate.
static __m256i emu_vpshufd(__m256i a, uint8_t imm) {
    int32_t source[8];
    int32_t shuffled[8];
    _mm256_storeu_si256((__m256i*)source, a);
    for (int half = 0; half < 2; ++half) {
        const int first = half * 4;
        for (int slot = 0; slot < 4; ++slot) {
            const int pick = (imm >> (2 * slot)) & 3;
            shuffled[first + slot] = source[first + pick];
        }
    }
    return _mm256_loadu_si256((const __m256i*)shuffled);
}
// --- Abs 256-bit ---
// 256-bit VPABSB: per-byte absolute value, computed on each 128-bit half.
static __m256i emu_vpabsb(__m256i a) {
    const __m128i lowHalf = _mm_abs_epi8(_mm256_castsi256_si128(a));
    const __m128i highHalf = _mm_abs_epi8(_mm256_extractf128_si256(a, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPABSW: per-16-bit-lane absolute value, computed on each 128-bit half.
static __m256i emu_vpabsw(__m256i a) {
    const __m128i lowHalf = _mm_abs_epi16(_mm256_castsi256_si128(a));
    const __m128i highHalf = _mm_abs_epi16(_mm256_extractf128_si256(a, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// 256-bit VPABSD: per-32-bit-lane absolute value, computed on each 128-bit half.
static __m256i emu_vpabsd(__m256i a) {
    const __m128i lowHalf = _mm_abs_epi32(_mm256_castsi256_si128(a));
    const __m128i highHalf = _mm_abs_epi32(_mm256_extractf128_si256(a, 1));
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lowHalf), highHalf, 1);
}
// --- Sign extend 256-bit (VPMOVSXBD etc.) ---
// 256-bit VPMOVSXBW: sign-extend the 16 bytes of `a` to 16 words.
// Bytes 0-7 fill the low half; bytes 8-15 are shifted down and fill the high half.
static __m256i emu_vpmovsxbw(__m128i a) {
    const __m128i upperBytes = _mm_srli_si128(a, 8);
    const __m128i firstEight = _mm_cvtepi8_epi16(a);
    const __m128i lastEight = _mm_cvtepi8_epi16(upperBytes);
    return _mm256_insertf128_si256(_mm256_castsi128_si256(firstEight), lastEight, 1);
}
static __m256i emu_vpmovsxbd(__m128i a) {
// sign-extend 8 bytes to 8 dwords
int8_t* src = (int8_t*)&a;
int32_t result[8];
for (int i = 0; i < 8; i++) result[i] = src[i];
return _mm256_loadu_si256((__m256i*)result);
}
static __m256i emu_vpmovsxbq(__m128i a) {
int8_t* src = (int8_t*)&a;
int64_t result[4];
for (int i = 0; i < 4; i++) result[i] = src[i];
return _mm256_loadu_si256((__m256i*)result);
}
// 256-bit VPMOVSXWD: sign-extend the 8 words of `a` to 8 dwords.
// Words 0-3 fill the low half; words 4-7 are shifted down and fill the high half.
static __m256i emu_vpmovsxwd(__m128i a) {
    const __m128i upperWords = _mm_srli_si128(a, 8);
    const __m128i firstFour = _mm_cvtepi16_epi32(a);
    const __m128i lastFour = _mm_cvtepi16_epi32(upperWords);
    return _mm256_insertf128_si256(_mm256_castsi128_si256(firstFour), lastFour, 1);
}
static __m256i emu_vpmovsxwq(__m128i a) {
int16_t* src = (int16_t*)&a;
int64_t result[4];
for (int i = 0; i < 4; i++) result[i] = src[i];
return _mm256_loadu_si256((__m256i*)result);
}
// 256-bit VPMOVSXDQ: sign-extend the 4 dwords of `a` to 4 qwords.
// Dwords 0-1 fill the low half; dwords 2-3 are shifted down and fill the high half.
static __m256i emu_vpmovsxdq(__m128i a) {
    const __m128i upperDwords = _mm_srli_si128(a, 8);
    const __m128i firstTwo = _mm_cvtepi32_epi64(a);
    const __m128i lastTwo = _mm_cvtepi32_epi64(upperDwords);
    return _mm256_insertf128_si256(_mm256_castsi128_si256(firstTwo), lastTwo, 1);
}
// --- Zero extend 256-bit ---
// 256-bit VPMOVZXBW: zero-extend the 16 bytes of `a` to 16 words.
// Bytes 0-7 fill the low half; bytes 8-15 are shifted down and fill the high half.
static __m256i emu_vpmovzxbw(__m128i a) {
    const __m128i upperBytes = _mm_srli_si128(a, 8);
    const __m128i firstEight = _mm_cvtepu8_epi16(a);
    const __m128i lastEight = _mm_cvtepu8_epi16(upperBytes);
    return _mm256_insertf128_si256(_mm256_castsi128_si256(firstEight), lastEight, 1);
}
static __m256i emu_vpmovzxbd(__m128i a) {
uint8_t* src = (uint8_t*)&a;
uint32_t result[8];
for (int i = 0; i < 8; i++) result[i] = src[i];
return _mm256_loadu_si256((__m256i*)result);
}
static __m256i emu_vpmovzxbq(__m128i a) {
uint8_t* src = (uint8_t*)&a;
uint64_t result[4];
for (int i = 0; i < 4; i++) result[i] = src[i];
return _mm256_loadu_si256((__m256i*)result);
}
// 256-bit VPMOVZXWD: zero-extend the 8 words of `a` to 8 dwords.
// Words 0-3 fill the low half; words 4-7 are shifted down and fill the high half.
static __m256i emu_vpmovzxwd(__m128i a) {
    const __m128i upperWords = _mm_srli_si128(a, 8);
    const __m128i firstFour = _mm_cvtepu16_epi32(a);
    const __m128i lastFour = _mm_cvtepu16_epi32(upperWords);
    return _mm256_insertf128_si256(_mm256_castsi128_si256(firstFour), lastFour, 1);
}
static __m256i emu_vpmovzxwq(__m128i a) {
uint16_t* src = (uint16_t*)&a;
uint64_t result[4];
for (int i = 0; i < 4; i++) result[i] = src[i];
return _mm256_loadu_si256((__m256i*)result);
}
// 256-bit VPMOVZXDQ: zero-extend the 4 dwords of `a` to 4 qwords.
// Dwords 0-1 fill the low half; dwords 2-3 are shifted down and fill the high half.
static __m256i emu_vpmovzxdq(__m128i a) {
    const __m128i upperDwords = _mm_srli_si128(a, 8);
    const __m128i firstTwo = _mm_cvtepu32_epi64(a);
    const __m128i lastTwo = _mm_cvtepu32_epi64(upperDwords);
    return _mm256_insertf128_si256(_mm256_castsi128_si256(firstTwo), lastTwo, 1);
}
// ============================================
// Main dispatcher
// ============================================
static bool EmulateInstruction(CONTEXT* ctx, ZydisDecodedInstruction* instr,
ZydisDecodedOperand* operands) {
// Helper to fetch YMM operand (register or memory)
auto fetchYmm = [&](int opIdx) -> __m256i {
if (operands[opIdx].type == ZYDIS_OPERAND_TYPE_REGISTER) {
__m256i val;
GetYmmRegister(ctx, ZydisRegToIndex(operands[opIdx].reg.value), &val);
return val;
} else {
return _mm256_loadu_si256((__m256i*)ResolveMemory(ctx, &operands[opIdx], instr));
}
};
auto fetchXmm = [&](int opIdx) -> __m128i {
if (operands[opIdx].type == ZYDIS_OPERAND_TYPE_REGISTER) {
__m256i val;
GetYmmRegister(ctx, ZydisRegToIndex(operands[opIdx].reg.value), &val);
return _mm256_castsi256_si128(val);
} else {
return _mm_loadu_si128((__m128i*)ResolveMemory(ctx, &operands[opIdx], instr));
}
};
int dstIdx = ZydisRegToIndex(operands[0].reg.value);
__m256i result;
bool handled = false;
switch (instr->mnemonic) {
// 256-bit integer arithmetic
case ZYDIS_MNEMONIC_VPADDB:
if (instr->operand_width == 256) {
result = emu_vpaddb(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPADDW:
if (instr->operand_width == 256) {
result = emu_vpaddw(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPADDD:
if (instr->operand_width == 256) {
result = emu_vpaddd(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPADDQ:
if (instr->operand_width == 256) {
result = emu_vpaddq(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPSUBB:
if (instr->operand_width == 256) {
result = emu_vpsubb(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPSUBW:
if (instr->operand_width == 256) {
result = emu_vpsubw(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPSUBD:
if (instr->operand_width == 256) {
result = emu_vpsubd(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPSUBQ:
if (instr->operand_width == 256) {
result = emu_vpsubq(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
// Bitwise
case ZYDIS_MNEMONIC_VPAND:
if (instr->operand_width == 256) {
result = emu_vpand(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPOR:
if (instr->operand_width == 256) {
result = emu_vpor(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPXOR:
if (instr->operand_width == 256) {
result = emu_vpxor(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPANDN:
if (instr->operand_width == 256) {
result = emu_vpandn(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
// Compare
case ZYDIS_MNEMONIC_VPCMPEQB:
if (instr->operand_width == 256) {
result = emu_vpcmpeqb(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPCMPEQW:
if (instr->operand_width == 256) {
result = emu_vpcmpeqw(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPCMPEQD:
if (instr->operand_width == 256) {
result = emu_vpcmpeqd(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPCMPEQQ:
if (instr->operand_width == 256) {
result = emu_vpcmpeqq(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPCMPGTB:
if (instr->operand_width == 256) {
result = emu_vpcmpgtb(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPCMPGTW:
if (instr->operand_width == 256) {
result = emu_vpcmpgtw(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPCMPGTD:
if (instr->operand_width == 256) {
result = emu_vpcmpgtd(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPCMPGTQ:
if (instr->operand_width == 256) {
result = emu_vpcmpgtq(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
// Variable shifts (AVX2 only)
case ZYDIS_MNEMONIC_VPSLLVD:
result = emu_vpsllvd(fetchYmm(1), fetchYmm(2));
handled = true;
break;
case ZYDIS_MNEMONIC_VPSLLVQ:
result = emu_vpsllvq(fetchYmm(1), fetchYmm(2));
handled = true;
break;
case ZYDIS_MNEMONIC_VPSRLVD:
result = emu_vpsrlvd(fetchYmm(1), fetchYmm(2));
handled = true;
break;
case ZYDIS_MNEMONIC_VPSRLVQ:
result = emu_vpsrlvq(fetchYmm(1), fetchYmm(2));
handled = true;
break;
case ZYDIS_MNEMONIC_VPSRAVD:
result = emu_vpsravd(fetchYmm(1), fetchYmm(2));
handled = true;
break;
// Uniform shifts 256-bit
case ZYDIS_MNEMONIC_VPSLLW:
if (instr->operand_width == 256) {
result = emu_vpsllw(fetchYmm(1), fetchXmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPSLLD:
if (instr->operand_width == 256) {
result = emu_vpslld(fetchYmm(1), fetchXmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPSLLQ:
if (instr->operand_width == 256) {
result = emu_vpsllq(fetchYmm(1), fetchXmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPSRLW:
if (instr->operand_width == 256) {
result = emu_vpsrlw(fetchYmm(1), fetchXmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPSRLD:
if (instr->operand_width == 256) {
result = emu_vpsrld(fetchYmm(1), fetchXmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPSRLQ:
if (instr->operand_width == 256) {
result = emu_vpsrlq(fetchYmm(1), fetchXmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPSRAW:
if (instr->operand_width == 256) {
result = emu_vpsraw(fetchYmm(1), fetchXmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPSRAD:
if (instr->operand_width == 256) {
result = emu_vpsrad(fetchYmm(1), fetchXmm(2));
handled = true;
}
break;
// Permute (AVX2 only)
case ZYDIS_MNEMONIC_VPERMD:
result = emu_vpermd(fetchYmm(1), fetchYmm(2));
handled = true;
break;
case ZYDIS_MNEMONIC_VPERMQ:
result = emu_vpermq(fetchYmm(1), (uint8_t)operands[2].imm.value.u);
handled = true;
break;
case ZYDIS_MNEMONIC_VPERMPS:
result = _mm256_castps_si256(emu_vpermps(fetchYmm(1), _mm256_castsi256_ps(fetchYmm(2))));
handled = true;
break;
case ZYDIS_MNEMONIC_VPERMPD:
result = _mm256_castpd_si256(emu_vpermpd(_mm256_castsi256_pd(fetchYmm(1)), (uint8_t)operands[2].imm.value.u));
handled = true;
break;
// Broadcast (AVX2 only)
case ZYDIS_MNEMONIC_VPBROADCASTB: {
int8_t val;
if (operands[1].type == ZYDIS_OPERAND_TYPE_REGISTER) {
__m128i src = fetchXmm(1);
val = ((int8_t*)&src)[0];
} else {
val = *(int8_t*)ResolveMemory(ctx, &operands[1], instr);
}
result = emu_vpbroadcastb(val);
handled = true;
break;
}
case ZYDIS_MNEMONIC_VPBROADCASTW: {
int16_t val;
if (operands[1].type == ZYDIS_OPERAND_TYPE_REGISTER) {
__m128i src = fetchXmm(1);
val = ((int16_t*)&src)[0];
} else {
val = *(int16_t*)ResolveMemory(ctx, &operands[1], instr);
}
result = emu_vpbroadcastw(val);
handled = true;
break;
}
case ZYDIS_MNEMONIC_VPBROADCASTD: {
int32_t val;
if (operands[1].type == ZYDIS_OPERAND_TYPE_REGISTER) {
__m128i src = fetchXmm(1);
val = ((int32_t*)&src)[0];
} else {
val = *(int32_t*)ResolveMemory(ctx, &operands[1], instr);
}
result = emu_vpbroadcastd(val);
handled = true;
break;
}
case ZYDIS_MNEMONIC_VPBROADCASTQ: {
int64_t val;
if (operands[1].type == ZYDIS_OPERAND_TYPE_REGISTER) {
__m128i src = fetchXmm(1);
val = ((int64_t*)&src)[0];
} else {
val = *(int64_t*)ResolveMemory(ctx, &operands[1], instr);
}
result = emu_vpbroadcastq(val);
handled = true;
break;
}
// Gather (AVX2 only) - these are tricky because dst/mask update
case ZYDIS_MNEMONIC_VPGATHERDD: {
void* base = ResolveMemory(ctx, &operands[1], instr);
__m256i indices;
GetYmmRegister(ctx, ZydisRegToIndex(operands[1].mem.index), &indices);
__m256i mask = fetchYmm(2);
int scale = operands[1].mem.scale;
result = emu_vpgatherdd(base, indices, mask, scale);
// Clear mask register
__m256i zeroMask = _mm256_setzero_si256();
SetYmmRegister(ctx, ZydisRegToIndex(operands[2].reg.value), zeroMask);
handled = true;
break;
}
// Blend
case ZYDIS_MNEMONIC_VPBLENDD:
result = emu_vpblendd(fetchYmm(1), fetchYmm(2), (uint8_t)operands[3].imm.value.u);
handled = true;
break;
// Unpack
case ZYDIS_MNEMONIC_VPUNPCKLBW:
if (instr->operand_width == 256) {
result = emu_vpunpcklbw(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPUNPCKHBW:
if (instr->operand_width == 256) {
result = emu_vpunpckhbw(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPUNPCKLWD:
if (instr->operand_width == 256) {
result = emu_vpunpcklwd(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPUNPCKHWD:
if (instr->operand_width == 256) {
result = emu_vpunpckhwd(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPUNPCKLDQ:
if (instr->operand_width == 256) {
result = emu_vpunpckldq(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPUNPCKHDQ:
if (instr->operand_width == 256) {
result = emu_vpunpckhdq(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPUNPCKLQDQ:
if (instr->operand_width == 256) {
result = emu_vpunpcklqdq(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPUNPCKHQDQ:
if (instr->operand_width == 256) {
result = emu_vpunpckhqdq(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
// Multiply
case ZYDIS_MNEMONIC_VPMULLD:
if (instr->operand_width == 256) {
result = emu_vpmulld(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPMULLW:
if (instr->operand_width == 256) {
result = emu_vpmullw(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPMULHW:
if (instr->operand_width == 256) {
result = emu_vpmulhw(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPMULUDQ:
if (instr->operand_width == 256) {
result = emu_vpmuludq(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
// Insert/Extract
case ZYDIS_MNEMONIC_VINSERTI128:
result = emu_vinserti128(fetchYmm(1), fetchXmm(2), (uint8_t)operands[3].imm.value.u);
handled = true;
break;
case ZYDIS_MNEMONIC_VEXTRACTI128: {
__m128i extracted = emu_vextracti128(fetchYmm(1), (uint8_t)operands[2].imm.value.u);
if (operands[0].type == ZYDIS_OPERAND_TYPE_REGISTER) {
result = _mm256_castsi128_si256(extracted);
// Zero upper 128 bits
result = _mm256_insertf128_si256(result, _mm_setzero_si128(), 1);
} else {
_mm_storeu_si128((__m128i*)ResolveMemory(ctx, &operands[0], instr), extracted);
return true; // Don't set YMM
}
handled = true;
break;
}
// Min/Max
case ZYDIS_MNEMONIC_VPMINSB:
if (instr->operand_width == 256) { result = emu_vpminsb(fetchYmm(1), fetchYmm(2)); handled = true; }
break;
case ZYDIS_MNEMONIC_VPMINUB:
if (instr->operand_width == 256) { result = emu_vpminub(fetchYmm(1), fetchYmm(2)); handled = true; }
break;
case ZYDIS_MNEMONIC_VPMINSW:
if (instr->operand_width == 256) { result = emu_vpminsw(fetchYmm(1), fetchYmm(2)); handled = true; }
break;
case ZYDIS_MNEMONIC_VPMINUW:
if (instr->operand_width == 256) { result = emu_vpminuw(fetchYmm(1), fetchYmm(2)); handled = true; }
break;
case ZYDIS_MNEMONIC_VPMINSD:
if (instr->operand_width == 256) { result = emu_vpminsd(fetchYmm(1), fetchYmm(2)); handled = true; }
break;
case ZYDIS_MNEMONIC_VPMINUD:
if (instr->operand_width == 256) { result = emu_vpminud(fetchYmm(1), fetchYmm(2)); handled = true; }
break;
case ZYDIS_MNEMONIC_VPMAXSB:
if (instr->operand_width == 256) { result = emu_vpmaxsb(fetchYmm(1), fetchYmm(2)); handled = true; }
break;
case ZYDIS_MNEMONIC_VPMAXUB:
if (instr->operand_width == 256) { result = emu_vpmaxub(fetchYmm(1), fetchYmm(2)); handled = true; }
break;
case ZYDIS_MNEMONIC_VPMAXSW:
if (instr->operand_width == 256) { result = emu_vpmaxsw(fetchYmm(1), fetchYmm(2)); handled = true; }
break;
case ZYDIS_MNEMONIC_VPMAXUW:
if (instr->operand_width == 256) { result = emu_vpmaxuw(fetchYmm(1), fetchYmm(2)); handled = true; }
break;
case ZYDIS_MNEMONIC_VPMAXSD:
if (instr->operand_width == 256) { result = emu_vpmaxsd(fetchYmm(1), fetchYmm(2)); handled = true; }
break;
case ZYDIS_MNEMONIC_VPMAXUD:
if (instr->operand_width == 256) { result = emu_vpmaxud(fetchYmm(1), fetchYmm(2)); handled = true; }
break;
// Shuffle
case ZYDIS_MNEMONIC_VPSHUFB:
if (instr->operand_width == 256) {
result = emu_vpshufb(fetchYmm(1), fetchYmm(2));
handled = true;
}
break;
case ZYDIS_MNEMONIC_VPSHUFD:
if (instr->operand_width == 256) {
result = emu_vpshufd(fetchYmm(1), (uint8_t)operands[2].imm.value.u);
handled = true;
}
break;
// Abs
case ZYDIS_MNEMONIC_VPABSB:
if (instr->operand_width == 256) { result = emu_vpabsb(fetchYmm(1)); handled = true; }
break;
case ZYDIS_MNEMONIC_VPABSW:
if (instr->operand_width == 256) { result = emu_vpabsw(fetchYmm(1)); handled = true; }
break;
case ZYDIS_MNEMONIC_VPABSD:
if (instr->operand_width == 256) { result = emu_vpabsd(fetchYmm(1)); handled = true; }
break;
// Sign/zero extend
case ZYDIS_MNEMONIC_VPMOVSXBW:
result = emu_vpmovsxbw(fetchXmm(1)); handled = true;
break;
case ZYDIS_MNEMONIC_VPMOVSXBD:
result = emu_vpmovsxbd(fetchXmm(1)); handled = true;
break;
case ZYDIS_MNEMONIC_VPMOVSXBQ:
result = emu_vpmovsxbq(fetchXmm(1)); handled = true;
break;
case ZYDIS_MNEMONIC_VPMOVSXWD:
result = emu_vpmovsxwd(fetchXmm(1)); handled = true;
break;
case ZYDIS_MNEMONIC_VPMOVSXWQ:
result = emu_vpmovsxwq(fetchXmm(1)); handled = true;
break;
case ZYDIS_MNEMONIC_VPMOVSXDQ:
result = emu_vpmovsxdq(fetchXmm(1)); handled = true;
break;
case ZYDIS_MNEMONIC_VPMOVZXBW:
result = emu_vpmovzxbw(fetchXmm(1)); handled = true;
break;
case ZYDIS_MNEMONIC_VPMOVZXBD:
result = emu_vpmovzxbd(fetchXmm(1)); handled = true;
break;
case ZYDIS_MNEMONIC_VPMOVZXBQ:
result = emu_vpmovzxbq(fetchXmm(1)); handled = true;
break;
case ZYDIS_MNEMONIC_VPMOVZXWD:
result = emu_vpmovzxwd(fetchXmm(1)); handled = true;
break;
case ZYDIS_MNEMONIC_VPMOVZXWQ:
result = emu_vpmovzxwq(fetchXmm(1)); handled = true;
break;
case ZYDIS_MNEMONIC_VPMOVZXDQ:
result = emu_vpmovzxdq(fetchXmm(1)); handled = true;
break;
default:
break;
}
if (handled) {
SetYmmRegister(ctx, dstIdx, result);
InterlockedIncrement64(&g_emulatedCount);
}
return handled;
}
// ============================================
// Exception handler
// ============================================
/**
 * Vectored exception handler that tries to emulate the faulting instruction.
 *
 * Only EXCEPTION_ILLEGAL_INSTRUCTION faults whose first byte at RIP is a VEX
 * prefix (0xC4 or 0xC5) are examined. The instruction is decoded with the
 * global Zydis decoder and passed to EmulateInstruction(); on success RIP is
 * advanced past the instruction and execution resumes. Anything else is
 * counted in g_missedCount (when it decoded), logged, and left to the next
 * handler.
 *
 * @param ep Exception record and thread context for the fault.
 * @return EXCEPTION_CONTINUE_EXECUTION when the instruction was emulated,
 *         EXCEPTION_CONTINUE_SEARCH in every other case.
 */
LONG CALLBACK AVX2ExceptionHandler(EXCEPTION_POINTERS* ep) {
    if (ep->ExceptionRecord->ExceptionCode != EXCEPTION_ILLEGAL_INSTRUCTION)
        return EXCEPTION_CONTINUE_SEARCH;

    CONTEXT* ctx = ep->ContextRecord;
    const uint8_t* rip = (const uint8_t*)ctx->Rip;

    // Quick VEX prefix check
    // NOTE(review): an instruction that carries a legacy prefix (segment or
    // address-size override) ahead of the VEX byte is rejected here - confirm
    // whether such encodings need to be emulated.
    if (rip[0] != 0xC4 && rip[0] != 0xC5)
        return EXCEPTION_CONTINUE_SEARCH;

    ZydisDecodedInstruction instr;
    ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT];
    // 15 bytes is the architectural maximum length of one x86 instruction.
    if (!ZYAN_SUCCESS(ZydisDecoderDecodeFull(&g_decoder, rip, 15, &instr, operands)))
        return EXCEPTION_CONTINUE_SEARCH;

    if (EmulateInstruction(ctx, &instr, operands)) {
        ctx->Rip += instr.length;
        return EXCEPTION_CONTINUE_EXECUTION;
    }

    // Log unhandled instruction
    InterlockedIncrement64(&g_missedCount);

    // The formatter can fail; only print buf when it was actually filled in,
    // otherwise an uninitialized buffer would be handed to "%s".
    char buf[256];
    const char* text = "<unformattable instruction>";
    ZydisFormatter formatter;
    if (ZYAN_SUCCESS(ZydisFormatterInit(&formatter, ZYDIS_FORMATTER_STYLE_INTEL)) &&
        ZYAN_SUCCESS(ZydisFormatterFormatInstruction(&formatter, &instr, operands, instr.operand_count,
                                                     buf, sizeof(buf), ctx->Rip, ZYAN_NULL))) {
        text = buf;
    }
    Log("UNHANDLED @ 0x%llX: %s\n", ctx->Rip, text);
    return EXCEPTION_CONTINUE_SEARCH;
}
// ============================================
// Public API
// ============================================
void InstallAVX2Handler() {
ZydisDecoderInit(&g_decoder, ZYDIS_MACHINE_MODE_LONG_64, ZYDIS_STACK_WIDTH_64);
g_initialized = true;
g_handlerHandle = AddVectoredExceptionHandler(1, AVX2ExceptionHandler);
Log("AVX2Emu installed\n");
}
void RemoveAVX2Handler() {
if (g_handlerHandle) {
RemoveVectoredExceptionHandler(g_handlerHandle);
g_handlerHandle = nullptr;
}
Log("AVX2Emu removed. Emulated: %lld, Missed: %lld\n", g_emulatedCount, g_missedCount);
if (g_logFile) {
fclose(g_logFile);
g_logFile = nullptr;
}
}
#pragma once
// Initializes the instruction decoder and registers the AVX2 emulation
// vectored exception handler, then notes the installation in the log.
void InstallAVX2Handler();
// Unregisters the handler if one is registered, logs the emulated/missed
// counters, and closes the log file.
void RemoveAVX2Handler();
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#include "avx2emu.h"
#include "version_proxy.h"
/**
 * DLL entry point.
 *
 * On process attach: turns off thread attach/detach notifications for this
 * module, loads the real version.dll exports, and installs the AVX2 handler.
 * On process detach: removes the handler, then releases the real version.dll.
 * Other notifications are ignored.
 *
 * @param hinst    Handle of this module.
 * @param reason   Notification code from the loader.
 * @param reserved Not used.
 * @return Always TRUE.
 */
BOOL WINAPI DllMain(HINSTANCE hinst, DWORD reason, LPVOID reserved) {
    if (reason == DLL_PROCESS_ATTACH) {
        DisableThreadLibraryCalls(hinst);
        // The proxy is set up before the handler is installed.
        InitVersionProxy();
        InstallAVX2Handler();
    } else if (reason == DLL_PROCESS_DETACH) {
        // Tear down in the reverse order of setup.
        RemoveAVX2Handler();
        FreeVersionProxy();
    }
    return TRUE;
}
; Module-definition file for the version.dll proxy.
; Each entry under EXPORTS publishes the name on the left and routes it to
; the Export_* forwarding function named on the right.
LIBRARY version
EXPORTS
GetFileVersionInfoA=Export_GetFileVersionInfoA
GetFileVersionInfoByHandle=Export_GetFileVersionInfoByHandle
GetFileVersionInfoExA=Export_GetFileVersionInfoExA
GetFileVersionInfoExW=Export_GetFileVersionInfoExW
GetFileVersionInfoSizeA=Export_GetFileVersionInfoSizeA
GetFileVersionInfoSizeExA=Export_GetFileVersionInfoSizeExA
GetFileVersionInfoSizeExW=Export_GetFileVersionInfoSizeExW
GetFileVersionInfoSizeW=Export_GetFileVersionInfoSizeW
GetFileVersionInfoW=Export_GetFileVersionInfoW
VerFindFileA=Export_VerFindFileA
VerFindFileW=Export_VerFindFileW
VerInstallFileA=Export_VerInstallFileA
VerInstallFileW=Export_VerInstallFileW
VerLanguageNameA=Export_VerLanguageNameA
VerLanguageNameW=Export_VerLanguageNameW
VerQueryValueA=Export_VerQueryValueA
VerQueryValueW=Export_VerQueryValueW
#include "version_proxy.h"
// Module handle of the system version.dll as returned by LoadLibraryA in
// InitVersionProxy; reset to nullptr by FreeVersionProxy.
static HMODULE g_realVersion = nullptr;
// Function pointers: one per forwarded export, assigned from GetProcAddress
// in InitVersionProxy. Each is nullptr until then, and stays nullptr when
// the library could not be loaded.
static FARPROC fp_GetFileVersionInfoA = nullptr;
static FARPROC fp_GetFileVersionInfoByHandle = nullptr;
static FARPROC fp_GetFileVersionInfoExA = nullptr;
static FARPROC fp_GetFileVersionInfoExW = nullptr;
static FARPROC fp_GetFileVersionInfoSizeA = nullptr;
static FARPROC fp_GetFileVersionInfoSizeExA = nullptr;
static FARPROC fp_GetFileVersionInfoSizeExW = nullptr;
static FARPROC fp_GetFileVersionInfoSizeW = nullptr;
static FARPROC fp_GetFileVersionInfoW = nullptr;
static FARPROC fp_VerFindFileA = nullptr;
static FARPROC fp_VerFindFileW = nullptr;
static FARPROC fp_VerInstallFileA = nullptr;
static FARPROC fp_VerInstallFileW = nullptr;
static FARPROC fp_VerLanguageNameA = nullptr;
static FARPROC fp_VerLanguageNameW = nullptr;
static FARPROC fp_VerQueryValueA = nullptr;
static FARPROC fp_VerQueryValueW = nullptr;
void InitVersionProxy() {
char sysPath[MAX_PATH];
GetSystemDirectoryA(sysPath, MAX_PATH);
strcat_s(sysPath, "\\version.dll");
g_realVersion = LoadLibraryA(sysPath);
if (!g_realVersion) {
MessageBoxA(NULL, "Failed to load system version.dll", "AVX2Emu", MB_ICONERROR);
return;
}
fp_GetFileVersionInfoA = GetProcAddress(g_realVersion, "GetFileVersionInfoA");
fp_GetFileVersionInfoByHandle = GetProcAddress(g_realVersion, "GetFileVersionInfoByHandle");
fp_GetFileVersionInfoExA = GetProcAddress(g_realVersion, "GetFileVersionInfoExA");
fp_GetFileVersionInfoExW = GetProcAddress(g_realVersion, "GetFileVersionInfoExW");
fp_GetFileVersionInfoSizeA = GetProcAddress(g_realVersion, "GetFileVersionInfoSizeA");
fp_GetFileVersionInfoSizeExA = GetProcAddress(g_realVersion, "GetFileVersionInfoSizeExA");
fp_GetFileVersionInfoSizeExW = GetProcAddress(g_realVersion, "GetFileVersionInfoSizeExW");
fp_GetFileVersionInfoSizeW = GetProcAddress(g_realVersion, "GetFileVersionInfoSizeW");
fp_GetFileVersionInfoW = GetProcAddress(g_realVersion, "GetFileVersionInfoW");
fp_VerFindFileA = GetProcAddress(g_realVersion, "VerFindFileA");
fp_VerFindFileW = GetProcAddress(g_realVersion, "VerFindFileW");
fp_VerInstallFileA = GetProcAddress(g_realVersion, "VerInstallFileA");
fp_VerInstallFileW = GetProcAddress(g_realVersion, "VerInstallFileW");
fp_VerLanguageNameA = GetProcAddress(g_realVersion, "VerLanguageNameA");
fp_VerLanguageNameW = GetProcAddress(g_realVersion, "VerLanguageNameW");
fp_VerQueryValueA = GetProcAddress(g_realVersion, "VerQueryValueA");
fp_VerQueryValueW = GetProcAddress(g_realVersion, "VerQueryValueW");
}
/**
 * Releases the real version.dll if it is currently loaded and clears the
 * stored module handle. Does nothing when no library is loaded.
 */
void FreeVersionProxy() {
    if (!g_realVersion) {
        return;
    }
    FreeLibrary(g_realVersion);
    g_realVersion = nullptr;
}
// Forwarding functions
// Forwarding functions
// Every Export_* function casts its cached FARPROC to the matching signature
// and passes its arguments through unchanged, returning the callee's result.
// NOTE(review): none of these checks the pointer for nullptr; if
// InitVersionProxy did not populate it the call goes through a null pointer.
extern "C" {

BOOL WINAPI Export_GetFileVersionInfoA(LPCSTR lptstrFilename, DWORD dwHandle, DWORD dwLen, LPVOID lpData) {
    using Target = BOOL(WINAPI*)(LPCSTR, DWORD, DWORD, LPVOID);
    const Target real = reinterpret_cast<Target>(fp_GetFileVersionInfoA);
    return real(lptstrFilename, dwHandle, dwLen, lpData);
}

BOOL WINAPI Export_GetFileVersionInfoByHandle(DWORD dwFlags, HANDLE hFile, DWORD dwLen, LPVOID lpData) {
    using Target = BOOL(WINAPI*)(DWORD, HANDLE, DWORD, LPVOID);
    const Target real = reinterpret_cast<Target>(fp_GetFileVersionInfoByHandle);
    return real(dwFlags, hFile, dwLen, lpData);
}

BOOL WINAPI Export_GetFileVersionInfoExA(DWORD dwFlags, LPCSTR lpwstrFilename, DWORD dwHandle, DWORD dwLen, LPVOID lpData) {
    using Target = BOOL(WINAPI*)(DWORD, LPCSTR, DWORD, DWORD, LPVOID);
    const Target real = reinterpret_cast<Target>(fp_GetFileVersionInfoExA);
    return real(dwFlags, lpwstrFilename, dwHandle, dwLen, lpData);
}

BOOL WINAPI Export_GetFileVersionInfoExW(DWORD dwFlags, LPCWSTR lpwstrFilename, DWORD dwHandle, DWORD dwLen, LPVOID lpData) {
    using Target = BOOL(WINAPI*)(DWORD, LPCWSTR, DWORD, DWORD, LPVOID);
    const Target real = reinterpret_cast<Target>(fp_GetFileVersionInfoExW);
    return real(dwFlags, lpwstrFilename, dwHandle, dwLen, lpData);
}

DWORD WINAPI Export_GetFileVersionInfoSizeA(LPCSTR lptstrFilename, LPDWORD lpdwHandle) {
    using Target = DWORD(WINAPI*)(LPCSTR, LPDWORD);
    const Target real = reinterpret_cast<Target>(fp_GetFileVersionInfoSizeA);
    return real(lptstrFilename, lpdwHandle);
}

DWORD WINAPI Export_GetFileVersionInfoSizeExA(DWORD dwFlags, LPCSTR lpwstrFilename, LPDWORD lpdwHandle) {
    using Target = DWORD(WINAPI*)(DWORD, LPCSTR, LPDWORD);
    const Target real = reinterpret_cast<Target>(fp_GetFileVersionInfoSizeExA);
    return real(dwFlags, lpwstrFilename, lpdwHandle);
}

DWORD WINAPI Export_GetFileVersionInfoSizeExW(DWORD dwFlags, LPCWSTR lpwstrFilename, LPDWORD lpdwHandle) {
    using Target = DWORD(WINAPI*)(DWORD, LPCWSTR, LPDWORD);
    const Target real = reinterpret_cast<Target>(fp_GetFileVersionInfoSizeExW);
    return real(dwFlags, lpwstrFilename, lpdwHandle);
}

DWORD WINAPI Export_GetFileVersionInfoSizeW(LPCWSTR lptstrFilename, LPDWORD lpdwHandle) {
    using Target = DWORD(WINAPI*)(LPCWSTR, LPDWORD);
    const Target real = reinterpret_cast<Target>(fp_GetFileVersionInfoSizeW);
    return real(lptstrFilename, lpdwHandle);
}

BOOL WINAPI Export_GetFileVersionInfoW(LPCWSTR lptstrFilename, DWORD dwHandle, DWORD dwLen, LPVOID lpData) {
    using Target = BOOL(WINAPI*)(LPCWSTR, DWORD, DWORD, LPVOID);
    const Target real = reinterpret_cast<Target>(fp_GetFileVersionInfoW);
    return real(lptstrFilename, dwHandle, dwLen, lpData);
}

DWORD WINAPI Export_VerFindFileA(DWORD uFlags, LPCSTR szFileName, LPCSTR szWinDir, LPCSTR szAppDir, LPSTR szCurDir, PUINT puCurDirLen, LPSTR szDestDir, PUINT puDestDirLen) {
    using Target = DWORD(WINAPI*)(DWORD, LPCSTR, LPCSTR, LPCSTR, LPSTR, PUINT, LPSTR, PUINT);
    const Target real = reinterpret_cast<Target>(fp_VerFindFileA);
    return real(uFlags, szFileName, szWinDir, szAppDir, szCurDir, puCurDirLen, szDestDir, puDestDirLen);
}

DWORD WINAPI Export_VerFindFileW(DWORD uFlags, LPCWSTR szFileName, LPCWSTR szWinDir, LPCWSTR szAppDir, LPWSTR szCurDir, PUINT puCurDirLen, LPWSTR szDestDir, PUINT puDestDirLen) {
    using Target = DWORD(WINAPI*)(DWORD, LPCWSTR, LPCWSTR, LPCWSTR, LPWSTR, PUINT, LPWSTR, PUINT);
    const Target real = reinterpret_cast<Target>(fp_VerFindFileW);
    return real(uFlags, szFileName, szWinDir, szAppDir, szCurDir, puCurDirLen, szDestDir, puDestDirLen);
}

DWORD WINAPI Export_VerInstallFileA(DWORD uFlags, LPCSTR szSrcFileName, LPCSTR szDestFileName, LPCSTR szSrcDir, LPCSTR szDestDir, LPCSTR szCurDir, LPSTR szTmpFile, PUINT puTmpFileLen) {
    using Target = DWORD(WINAPI*)(DWORD, LPCSTR, LPCSTR, LPCSTR, LPCSTR, LPCSTR, LPSTR, PUINT);
    const Target real = reinterpret_cast<Target>(fp_VerInstallFileA);
    return real(uFlags, szSrcFileName, szDestFileName, szSrcDir, szDestDir, szCurDir, szTmpFile, puTmpFileLen);
}

DWORD WINAPI Export_VerInstallFileW(DWORD uFlags, LPCWSTR szSrcFileName, LPCWSTR szDestFileName, LPCWSTR szSrcDir, LPCWSTR szDestDir, LPCWSTR szCurDir, LPWSTR szTmpFile, PUINT puTmpFileLen) {
    using Target = DWORD(WINAPI*)(DWORD, LPCWSTR, LPCWSTR, LPCWSTR, LPCWSTR, LPCWSTR, LPWSTR, PUINT);
    const Target real = reinterpret_cast<Target>(fp_VerInstallFileW);
    return real(uFlags, szSrcFileName, szDestFileName, szSrcDir, szDestDir, szCurDir, szTmpFile, puTmpFileLen);
}

DWORD WINAPI Export_VerLanguageNameA(DWORD wLang, LPSTR szLang, DWORD cchLang) {
    using Target = DWORD(WINAPI*)(DWORD, LPSTR, DWORD);
    const Target real = reinterpret_cast<Target>(fp_VerLanguageNameA);
    return real(wLang, szLang, cchLang);
}

DWORD WINAPI Export_VerLanguageNameW(DWORD wLang, LPWSTR szLang, DWORD cchLang) {
    using Target = DWORD(WINAPI*)(DWORD, LPWSTR, DWORD);
    const Target real = reinterpret_cast<Target>(fp_VerLanguageNameW);
    return real(wLang, szLang, cchLang);
}

BOOL WINAPI Export_VerQueryValueA(LPCVOID pBlock, LPCSTR lpSubBlock, LPVOID* lplpBuffer, PUINT puLen) {
    using Target = BOOL(WINAPI*)(LPCVOID, LPCSTR, LPVOID*, PUINT);
    const Target real = reinterpret_cast<Target>(fp_VerQueryValueA);
    return real(pBlock, lpSubBlock, lplpBuffer, puLen);
}

BOOL WINAPI Export_VerQueryValueW(LPCVOID pBlock, LPCWSTR lpSubBlock, LPVOID* lplpBuffer, PUINT puLen) {
    using Target = BOOL(WINAPI*)(LPCVOID, LPCWSTR, LPVOID*, PUINT);
    const Target real = reinterpret_cast<Target>(fp_VerQueryValueW);
    return real(pBlock, lpSubBlock, lplpBuffer, puLen);
}

} // extern "C"
#pragma once
#include <Windows.h>
// Loads version.dll from the Windows system directory and resolves the
// exports this proxy forwards; shows an error message box if loading fails.
void InitVersionProxy();
// Releases the library loaded by InitVersionProxy, if any.
void FreeVersionProxy();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment