Skip to content

Instantly share code, notes, and snippets.

@zeroday0619
Last active May 20, 2025 14:59
Show Gist options
  • Select an option

  • Save zeroday0619/9fad9ad321f43ac3094824591bcc8484 to your computer and use it in GitHub Desktop.

Select an option

Save zeroday0619/9fad9ad321f43ac3094824591bcc8484 to your computer and use it in GitHub Desktop.
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <math.h>
#include <immintrin.h>
#include <sys/sysinfo.h>
#include <unistd.h>
#include <time.h>
#include <sched.h>
#include <numa.h>
#include <numaif.h>
/* Presumably the CPU cache line size in bytes; not referenced in the visible code. */
#define CACHELINE_SIZE 64
/* Number of floats in each worker's random operand buffer (two 8-float AVX operands). */
#define FLOAT_ARR_SIZE 16
/* Number of ints in each worker's random operand buffer (one 8-int AVX2 operand). */
#define INT_ARR_SIZE 8
/* Forward declarations of the printing helpers defined further down. */
void print_float_array(const char *label, float *array, int size);
void print_int_array(const char *label, int *array, int size);
/*
 * Populate both buffers with pseudo-random values drawn from rand().
 *
 * float_array receives float_size values scaled into the range [0, 100];
 * int_array receives int_size values in the range [0, 999].
 * All floats are generated before any int, so for a given seed the values
 * consumed from rand() always land in the same slots.
 */
void fill_random_data(float *float_array, int *int_array, int float_size, int int_size) {
    int idx = 0;

    while (idx < float_size) {
        float_array[idx] = ((float)rand() / RAND_MAX) * 100.0f;
        idx++;
    }

    idx = 0;
    while (idx < int_size) {
        int_array[idx] = rand() % 1000;
        idx++;
    }
}
/*
 * Write a labelled float array to stdout on a single line, in the form
 * " label: [a, b, c]". Every element is printed with one decimal place and
 * consecutive elements are separated by ", ".
 */
void print_float_array(const char *label, float *array, int size) {
    int pos;

    printf(" %s: [", label);
    for (pos = 0; pos < size; ++pos) {
        const char *sep = ", ";

        /* The final element is not followed by a separator. */
        if (pos + 1 == size)
            sep = "";
        printf("%.1f%s", array[pos], sep);
    }
    printf("]\n");
}
/*
 * Write a labelled int array to stdout on a single line, in the form
 * " label: [a, b, c]". Consecutive elements are separated by ", ".
 */
void print_int_array(const char *label, int *array, int size) {
    int pos;

    printf(" %s: [", label);
    for (pos = 0; pos < size; ++pos) {
        const char *sep = ", ";

        /* The final element is not followed by a separator. */
        if (pos + 1 == size)
            sep = "";
        printf("%d%s", array[pos], sep);
    }
    printf("]\n");
}
/*
 * Pin the calling thread to a single logical CPU derived from thread_id.
 *
 * Mapping (unchanged for num_cores >= 2): with half = num_cores / 2, thread
 * ids below half are placed on the even CPU 2 * (thread_id % half); all other
 * ids are placed on the odd CPU 2 * (thread_id % half) + 1.
 * NOTE(review): this presumably assumes hyper-thread siblings are numbered
 * 2k / 2k+1 -- confirm against the topology of the target machine.
 *
 * thread_id: zero-based worker index; negative values are rejected.
 * num_cores: number of logical CPUs. Values below 1 are rejected; a value of
 *            1 pins to CPU 0 (this case used to compute thread_id % 0).
 *
 * Nothing is returned. Any failure is reported on stderr and leaves the
 * thread's affinity mask as it was.
 */
void set_thread_affinity(long thread_id, int num_cores) {
    if (num_cores < 1 || thread_id < 0) {
        fprintf(stderr, "set_thread_affinity: invalid arguments (thread %ld, %d cores)\n",
                thread_id, num_cores);
        return;
    }

    int half = num_cores / 2;
    int logical_core = 0; /* only choice on a single-CPU system */
    if (half > 0) {
        int phys_core = (int)(thread_id % half);
        logical_core = (thread_id < half) ? phys_core * 2 : phys_core * 2 + 1;
    }

    /* cpu_set_t only has room for CPU_SETSIZE CPUs. */
    if (logical_core >= CPU_SETSIZE) {
        fprintf(stderr, "Thread %ld: CPU %d does not fit in cpu_set_t\n",
                thread_id, logical_core);
        return;
    }

    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(logical_core, &cpuset);

    /* pthread_setaffinity_np returns an error number directly (not via errno). */
    int rc = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
    if (rc != 0) {
        fprintf(stderr, "Thread %ld: could not pin to CPU %d (error %d)\n",
                thread_id, logical_core, rc);
    }
}
/*
 * Worker thread body: repeatedly performs MMX, SSE, AVX and AVX2 additions on
 * random operands and prints the latest results every 10,000,000 iterations,
 * after which fresh random operands are generated.
 *
 * arg: the zero-based worker index, passed by value inside the pointer.
 *
 * The thread first pins itself to one logical CPU via set_thread_affinity()
 * and then allocates its operand buffers on a NUMA node with
 * numa_alloc_onnode().
 *
 * The function leaves through pthread_exit(NULL) if a buffer cannot be
 * allocated. Otherwise the while (1) loop has no exit, so the cleanup code
 * after it is not reached.
 */
void *stress_test(void *arg) {
    long thread_id = (long)arg;
    int num_cores = get_nprocs();
    set_thread_affinity(thread_id, num_cores);

    /*
     * Pick the memory node from the CPU this thread is pinned to.
     * numa_run_on_node() would replace the single-CPU affinity set above with
     * the mask of every CPU on the node, so it is only used as a fallback
     * when the thread did not end up pinned to exactly one CPU.
     */
    int node = -1;
    cpu_set_t current_mask;
    CPU_ZERO(&current_mask);
    if (pthread_getaffinity_np(pthread_self(), sizeof(current_mask), &current_mask) == 0 &&
        CPU_COUNT(&current_mask) == 1) {
        int cpu = sched_getcpu();
        if (cpu >= 0)
            node = numa_node_of_cpu(cpu);
    }
    if (node < 0) {
        /* Original behaviour: spread workers round-robin over the nodes. */
        int maxnode = numa_max_node();
        node = (int)(thread_id % (maxnode + 1));
        numa_run_on_node(node);
    }

    /*
     * The aligned loads below need 16/32-byte aligned buffers.
     * NOTE(review): this relies on numa_alloc_onnode() handing out
     * page-aligned memory -- confirm against the libnuma documentation.
     */
    float *random_float_data = numa_alloc_onnode(FLOAT_ARR_SIZE * sizeof(float), node);
    int *random_int_data = numa_alloc_onnode(INT_ARR_SIZE * sizeof(int), node);
    float sse_result[4] __attribute__((aligned(16)));
    float avx_result[8] __attribute__((aligned(32)));
    int avx2_result[8] __attribute__((aligned(32)));

    if (!random_float_data || !random_int_data) {
        fprintf(stderr, "Memory allocation failed\n");
        /* Release whichever buffer was obtained so a partial failure does not leak. */
        if (random_float_data)
            numa_free(random_float_data, FLOAT_ARR_SIZE * sizeof(float));
        if (random_int_data)
            numa_free(random_int_data, INT_ARR_SIZE * sizeof(int));
        pthread_exit(NULL);
    }

    fill_random_data(random_float_data, random_int_data, FLOAT_ARR_SIZE, INT_ARR_SIZE);

    /* SSE adds floats [0..3] + [4..7]; AVX adds floats [0..7] + [8..15]. */
    __m128 sse_vec_a = _mm_load_ps(&random_float_data[0]);
    __m128 sse_vec_b = _mm_load_ps(&random_float_data[4]);
    __m256 avx_vec_a = _mm256_load_ps(&random_float_data[0]);
    __m256 avx_vec_b = _mm256_load_ps(&random_float_data[8]);
    /*
     * NOTE(review): both AVX2 operands come from the same 8 ints (the buffer
     * holds only INT_ARR_SIZE == 8 values), so the AVX2 result is each value
     * doubled -- confirm this is intended.
     */
    __m256i avx2_vec_a = _mm256_load_si256((__m256i *)&random_int_data[0]);
    __m256i avx2_vec_b = _mm256_load_si256((__m256i *)&random_int_data[0]);
    __m64 mmx_vec_a = _mm_set_pi32(random_int_data[0], random_int_data[1]);
    __m64 mmx_vec_b = _mm_set_pi32(random_int_data[2], random_int_data[3]);
    __m64 mmx_vec_result;
    __m128 sse_vec_result;
    __m256 avx_vec_result;
    __m256i avx2_vec_result;
    double scalar_result = 0.5;
    unsigned long long counter = 0;

    while (1) {
        _mm_prefetch((const char *)&random_float_data[8], _MM_HINT_T0);
        mmx_vec_result = _mm_add_pi32(mmx_vec_a, mmx_vec_b);
        sse_vec_result = _mm_add_ps(sse_vec_a, sse_vec_b);
        _mm_store_ps(sse_result, sse_vec_result);
        avx_vec_result = _mm256_add_ps(avx_vec_a, avx_vec_b);
        _mm256_store_ps(avx_result, avx_vec_result);
        avx2_vec_result = _mm256_add_epi32(avx2_vec_a, avx2_vec_b);
        _mm256_store_si256((__m256i *)avx2_result, avx2_vec_result);
        scalar_result += sin(0.5) * cos(0.5);
        counter++;

        if (counter % 10000000 == 0) {
            /*
             * Read the two MMX lanes with intrinsics rather than through an
             * int pointer cast, then leave MMX state before calling printf
             * and the float code in fill_random_data().
             */
            int mmx_lane0 = _mm_cvtsi64_si32(mmx_vec_result);
            int mmx_lane1 = _mm_cvtsi64_si32(_mm_srli_si64(mmx_vec_result, 32));
            _mm_empty();

            printf("\nThread %ld (Node %d) Results:\n", thread_id, node);
            printf(" MMX result: [%d, %d]\n", mmx_lane0, mmx_lane1);
            print_float_array("SSE result", sse_result, 4);
            print_float_array("AVX result", avx_result, 8);
            print_int_array("AVX2 result", avx2_result, 8);

            /* Start the next round with new random operands. */
            fill_random_data(random_float_data, random_int_data, FLOAT_ARR_SIZE, INT_ARR_SIZE);
            sse_vec_a = _mm_load_ps(&random_float_data[0]);
            sse_vec_b = _mm_load_ps(&random_float_data[4]);
            avx_vec_a = _mm256_load_ps(&random_float_data[0]);
            avx_vec_b = _mm256_load_ps(&random_float_data[8]);
            avx2_vec_a = _mm256_load_si256((__m256i *)&random_int_data[0]);
            avx2_vec_b = _mm256_load_si256((__m256i *)&random_int_data[0]);
            mmx_vec_a = _mm_set_pi32(random_int_data[0], random_int_data[1]);
            mmx_vec_b = _mm_set_pi32(random_int_data[2], random_int_data[3]);
        }
    }

    /* Not reached while the loop above has no exit; kept for a future stop condition. */
    _mm_empty();
    numa_free(random_float_data, FLOAT_ARR_SIZE * sizeof(float));
    numa_free(random_int_data, INT_ARR_SIZE * sizeof(int));
    pthread_exit(NULL);
}
/*
 * Program entry point: seeds rand(), checks that NUMA is available, then
 * starts one stress_test worker per processor reported by get_nprocs() and
 * waits for all of them.
 *
 * Returns 1 when NUMA is unavailable, EXIT_FAILURE when the thread table
 * cannot be allocated or a thread cannot be created, and 0 once every worker
 * has been joined.
 */
int main() {
    srand(time(NULL));

    if (numa_available() < 0) {
        fprintf(stderr, "NUMA not supported on this system!\n");
        return 1;
    }

    int worker_count = get_nprocs();
    pthread_t *workers = malloc(worker_count * sizeof(pthread_t));
    if (workers == NULL)
        return EXIT_FAILURE;

    /* The worker index travels to the thread by value inside the void pointer. */
    long started = 0;
    while (started < worker_count) {
        int rc = pthread_create(&workers[started], NULL, stress_test, (void *)started);
        if (rc != 0) {
            fprintf(stderr, "Thread create error %ld\n", started);
            free(workers);
            return EXIT_FAILURE;
        }
        ++started;
    }

    int joined = 0;
    while (joined < worker_count) {
        pthread_join(workers[joined], NULL);
        ++joined;
    }

    free(workers);
    return 0;
}
@zeroday0619
Copy link
Author

zeroday0619 commented Mar 24, 2025

gcc -O2 -march=native stress_test.c -o stress_test -lpthread -lnuma -lm -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mavx2

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment