Last active
May 20, 2025 14:59
-
-
Save zeroday0619/9fad9ad321f43ac3094824591bcc8484 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #define _GNU_SOURCE | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <pthread.h> | |
| #include <math.h> | |
| #include <immintrin.h> | |
| #include <sys/sysinfo.h> | |
| #include <unistd.h> | |
| #include <time.h> | |
| #include <sched.h> | |
| #include <numa.h> | |
| #include <numaif.h> | |
| #define CACHELINE_SIZE 64 /* presumably the cache-line size in bytes; not referenced anywhere in the visible code */ | |
| #define FLOAT_ARR_SIZE 16 /* floats per thread buffer: read as two 8-float AVX operands and as two 4-float SSE operands */ | |
| #define INT_ARR_SIZE 8 /* ints per thread buffer: exactly one 256-bit AVX2 vector */ | |
| void print_float_array(const char *label, float *array, int size); /* forward declaration; prints the label followed by a bracketed list with one decimal place */ | |
| void print_int_array(const char *label, int *array, int size); /* forward declaration; prints the label followed by a bracketed list of decimal integers */ | |
/*
 * Refill both work buffers with pseudo-random values drawn from rand().
 *
 * float_array receives float_size values scaled into [0, 100].
 * int_array   receives int_size values in [0, 999].
 *
 * All floats are generated before any int, so for a given seed the sequence
 * of rand() calls (and therefore the output) is fully determined by the two
 * sizes. A size of zero or less leaves the corresponding buffer untouched.
 */
void fill_random_data(float *float_array, int *int_array, int float_size, int int_size) {
    float *fdst = float_array;
    for (int remaining = float_size; remaining > 0; --remaining) {
        *fdst++ = ((float)rand() / RAND_MAX) * 100.0f;
    }

    int *idst = int_array;
    for (int remaining = int_size; remaining > 0; --remaining) {
        *idst++ = rand() % 1000;
    }
}
/*
 * Write one line to stdout of the form
 *     " <label>: [v0, v1, ...]\n"
 * with each value shown to one decimal place. A size of zero or less prints
 * an empty pair of brackets.
 */
void print_float_array(const char *label, float *array, int size) {
    const char *separator = "";

    printf(" %s: [", label);
    for (int idx = 0; idx < size; ++idx) {
        /* The separator goes in front of every element except the first. */
        printf("%s%.1f", separator, array[idx]);
        separator = ", ";
    }
    printf("]\n");
}
/*
 * Write one line to stdout of the form
 *     " <label>: [v0, v1, ...]\n"
 * with each value shown as a decimal integer. A size of zero or less prints
 * an empty pair of brackets.
 */
void print_int_array(const char *label, int *array, int size) {
    const char *separator = "";

    printf(" %s: [", label);
    for (int idx = 0; idx < size; ++idx) {
        /* The separator goes in front of every element except the first. */
        printf("%s%d", separator, array[idx]);
        separator = ", ";
    }
    printf("]\n");
}
/*
 * Pin the calling thread to a single logical CPU chosen from its index.
 *
 * thread_id  zero-based worker index.
 * num_cores  number of logical CPUs to spread workers over; values below 1
 *            are treated as 1.
 *
 * Mapping: with half = num_cores / 2, the first `half` workers take the even
 * CPUs (0, 2, 4, ...) and the next `half` take the odd CPUs (1, 3, 5, ...).
 * NOTE(review): this presumably aims to fill one hardware thread per physical
 * core first, which only holds if SMT siblings are numbered 2k / 2k+1 --
 * confirm against the target machine's topology.
 *
 * Any index outside [0, 2*half) -- the only worker on a single-CPU machine,
 * or the last worker when num_cores is odd -- wraps to thread_id % num_cores.
 * The previous code divided by zero for num_cores == 1 and, for odd counts,
 * put the last worker on CPU 1 while leaving the highest CPU idle.
 *
 * A failure to apply the affinity is reported on stderr; the thread then
 * keeps whatever affinity it already had.
 */
void set_thread_affinity(long thread_id, int num_cores) {
    if (num_cores < 1) {
        num_cores = 1;
    }

    const long half = num_cores / 2;
    long logical_core;
    if (thread_id >= 0 && thread_id < 2 * half) {
        logical_core = (thread_id < half) ? thread_id * 2
                                          : (thread_id - half) * 2 + 1;
    } else {
        logical_core = thread_id % num_cores;
        if (logical_core < 0) {
            logical_core += num_cores;
        }
    }

    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET((int)logical_core, &cpuset);

    /* pthread_setaffinity_np returns an error number directly (not via errno). */
    int rc = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
    if (rc != 0) {
        fprintf(stderr, "Thread %ld: could not pin to CPU %ld (error %d)\n",
                thread_id, logical_core, rc);
    }
}
| void *stress_test(void *arg) { | |
| long thread_id = (long)arg; | |
| int num_cores = get_nprocs(); | |
| set_thread_affinity(thread_id, num_cores); | |
| int maxnode = numa_max_node(); | |
| int node = thread_id % (maxnode + 1); | |
| numa_run_on_node(node); | |
| float *random_float_data = numa_alloc_onnode(FLOAT_ARR_SIZE * sizeof(float), node); | |
| int *random_int_data = numa_alloc_onnode(INT_ARR_SIZE * sizeof(int), node); | |
| float sse_result[4] __attribute__((aligned(16))); | |
| float avx_result[8] __attribute__((aligned(32))); | |
| int avx2_result[8] __attribute__((aligned(32))); | |
| if (!random_float_data || !random_int_data) { | |
| fprintf(stderr, "Memory allocation failed\n"); | |
| pthread_exit(NULL); | |
| } | |
| fill_random_data(random_float_data, random_int_data, FLOAT_ARR_SIZE, INT_ARR_SIZE); | |
| __m128 sse_vec_a = _mm_load_ps(&random_float_data[0]); | |
| __m128 sse_vec_b = _mm_load_ps(&random_float_data[4]); | |
| __m256 avx_vec_a = _mm256_load_ps(&random_float_data[0]); | |
| __m256 avx_vec_b = _mm256_load_ps(&random_float_data[8]); | |
| __m256i avx2_vec_a = _mm256_load_si256((__m256i *)&random_int_data[0]); | |
| __m256i avx2_vec_b = _mm256_load_si256((__m256i *)&random_int_data[0]); | |
| __m64 mmx_vec_a = _mm_set_pi32(random_int_data[0], random_int_data[1]); | |
| __m64 mmx_vec_b = _mm_set_pi32(random_int_data[2], random_int_data[3]); | |
| __m64 mmx_vec_result; | |
| __m128 sse_vec_result; | |
| __m256 avx_vec_result; | |
| __m256i avx2_vec_result; | |
| double scalar_result = 0.5; | |
| unsigned long long counter = 0; | |
| while (1) { | |
| _mm_prefetch((const char*)&random_float_data[8], _MM_HINT_T0); | |
| mmx_vec_result = _mm_add_pi32(mmx_vec_a, mmx_vec_b); | |
| sse_vec_result = _mm_add_ps(sse_vec_a, sse_vec_b); | |
| _mm_store_ps(sse_result, sse_vec_result); | |
| avx_vec_result = _mm256_add_ps(avx_vec_a, avx_vec_b); | |
| _mm256_store_ps(avx_result, avx_vec_result); | |
| avx2_vec_result = _mm256_add_epi32(avx2_vec_a, avx2_vec_b); | |
| _mm256_store_si256((__m256i *)avx2_result, avx2_vec_result); | |
| scalar_result += sin(0.5) * cos(0.5); | |
| counter++; | |
| if (counter % 10000000 == 0) { | |
| printf("\nThread %ld (Node %d) Results:\n", thread_id, node); | |
| printf(" MMX result: [%d, %d]\n", ((int *)&mmx_vec_result)[0], ((int *)&mmx_vec_result)[1]); | |
| print_float_array("SSE result", sse_result, 4); | |
| print_float_array("AVX result", avx_result, 8); | |
| print_int_array("AVX2 result", avx2_result, 8); | |
| fill_random_data(random_float_data, random_int_data, FLOAT_ARR_SIZE, INT_ARR_SIZE); | |
| sse_vec_a = _mm_load_ps(&random_float_data[0]); | |
| sse_vec_b = _mm_load_ps(&random_float_data[4]); | |
| avx_vec_a = _mm256_load_ps(&random_float_data[0]); | |
| avx_vec_b = _mm256_load_ps(&random_float_data[8]); | |
| avx2_vec_a = _mm256_load_si256((__m256i *)&random_int_data[0]); | |
| avx2_vec_b = _mm256_load_si256((__m256i *)&random_int_data[0]); | |
| mmx_vec_a = _mm_set_pi32(random_int_data[0], random_int_data[1]); | |
| mmx_vec_b = _mm_set_pi32(random_int_data[2], random_int_data[3]); | |
| } | |
| } | |
| _mm_empty(); | |
| numa_free(random_float_data, FLOAT_ARR_SIZE * sizeof(float)); | |
| numa_free(random_int_data, INT_ARR_SIZE * sizeof(int)); | |
| pthread_exit(NULL); | |
| } | |
| int main() { | |
| srand(time(NULL)); | |
| if (numa_available() < 0) { | |
| fprintf(stderr, "NUMA not supported on this system!\n"); | |
| return 1; | |
| } | |
| int num_cores = get_nprocs(); | |
| pthread_t *threads = malloc(num_cores * sizeof(pthread_t)); | |
| if (!threads) return EXIT_FAILURE; | |
| for (long i = 0; i < num_cores; ++i) { | |
| if (pthread_create(&threads[i], NULL, stress_test, (void *)i)) { | |
| fprintf(stderr, "Thread create error %ld\n", i); | |
| free(threads); | |
| return EXIT_FAILURE; | |
| } | |
| } | |
| for (int i = 0; i < num_cores; ++i) | |
| pthread_join(threads[i], NULL); | |
| free(threads); | |
| return 0; | |
| } |
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
gcc -O2 -march=native stress_test.c -o stress_test -lpthread -lnuma -lm -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mavx2