I read somewhere in this forum that on a NUMA system, you need to divide the TT into a number of parts equal to the number of threads, and then for each part do a memset with the appropriate CPU affinity.
Now, my Threadripper is a NUMA system:
Code: Select all
folkert@oensoens:~$ lscpu | grep NUMA
NUMA node(s): 2
NUMA node0 CPU(s): 0-15
NUMA node1 CPU(s): 16-31
Both on cpu 0:
Code: Select all
folkert@oensoens:~$ ./a.out 0 0
cpu: 0
cpu: 0
9402
Code: Select all
folkert@oensoens:~$ ./a.out 0 16
cpu: 0
cpu: 16
9390
Maybe my test-code is broken?
Code: Select all
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <thread>
#include <time.h>
#define N (1024ll * 1024ll)
#define DT 5000000000ll
/*
 * Restrict thread `h` to the single CPU `core`, then print the CPU the
 * calling thread is currently running on.
 *
 * h    - thread whose affinity mask is replaced.
 * core - zero-based CPU index; must be in [0, CPU_SETSIZE).
 *
 * Failures are reported on stderr and are not fatal: the function still
 * yields and prints the current CPU so the caller can see where it ended up.
 * Note that the printed CPU is that of the caller, which is only the pinned
 * thread when `h` is pthread_self().
 */
void select_core(pthread_t h, int core)
{
    if (core < 0 || core >= CPU_SETSIZE) {
        /* CPU_SET is only meaningful for indices inside the fixed-size set. */
        fprintf(stderr, "select_core: core %d out of range (0..%d)\n",
                core, CPU_SETSIZE - 1);
    } else {
        cpu_set_t cpuset;
        CPU_ZERO(&cpuset);
        CPU_SET(core, &cpuset);

        /* pthread_* functions return the error number; they do not set errno. */
        int rc = pthread_setaffinity_np(h, sizeof(cpuset), &cpuset);
        if (rc != 0) {
            fprintf(stderr, "pthread_setaffinity_np failed: %s\n", strerror(rc));
        }
    }

    /* sched_yield() replaces pthread_yield(), which glibc has deprecated. */
    sched_yield();
    printf("cpu: %d\n", sched_getcpu());
}
/*
 * Read CLOCK_MONOTONIC and return it as a single nanosecond count.
 * If the clock cannot be read, the error is reported via perror() and
 * 0 is returned.
 */
uint64_t get_ns()
{
    const long long ns_per_sec = 1000ll * 1000ll * 1000ll;
    struct timespec now = { 0, 0 };

    if (clock_gettime(CLOCK_MONOTONIC, &now) != -1) {
        return now.tv_sec * ns_per_sec + now.tv_nsec;
    }

    perror("clock_gettime");
    return 0;
}
/*
 * Parse a decimal CPU index from `text` into `*core_out`.
 * Returns 0 on success, -1 if the text is not a whole number in
 * [0, CPU_SETSIZE). strtol saturates on overflow, so the upper-bound
 * check also rejects values that do not fit in a long.
 */
static int parse_core(const char *text, int *core_out)
{
    char *end = NULL;
    long value = strtol(text, &end, 10);

    if (end == text || *end != '\0' || value < 0 || value >= CPU_SETSIZE) {
        return -1;
    }
    *core_out = (int)value;
    return 0;
}

/*
 * Usage: a.out <alloc-core> <access-core>
 *
 * Allocates and fills an N-byte buffer while pinned to <alloc-core>, then
 * pins to <access-core> and sweeps the buffer repeatedly for DT nanoseconds.
 * Prints the number of completed sweeps; returns 1 on bad arguments or
 * allocation failure.
 */
int main(int argc, char *argv[])
{
    int core1 = 0;
    int core2 = 0;

    if (argc < 3 || parse_core(argv[1], &core1) != 0 ||
        parse_core(argv[2], &core2) != 0) {
        fprintf(stderr, "usage: %s <alloc-core> <access-core>\n",
                argc > 0 ? argv[0] : "a.out");
        return 1;
    }

    /*
     * Touch the buffer while running on core1.
     * NOTE(review): this presumably relies on the kernel placing pages on the
     * NUMA node of the CPU that first writes them - confirm the active memory
     * policy (e.g. with numactl --show).
     */
    select_core(pthread_self(), core1);
    uint8_t *p = (uint8_t *)malloc(N);
    if (p == NULL) {
        perror("malloc");
        return 1;
    }
    memset(p, 0x01, N);

    select_core(pthread_self(), core2);

    /*
     * NOTE(review): the working set is only N bytes (1 MiB). That likely fits
     * in the CPU caches, in which case the sweeps below rarely reach DRAM and
     * local vs. remote placement would not change the count - TODO confirm
     * against the target's cache sizes and retry with a much larger buffer.
     */
    uint64_t n = 0;
    uint64_t dummy = 0;
    const uint64_t start_ts = get_ns();
    do {
        n++;
        for (size_t i = 1; i < (size_t)N; i++) {
            if (p[i]) {
                /* Only the low byte of the counter is stored. */
                p[i - 1] = (uint8_t)dummy++;
            }
        }
    } while (get_ns() - start_ts <= DT);

    /* uint64_t is not guaranteed to be long; print via unsigned long long. */
    printf("%llu\n", (unsigned long long)n);

    free(p);
    return 0;
}