There we go Mike:
Code: Select all
#include <stdint.h>

/* Ray bitboards: the squares reachable from a square in one direction. */
extern uint64_t plus1dir[64];
extern uint64_t plus7dir[64];
extern uint64_t plus8dir[64];
extern uint64_t plus9dir[64];
extern uint64_t minus1dir[64];
extern uint64_t minus7dir[64];
extern uint64_t minus8dir[64];
extern uint64_t minus9dir[64];

#define LSB(v) __builtin_ctzll(v)
#define MSB(v) (63 - __builtin_clzll(v))

/* Classical ray attacks: per diagonal, find the first blocker with
 * LSB/MSB, then XOR away the part of the ray hidden behind it.
 * NB: ctzll(0)/clzll(0) is undefined in C; with tzcnt/lzcnt the empty
 * case yields indices 64 / -1, so the tables need guard entries there. */
#define AttacksBishop(square, occ) \
  (plus7dir[square] ^ plus7dir[LSB(plus7dir[square] & (occ))] | \
   plus9dir[square] ^ plus9dir[LSB(plus9dir[square] & (occ))] | \
   minus7dir[square] ^ minus7dir[MSB(minus7dir[square] & (occ))] | \
   minus9dir[square] ^ minus9dir[MSB(minus9dir[square] & (occ))])

uint64_t rookAttack(uint64_t occ, uint64_t sq) {
  return AttacksBishop(sq, occ);
  //return arrAttacks[arrRookBase[sq] + _pext_u64(occ, arrRookMask[sq])];
}
Yields this assembly:
Code: Select all
rookAttack(unsigned long, unsigned long):
mov rdx, QWORD PTR plus7dir[0+rsi*8]
mov r8, QWORD PTR plus9dir[0+rsi*8]
push rbx
mov ecx, 63
mov rax, QWORD PTR minus7dir[0+rsi*8]
mov rsi, QWORD PTR minus9dir[0+rsi*8]
mov r11d, ecx
mov r10, rdx
mov r9, r8
mov rbx, rax
and r10, rdi
and r9, rdi
and rbx, rdi
and rdi, rsi
tzcnt r10, r10
tzcnt r9, r9
lzcnt rbx, rbx
lzcnt rdi, rdi
xor rdx, QWORD PTR plus7dir[0+r10*8]
xor r8, QWORD PTR plus9dir[0+r9*8]
sub r11d, ebx
sub ecx, edi
pop rbx
movsx r11, r11d
movsx rcx, ecx
xor rax, QWORD PTR minus7dir[0+r11*8]
xor rsi, QWORD PTR minus9dir[0+rcx*8]
or rdx, r8
or rax, rsi
or rax, rdx
ret
Versus this, which is the entire PEXT lookup:
Code: Select all
mov rax, QWORD PTR AttackPtr[rip]
pext rdi, rdi, QWORD PTR Mask[rip]
mov rax, QWORD PTR [rax+rdi*8]
ret
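For reference, here is a minimal C sketch of the shape that produces that assembly - the names AttackPtr and Mask are taken straight from the listing and stand for the table and mask of one fixed square; a full engine would index per-square tables as in the commented-out line above:
Code: Select all
#include <stdint.h>
#include <immintrin.h>  /* _pext_u64, needs BMI2 (-mbmi2) */

extern uint64_t *AttackPtr;  /* attack table for one square */
extern uint64_t Mask;        /* relevant-occupancy mask for that square */

uint64_t pextAttack(uint64_t occ) {
    /* one pext, one dependent load - that is the entire chain */
    return AttackPtr[_pext_u64(occ, Mask)];
}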
The performance question can definitely be answered: the ray version does 8 indirect lookups into L1 (four rays for the square, then four dependent loads for the blockers) and forms a hard dependency chain.
PEXT only needs a single lookup into L2 and is done - and as I said earlier, this latency will not lead to a stall if the compiler is smart enough... which it is.
If PEXT is not available, the normal hashing (magic bitboards) will also be faster, since it only replaces the one pext with an imul + 2x shifts.
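In case it helps, a minimal sketch of that fallback, reusing the table names from the commented-out line above; arrRookMagic and arrRookShift are hypothetical per-square magic constants:
Code: Select all
#include <stdint.h>

extern uint64_t arrAttacks[];      /* shared attack table */
extern unsigned arrRookBase[64];   /* per-square offset into arrAttacks */
extern uint64_t arrRookMask[64];   /* relevant-occupancy masks */
extern uint64_t arrRookMagic[64];  /* hypothetical per-square multipliers */
extern unsigned arrRookShift[64];  /* hypothetical per-square shifts */

uint64_t rookAttackMagic(uint64_t occ, int sq) {
    /* the pext becomes and + imul + shift */
    uint64_t idx = ((occ & arrRookMask[sq]) * arrRookMagic[sq]) >> arrRookShift[sq];
    return arrAttacks[arrRookBase[sq] + idx];
}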
Greetings - and thank you for asking
