This source, compiled with VC2005, produces the assembly that comes closest to the hand-written one. Other versions are either more sequential or need more than the seven scratch registers, so they have to save/restore callee-saved registers (rbx) on the stack...
Code: Select all
#pragma intrinsic(_BitScanForward64)
#pragma intrinsic(_BitScanReverse64)
/*
 * Per-square lookup record consumed by rookAttacks().
 *
 * Holds four 64-bit masks that are ANDed with the occupancy, followed by a
 * table indexed by bit number. The N/E/S/W suffixes presumably name the
 * north/east/south/west rays from the square -- confirm against the code
 * that fills this table.
 */
typedef struct
{
u64 bitsN; /* masked occupancy is scanned forward (lowest set bit wins) */
u64 bitsE; /* masked occupancy is scanned forward (lowest set bit wins) */
u64 bitsS; /* masked occupancy is scanned in reverse (highest set bit wins) */
u64 bitsW; /* masked occupancy is scanned in reverse (highest set bit wins) */
unsigned short index[64]; /* partial index per bit number; four entries are OR'ed to index rookLookup */
} SQUARE;
/*
 * Look up the rook attack set for the square described by *sq.
 *
 * occ  occupancy bitboard; bits 0, 7, 56 and 63 are forced on in the local
 *      copy before masking.
 * sq   per-square masks and bit-number-to-index table.
 *
 * Returns rookLookup[idx], where idx is the OR of four sq->index[] entries:
 * the lowest set bit of (occ & bitsN) and (occ & bitsE), and the highest set
 * bit of (occ & bitsS) and (occ & bitsW).
 *
 * The bit-scan intrinsics leave their output undefined when the input is
 * zero, so every masked value must be non-zero. Presumably the forced
 * corner bits serve as sentinels that the masks include -- TODO confirm
 * against the code that initialises the SQUARE tables.
 */
u64 rookAttacks(u64 occ, SQUARE *sq) {
    unsigned long ln, le, ls, lw;

    occ |= 0x8100000000000081;

    _BitScanForward64(&ln, occ & sq->bitsN);
    _BitScanForward64(&le, occ & sq->bitsE);

    /*
     * Scan into dedicated unsigned long outputs. Writing the result through
     * a (unsigned long *) cast of a u64 only stores 32 bits on Win64, which
     * leaves stale mask bits in the upper half whenever the variable lives
     * in memory and turns the index[] access into an out-of-bounds read.
     */
    _BitScanReverse64(&ls, occ & sq->bitsS);
    _BitScanReverse64(&lw, occ & sq->bitsW);

    u32 idx = sq->index[ln] | sq->index[le] | sq->index[ls] | sq->index[lw];
    return rookLookup[idx];
}
; Compiler listing for rookAttacks(u64 occ, SQUARE *sq).
; From the instructions below: rcx carries occ and rdx carries sq.
; SQUARE field offsets used: +0 bitsN, +8 bitsE, +16 bitsS, +24 bitsW,
; +32 index[] (2-byte elements).
occ$ = 8
sq$ = 16
?rookAttacks@@YA_K_KPEAUSQUARE@@@Z PROC
; r10 = sq->bitsS, r8 = sq->bitsW
00000 4c 8b 52 10 mov r10, QWORD PTR [rdx+16]
00004 4c 8b 42 18 mov r8, QWORD PTR [rdx+24]
; occ |= 0x8100000000000081
00008 48 b8 81 00 00
00 00 00 00 81 mov rax, 8100000000000081H
00012 48 0b c8 or rcx, rax
; rax = sq->bitsN; r11 keeps a copy of sq because rdx is overwritten further down
00015 48 8b 02 mov rax, QWORD PTR [rdx]
00018 4c 8b da mov r11, rdx
; mask N, S and W with the occupancy
0001b 48 23 c1 and rax, rcx
0001e 4c 23 d1 and r10, rcx
00021 4c 23 c1 and r8, rcx
; r9 = lowest set bit of the N mask
00024 4c 0f bc c8 bsf r9, rax
; rax = sq->bitsE, masked two instructions later
00028 48 8b 42 08 mov rax, QWORD PTR [rdx+8]
; r8 = highest set bit of the W mask (scanned in place, full 64-bit register)
0002c 4d 0f bd c0 bsr r8, r8
00030 48 23 c1 and rax, rcx
; r10 = highest set bit of the S mask (scanned in place, full 64-bit register)
00033 4d 0f bd d2 bsr r10, r10
; rcx = lowest set bit of the E mask; occ is no longer needed
00037 48 0f bc c8 bsf rcx, rax
; eax = index[E], ecx = index[W], edx = index[N] (this load clobbers sq in rdx)
0003b 0f b7 44 4a 20 movzx eax, WORD PTR [rdx+rcx*2+32]
00040 43 0f b7 4c 43
20 movzx ecx, WORD PTR [r11+r8*2+32]
00046 42 0f b7 54 4a
20 movzx edx, WORD PTR [rdx+r9*2+32]
; OR the partial indices together, then add index[S]
0004c 48 0b c2 or rax, rdx
0004f 48 0b c1 or rax, rcx
00052 43 0f b7 4c 53
20 movzx ecx, WORD PTR [r11+r10*2+32]
00058 48 0b c1 or rax, rcx
; return rookLookup[idx] (8-byte elements)
0005b 48 8d 0d 00 00
00 00 lea rcx, OFFSET FLAT:?rookLookup
00062 48 8b 04 c1 mov rax, QWORD PTR [rcx+rax*8]
00066 c3 ret 0
?rookAttacks@@YA_K_KPEAUSQUARE@@@Z ENDP
; Function compile flags: /Ogtpy
This is the additional overhead of computing the SQUARE pointer (sq*) from a square index:
Code: Select all
/* One SQUARE record for each sq value 0..63. */
SQUARE squares[64];

/*
 * Index-based overload: resolves sq to its record in squares[] and forwards
 * to the pointer-taking rookAttacks(). sq is not range-checked.
 */
u64 rookAttacks(u64 occ, u32 sq)
{
    SQUARE *entry = &squares[sq];
    return rookAttacks(occ, entry);
}
; Compiler listing for rookAttacks(u64 occ, u32 sq).
; There is no call instruction: the pointer-taking overload has been inlined.
; From the instructions below: rcx carries occ and edx carries sq.
occ$ = 8
sq$ = 16
?rookAttacks@@YA_K_KI@Z PROC ; rookAttacks, COMDAT
; Address computation for squares + sq:
; r10 = sq * 5, then << 5, i.e. sq * 160 = sizeof(SQUARE) (4*8 + 64*2 bytes),
; plus the base address of squares.
00000 8b c2 mov eax, edx
00002 4c 8d 14 80 lea r10, QWORD PTR [rax+rax*4]
00006 48 8d 05 00 00
00 00 lea rax, OFFSET FLAT:?squares
0000d 49 c1 e2 05 shl r10, 5
00011 4c 03 d0 add r10, rax
; From here on r10 is the SQUARE pointer; the rest mirrors the pointer overload.
; occ |= 0x8100000000000081 (the or follows the two loads below)
00014 48 b8 81 00 00
00 00 00 00 81 mov rax, 8100000000000081H
; r8 = bitsW, r9 = bitsS
0001e 4d 8b 42 18 mov r8, QWORD PTR [r10+24]
00022 4d 8b 4a 10 mov r9, QWORD PTR [r10+16]
00026 48 0b c8 or rcx, rax
; rax = bitsN
00029 49 8b 02 mov rax, QWORD PTR [r10]
; mask W, S and N with the occupancy
0002c 4c 23 c1 and r8, rcx
0002f 4c 23 c9 and r9, rcx
00032 48 23 c1 and rax, rcx
; r8 = highest set bit of W, r9 = highest set bit of S, rdx = lowest set bit of N
00035 4d 0f bd c0 bsr r8, r8
00039 4d 0f bd c9 bsr r9, r9
0003d 48 0f bc d0 bsf rdx, rax
; rax = bitsE, masked after the index[N] load
00041 49 8b 42 08 mov rax, QWORD PTR [r10+8]
; edx = index[N]
00045 41 0f b7 54 52
20 movzx edx, WORD PTR [r10+rdx*2+32]
0004b 48 23 c1 and rax, rcx
; rcx = lowest set bit of E; occ is no longer needed
0004e 48 0f bc c8 bsf rcx, rax
; eax = index[E], ecx = index[W]
00052 41 0f b7 44 4a
20 movzx eax, WORD PTR [r10+rcx*2+32]
00058 43 0f b7 4c 42
20 movzx ecx, WORD PTR [r10+r8*2+32]
; OR the partial indices together, then add index[S]
0005e 48 0b c2 or rax, rdx
00061 48 0b c1 or rax, rcx
00064 43 0f b7 4c 4a
20 movzx ecx, WORD PTR [r10+r9*2+32]
0006a 48 0b c1 or rax, rcx
; return rookLookup[idx] (8-byte elements)
0006d 48 8d 0d 00 00
00 00 lea rcx, OFFSET FLAT:?rookLookup
00074 48 8b 04 c1 mov rax, QWORD PTR [rcx+rax*8]
00078 c3 ret 0
?rookAttacks@@YA_K_KI@Z ENDP