Gerd Isenberg wrote:
I am too short in time now - what is saw yesterday was quite strange - the vc2005 initilalized some variables on the stack, but failed to load it to the destination registers before the bitscans!? I will have a closer look later...
Gerd
It does not work, if you look to the assembly:
The vc2005 compiler initializes the same local variable on the stack with 0,7,56,63 but fails to initialize the destination registers of the bitscans. As a result of this unspecified usage of bsf/bsr the complete function is serialized!
Code: Select all
typedef struct
{
u64 bits[4];
unsigned short index[64];
} SQUARE;
u64 rookAttacks(u64 occ, SQUARE *sq) {
unsigned long n = 0, e = 7, s = 56, w = 63;
_BitScanForward64(&n, occ & sq->bits[0]);
_BitScanForward64(&e, occ & sq->bits[1]);
_BitScanReverse64(&s, occ & sq->bits[2]);
_BitScanReverse64(&w, occ & sq->bits[3]);
u32 idx = sq->index[n] | sq->index[e] | sq->index[s] | sq->index[w];
return rookLookup[idx];
}
_TEXT SEGMENT
w$ = 8
s$ = 8
e$ = 8
n$ = 8
occ$ = 8
sq$ = 16
?rookAttacks@@YA_K_KPEAUSQUARE@@@Z PROC
00000 48 8b 02 mov rax, QWORD PTR [rdx]
00003 4c 8b d2 mov r10, rdx
00006 c7 44 24 08 00
00 00 00 mov DWORD PTR n$[rsp], 0
0000e 48 23 c1 and rax, rcx
00011 c7 44 24 08 07
00 00 00 mov DWORD PTR e$[rsp], 7
00019 c7 44 24 08 38
00 00 00 mov DWORD PTR s$[rsp], 56
00021 4c 0f bc c8 bsf r9, rax
00025 48 8b 42 08 mov rax, QWORD PTR [rdx+8]
00029 c7 44 24 08 3f
00 00 00 mov DWORD PTR w$[rsp], 63
00031 48 23 c1 and rax, rcx
00034 4c 0f bc c0 bsf r8, rax
00038 48 8b 42 10 mov rax, QWORD PTR [rdx+16]
0003c 48 23 c1 and rax, rcx
0003f 48 0f bd d0 bsr rdx, rax
00043 49 8b 42 18 mov rax, QWORD PTR [r10+24]
00047 41 0f b7 54 52
20 movzx edx, WORD PTR [r10+rdx*2+32]
0004d 48 23 c1 and rax, rcx
00050 48 0f bd c8 bsr rcx, rax
00054 41 0f b7 44 4a
20 movzx eax, WORD PTR [r10+rcx*2+32]
0005a 48 8d 0d 00 00
00 00 lea rcx, OFFSET FLAT:?rookLookup
00061 48 0b c2 or rax, rdx
00064 43 0f b7 54 42
20 movzx edx, WORD PTR [r10+r8*2+32]
0006a 48 0b c2 or rax, rdx
0006d 43 0f b7 54 4a
20 movzx edx, WORD PTR [r10+r9*2+32]
00073 48 0b c2 or rax, rdx
00076 48 8b 04 c1 mov rax, QWORD PTR [rcx+rax*8]
0007a c3 ret 0
?rookAttacks@@YA_K_KPEAUSQUARE@@@Z ENDP
So i fear you have to do something like this to ensure that at least one bit is found by the bitscans.
Code: Select all
typedef struct
{
u64 inner[4];
u64 outer[4];
unsigned short index[64];
} SQUARE;
u64 rookAttacks(u64 occ, SQUARE *sq) {
u64 n = (occ & sq->inner[0]) | sq->outer[0];
u64 e = (occ & sq->inner[1]) | sq->outer[1];
u64 s = (occ & sq->inner[2]) | sq->outer[2];
u64 w = (occ & sq->inner[3]) | sq->outer[3];
_BitScanForward64((unsigned long*)&n, n);
_BitScanForward64((unsigned long*)&e, e);
_BitScanReverse64((unsigned long*)&s, s);
_BitScanReverse64((unsigned long*)&w, w);
u32 idx = sq->index[n] | sq->index[e] | sq->index[s] | sq->index[w];
return rookLookup[idx];
}
_TEXT SEGMENT
occ$ = 8
sq$ = 16
00000 4c 8b 42 10 mov r8, QWORD PTR [rdx+16]
00004 48 8b 42 18 mov rax, QWORD PTR [rdx+24]
00008 4c 8b 0a mov r9, QWORD PTR [rdx]
0000b 4c 8b 52 08 mov r10, QWORD PTR [rdx+8]
0000f 48 23 c1 and rax, rcx
00012 4c 23 c9 and r9, rcx
00015 48 0b 42 38 or rax, QWORD PTR [rdx+56]
00019 4c 0b 4a 20 or r9, QWORD PTR [rdx+32]
0001d 4c 23 c1 and r8, rcx
00020 4c 0b 42 30 or r8, QWORD PTR [rdx+48]
00024 4c 23 d1 and r10, rcx
00027 48 0f bd c0 bsr rax, rax
0002b 4c 0b 52 28 or r10, QWORD PTR [rdx+40]
0002f 4c 8b da mov r11, rdx
00032 49 0f bd c8 bsr rcx, r8
00036 41 0f b7 44 43
40 movzx eax, WORD PTR [r11+rax*2+64]
0003c 41 0f b7 4c 4b
40 movzx ecx, WORD PTR [r11+rcx*2+64]
00042 49 0f bc d2 bsf rdx, r10
00046 48 0b c1 or rax, rcx
00049 41 0f b7 4c 53
40 movzx ecx, WORD PTR [r11+rdx*2+64]
0004f 4d 0f bc c9 bsf r9, r9
00053 48 0b c1 or rax, rcx
00056 43 0f b7 4c 4b
40 movzx ecx, WORD PTR [r11+r9*2+64]
0005c 48 0b c1 or rax, rcx
0005f 48 8d 0d 00 00
00 00 lea rcx, OFFSET FLAT:?rookLookup@@3PA_KA ; rookLookup
00066 48 8b 04 c1 mov rax, QWORD PTR [rcx+rax*8]
0006a c3 ret 0
?rookAttacks@@YA_K_KPEAUSQUARE@@@Z ENDP ; rookAttacks
Of cousre instead of sq->outer[0..3] you may use immediate constants 0xFF00000000000000, 0x8080808080808080, 0x00000000000000FF, 0x0101010101010101, but that blows up the source code, while you can put it in the same cacheline as the inner bits for the potential blockers...
Now you need somebody to count cycles on a core 2 duo.
I better don't make any predictions
Gerd