Some further (final?) remarks on those didactical 64*64-lookup replacement-routines to get 1. the kind of ray (if any) of two squares (0..63) and/or 2. the inbetween bitsets. Again they are intended to initialize the lookup arrays or to replace them in high memory traffic situations and/or to relax cache pollution. The routines are based on building boolean {-1,0}-masks of nibble-differences for ...
Code: Select all
fileDifference == 0 -> -1 if same file, else < 15
rankDifference == 0 -> -1 if same rank, else < 15
fileDifference ^ rankDifference == 0 -> -1 if same diagonal, else < 15
fileDifference + rankDifference == 0 -> -1 if same antidiagonal, else < 15.
... and to perform further ~nibble-0-ands to combine (add,xor,or) them to a final result.
There are four independent 
and,dec,and chains - interesting to see what assembly the compiler produces - a kind of benchmark. VC2005 is sensible to minor source code changes and really acts like a high level assembler 
This seems shortest for 32-bit mode - thanks to the one-byte dec-instruction, which is not available in 64-bit mode. To get the dense 5 (instead of 7) as index for same squares, the strange "+^|" -sequence is used, instead of three times "or". 
Code: Select all
// return: 1 if same rank
//         2 if same file
//         3 if same diagonal
//         4 if same antidiagonal
//         5 if same square
//         0 otherwise
u32 onSameRay(u32 sq1, u32 sq2) {
    u32 rankDiff,  fileDiff, 
        antiDiff,  diaxDiff,  rayindex;
    rankDiff  = ((sq2 | 7) -  sq1) >>3;
    fileDiff  = (sq2 &  7) - (sq1 & 7);
    antiDiff  = rankDiff   +  fileDiff;
    // differences as signed nibbles
    rankDiff  = rankDiff   &        15;
    fileDiff  = fileDiff   &        15;
    antiDiff  = antiDiff   &        15;
    diaxDiff  = rankDiff   ^  fileDiff;
    rayindex  = antiDiff-1 &    4 * 16;
    rayindex += diaxDiff-1 &    3 * 16;
    rayindex ^= fileDiff-1 &    2 * 16;
    rayindex |= rankDiff-1 &    1 * 16;
    return rayindex >> 4;
}
Applying De Morgan results in slightly better scheduling as well as shorter code in 64-bit mode, despite the additional not-instruction ...
Code: Select all
// return: 1 if same rank
//         2 if same file
//         3 if same diagonal
//         4 if same antidiagonal
//         7 if same square
//         0 otherwise
u32 onSameRay(u32 sq1, u32 sq2) {
    u32 rankDiff, fileDiff, 
        antiDiff, diaxDiff, rayindex;
    rankDiff  = ((sq2| 7) - sq1) >>3;
    fileDiff  = (sq2 & 7) -(sq1 & 7);
    antiDiff  =  rankDiff + fileDiff;
    rankDiff  =  rankDiff &       15;
    fileDiff  =  fileDiff &       15;
    antiDiff  =  antiDiff &       15;
    diaxDiff  =  rankDiff ^ fileDiff;
    rayindex  = -antiDiff |      ~64;
    rayindex &= -diaxDiff |      ~48;
    rayindex &= -fileDiff |      ~32;
    rayindex &= -rankDiff |      ~16;
    return ~rayindex >> 4;
}
The onSameRay-routine might be used to lookup an 8-bitboard array - to calculate the final inbetween-bitboard by shift and mask. Two additional shifts, xors safe calculating min and max of the two squares ...
Code: Select all
u64 inBetween(u32 sq1, u32 sq2)
{
    const u64 o = 1;
    static const u64 rays[8] =
    {
        0x0000000000000000,
        0x000000000000007e, // 1.rank, inner six
        0x0001010101010100, // a-file, inner six
        0x0040201008040200, // a1h8-diagonal, inner six
        0x0000040810204080, // h1a8-antidiagonal, inner six, considering h1 == 7
        0x0000000000000000, 
        0x0000000000000000,
        0x0000000000000000,
    };
    u64 ray, btw;
    ray = rays[onSameRay(sq1, sq2)];
    ray = (ray<<sq1)   ^ (ray<<sq2);
    btw = ((o<<sq1)-o) ^ ((o<<sq2)-o);
    return ray & btw;
}
how ((1<<sq1)-1) ^ ((1<<sq2)-1) works with b3,f7 to get all bits "between" them including b3:
Code: Select all
 0 0 0 0 0 0 0 0    0 0 0 0 0 0 0 0    0 0 0 0 0 0 0 0    0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0    1 1 1 1 1 0 0 0    0 0 0 0 0 0 0 0    1 1 1 1 1 0 0 0
 0 0 0 0 0 0 0 0    1 1 1 1 1 1 1 1    0 0 0 0 0 0 0 0    1 1 1 1 1 1 1 1
 0 0 0 0 0 0 0 0    1 1 1 1 1 1 1 1    0 0 0 0 0 0 0 0    1 1 1 1 1 1 1 1
 0 0 0 0 0 0 0 0    1 1 1 1 1 1 1 1    0 0 0 0 0 0 0 0    1 1 1 1 1 1 1 1
 0 1 0 0 0 0 0 0    1 1 1 1 1 1 1 1    1 0 0 0 0 0 0 0    0 1 1 1 1 1 1 1
 0 0 0 0 0 0 0 0    1 1 1 1 1 1 1 1    1 1 1 1 1 1 1 1    0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0    1 1 1 1 1 1 1 1    1 1 1 1 1 1 1 1    0 0 0 0 0 0 0 0
... but applying min/max seems to result in shorter and faster code (vc2005 needs to load four times cl prior to the variable shifts), since it also schedules with the independent code of the inlined onSameRay-routine:
Code: Select all
u64 inBetween(u32 sq1, u32 sq2)
{
    const u64 o = 1;
    static const u64 rays[8] =
    {
        0x0000000000000000,
        0x000000000000007e, // 1.rank, inner six
        0x0001010101010100, // a-file, inner six
        0x0040201008040200, // a1h8-diagonal, inner six
        0x0000040810204080, // h1a8-antidiagonal, inner six, considering h1 == 7
        0x0000000000000000,
        0x0000000000000000,
        0x0000000000000000,
    };
    u64 ray, btw;
    ray = rays[onSameRay(sq1, sq2)];
    int delta = sq2 - sq1;
    sq1  += delta & (delta >> 31); // min
    sq2  -= delta & (delta >> 31); // max
    btw  = (o << sq2) - o;
    return (ray << sq1) & btw;
}
Direct calculation with immediate 64-bit ray-masks - without any lookups - results in a few more code-bytes (the 10-byte opcodes for the immediate 64-bit loads). It seems better to calculate min/max in advance here. A dependency during startup safes one &15 nibblemask, since rankDiff is always positive. Trying to calculate min/max independently from the ray-Differences to gain more parallelism seems to confuse the optimizer of vc2005 
Code: Select all
u64 inBetween(u32 sq1, u32 sq2)
{   // ensure sq2 >= sq1
    int delta = sq2 - sq1;
    sq1  += delta & (delta >> 31); // min
    sq2  -= delta & (delta >> 31); // max
    // differences as signed nibbles (4 bit)
    //  rankDiff always positive (0..7) sq2>=sq1
    //  fileDiff, antiDiff may negative and require & 15
    u32 rankDiff, fileDiff, antiDiff, diaxDiff;	
    rankDiff  =((sq2 | 7) -  sq1) >>3,
    fileDiff  = (sq2 & 7) - (sq1 & 7);
    antiDiff  = (rankDiff + fileDiff);
    fileDiff &= 15,    antiDiff &= 15;
    diaxDiff  = (rankDiff ^ fileDiff);
    // if difference == 0 -> -1-mask
    // otherwise -> 0..14 ->  0-mask
    const u64 o = 1, m1 = -1;	u64 between; 
    between   = 2 * ((rankDiff  - 1) >> 26);
    between  |= (m1 + fileDiff) & 0x0001010101010100,
    between  |= (m1 + diaxDiff) & 0x8040201008040200,
    between  |= (m1 + antiDiff) & 0x0002040810204080;
    between   = (m1 + (o<<sq2)) & ( between << sq1 );
    return between;
}
One may consider the vertical flip of the 7-bit diagonal/antidiagonal masks, using bswap aka _byteswap_uint64 instead of a third immediate load. The one square "longer" rays get masked off anyway...
Some assembly by vc2005:
32-bit mode - hmm, OOE may help a lot.
Code: Select all
?onSameRay@@YAIII@Z PROC
; _sq1$ = edx
; _sq2$ = eax
  00000	8b c8		 mov	 ecx, eax
  00002	83 e0 07	 and	 eax, 7
  00005	83 c9 07	 or	 ecx, 7
  00008	2b ca		 sub	 ecx, edx
  0000a	83 e2 07	 and	 edx, 7
  0000d	2b c2		 sub	 eax, edx
  0000f	c1 e9 03	 shr	 ecx, 3
  00012	56		 push	 esi
  00013	8b f0		 mov	 esi, eax
  00015	8d 14 0e	 lea	 edx, DWORD PTR [esi+ecx]
  00018	83 e6 0f	 and	 esi, 15	
  0001b	83 e1 0f	 and	 ecx, 15	
  0001e	8b c6		 mov	 eax, esi
  00020	33 c1		 xor	 eax, ecx
  00022	48		 dec	 eax
  00023	83 e0 30	 and	 eax, 48	
  00026	83 e2 0f	 and	 edx, 15	
  00029	4a		 dec	 edx
  0002a	83 e2 40	 and	 edx, 64	
  0002d	03 c2		 add	 eax, edx
  0002f	4e		 dec	 esi
  00030	83 e6 20	 and	 esi, 32	
  00033	33 c6		 xor	 eax, esi
  00035	49		 dec	 ecx
  00036	83 e1 10	 and	 ecx, 16	
  00039	0b c1		 or	 eax, ecx
  0003b	c1 e8 04	 shr	 eax, 4
  0003e	5e		 pop	 esi
  0003f	c3		 ret	 0
64-bit mode: longer opcode due to rex-prefix using r8 and one byte increment opcodes are reserved for prefixes and not available - thus neg is one byte shorter opcode than it's one's complement counterpart sub 1.  
Code: Select all
sq1$ = 8
sq2$ = 16
?onSameRay@@YAIII@Z PROC
  00000	44 8b c2	 mov	 r8d, edx
  00003	83 e2 07	 and	 edx, 7
  00006	41 83 c8 07	 or	 r8d, 7
  0000a	44 2b c1	 sub	 r8d, ecx
  0000d	83 e1 07	 and	 ecx, 7
  00010	2b d1		 sub	 edx, ecx
  00012	41 c1 e8 03	 shr	 r8d, 3
  00016	42 8d 0c 02	 lea	 ecx, DWORD PTR [rdx+r8]
  0001a	83 e2 0f	 and	 edx, 15
  0001d	41 83 e0 0f	 and	 r8d, 15
  00021	8b c2		 mov	 eax, edx
  00023	83 e1 0f	 and	 ecx, 15
  00026	f7 da		 neg	 edx
  00028	41 33 c0	 xor	 eax, r8d
  0002b	83 ca 10	 or	 edx, 16
  0002e	41 f7 d8	 neg	 r8d
  00031	f7 d8		 neg	 eax
  00033	41 83 c8 ef	 or	 r8d, -17
  00037	f7 d9		 neg	 ecx
  00039	23 c2		 and	 eax, edx
  0003b	83 c9 bf	 or	 ecx, -65
  0003e	41 23 c0	 and	 eax, r8d
  00041	83 c8 cf	 or	 eax, -49
  00044	23 c1		 and	 eax, ecx
  00046	f7 d0		 not	 eax
  00048	c1 e8 04	 shr	 eax, 4
  0004b	c3		 ret	 0
 
Code: Select all
sq1$ = 8
sq2$ = 16
?inBetween@@YA_KII@Z PROC
  00000	8b c1		 mov	 eax, ecx
  00002	44 8b c2	 mov	 r8d, edx
  00005	44 8b d1	 mov	 r10d, ecx
  00008	83 e0 07	 and	 eax, 7
  0000b	44 8b ca	 mov	 r9d, edx
  0000e	44 8b da	 mov	 r11d, edx
  00011	41 83 e0 07	 and	 r8d, 7
  00015	41 83 c9 07	 or	 r9d, 7
  00019	44 2b c0	 sub	 r8d, eax
  0001c	44 2b c9	 sub	 r9d, ecx
  0001f	8b c2		 mov	 eax, edx
  00021	41 2b c2	 sub	 eax, r10d
  00024	41 c1 e9 03	 shr	 r9d, 3
  00028	8b d0		 mov	 edx, eax
  0002a	43 8d 0c 08	 lea	 ecx, DWORD PTR [r8+r9]
  0002e	41 83 e0 0f	 and	 r8d, 15
  00032	83 e1 0f	 and	 ecx, 15
  00035	c1 fa 1f	 sar	 edx, 31
  00038	41 83 e1 0f	 and	 r9d, 15
  0003c	23 d0		 and	 edx, eax
  0003e	41 8b c0	 mov	 eax, r8d
  00041	f7 d9		 neg	 ecx
  00043	41 33 c1	 xor	 eax, r9d
  00046	83 c9 bf	 or	 ecx, -65
  00049	41 f7 d8	 neg	 r8d
  0004c	f7 d8		 neg	 eax
  0004e	41 83 c8 10	 or	 r8d, 16
  00052	41 f7 d9	 neg	 r9d
  00055	41 23 c0	 and	 eax, r8d
  00058	41 83 c9 ef	 or	 r9d, -17
  0005c	4c 8d 05 00 00
	00 00		 lea	 r8, OFFSET FLAT:?rays@?1??inBetween
  00063	41 23 c1	 and	 eax, r9d
  00066	83 c8 cf	 or	 eax, -49
  00069	23 c1		 and	 eax, ecx
  0006b	42 8d 0c 12	 lea	 ecx, DWORD PTR [rdx+r10]
  0006f	f7 d0		 not	 eax
  00071	48 c1 e8 04	 shr	 rax, 4
  00075	49 8b 04 c0	 mov	 rax, QWORD PTR [r8+rax*8]
  00079	48 d3 e0	 shl	 rax, cl
  0007c	41 8b cb	 mov	 ecx, r11d
  0007f	2b ca		 sub	 ecx, edx
  00081	ba 01 00 00 00	 mov	 edx, 1
  00086	48 d3 e2	 shl	 rdx, cl
  00089	48 83 ea 01	 sub	 rdx, 1
  0008d	48 23 c2	 and	 rax, rdx
  00090	c3		 ret	 0
 
Code: Select all
sq1$ = 8
sq2$ = 16
?inBetween@@YA_KII@Z PROC
  00000	8b c2		 mov	 eax, edx
  00002	49 bb 80 40 20
	10 08 04 02 00	 mov	 r11, 0002040810204080H
  0000c	2b c1		 sub	 eax, ecx
  0000e	44 8b c0	 mov	 r8d, eax
  00011	41 c1 f8 1f	 sar	 r8d, 31
  00015	44 23 c0	 and	 r8d, eax
  00018	41 2b d0	 sub	 edx, r8d
  0001b	45 8d 14 08	 lea	 r10d, DWORD PTR [r8+rcx]
  0001f	8b ca		 mov	 ecx, edx
  00021	44 8b c2	 mov	 r8d, edx
  00024	83 e1 07	 and	 ecx, 7
  00027	41 83 c8 07	 or	 r8d, 7
  0002b	41 8b c2	 mov	 eax, r10d
  0002e	83 e0 07	 and	 eax, 7
  00031	45 2b c2	 sub	 r8d, r10d
  00034	2b c8		 sub	 ecx, eax
  00036	41 c1 e8 03	 shr	 r8d, 3
  0003a	42 8d 04 01	 lea	 eax, DWORD PTR [rcx+r8]
  0003e	83 e1 0f	 and	 ecx, 15
  00041	45 8b c8	 mov	 r9d, r8d
  00044	83 e0 0f	 and	 eax, 15
  00047	4c 33 c9	 xor	 r9, rcx
  0004a	48 83 c1 ff	 add	 rcx, -1
  0004e	48 83 e8 01	 sub	 rax, 1
  00052	49 83 e9 01	 sub	 r9, 1
  00056	49 23 c3	 and	 rax, r11
  00059	4d 23 cb	 and	 r9, r11
  0005c	49 0f c9	 bswap	 r9
  0005f	4c 0b c8	 or	 r9, rax
  00062	41 8d 40 ff	 lea	 eax, DWORD PTR [r8-1]
  00066	c1 e8 1a	 shr	 eax, 26
  00069	03 c0		 add	 eax, eax
  0006b	4c 0b c8	 or	 r9, rax
  0006e	48 b8 00 01 01
	01 01 01 01 00	 mov	 rax, 0001010101010100H
  00078	48 23 c1	 and	 rax, rcx
  0007b	41 8b ca	 mov	 ecx, r10d
  0007e	49 0b c1	 or	 rax, r9
  00081	48 d3 e0	 shl	 rax, cl
  00084	8b ca		 mov	 ecx, edx
  00086	ba 01 00 00 00	 mov	 edx, 1
  0008b	48 d3 e2	 shl	 rdx, cl
  0008e	48 83 ea 01	 sub	 rdx, 1
  00092	48 23 c2	 and	 rax, rdx
  00095	c3		 ret	 0
Cheers,
Gerd