scan-cut slider attack generation

Discussion of chess software programming and technical issues.

Moderators: hgm, Rebel, chrisw

mar
Posts: 2559
Joined: Fri Nov 26, 2010 2:00 pm
Location: Czech Republic
Full name: Martin Sedlak

scan-cut slider attack generation

Post by mar »

Michael inspired me to try something else as well; I'm pretty sure I just reinvented something that already exists, as this is a very obvious thing to try.
Main idea is to mask ray with occupancy and use bitscan to find lsbit/msbit, then use this to build cut mask to remove blocked parts.
Two variants (one with 4 bitscans per slider, another using lsbit isolation trick to build the cut mask).
It seems to work, but I haven't done any rigorous tests. Most likely slower than fancy magics, but it only needs a 4k table
and is super-easy to setup.

Full pseudo-C++ code follows, ulong = uint64_t, ul suffix ditto. bsfl/bsrl are 64-bit bit scans (forward and reverse) corresponding to the x86/x64 BSF/BSR instructions.
The code assumes that A1 maps to bit 0 and H8 to 63, but it should be easy to adapt to other bitboard layouts (adjust shifts, reorder masks that use bsf vs bsr).

Code: Select all

//-----------------------------------------------------------------------------
// Attacks along two opposite ray pairs for one slider (diagonal or
// orthogonal, depending on the mask set passed in).
// mask[0], mask[1]: rays growing toward higher bit indices - the first
// blocker is the lsbit of (occ & mask); (2*lsbit - 1) keeps everything
// up to and including that blocker, and degenerates to all-ones when
// the ray is unblocked (lsbit == 0), keeping the full ray.
// mask[2], mask[3]: rays growing toward lower bit indices - the first
// blocker is the msbit, found with bsr after OR-ing in bit 0 so the
// scan input is never zero.
ulong line_attacks(ulong occ, const ulong mask[4])
{
	ulong res;

/*	// variant 1 (with bsf)
	auto bit0 = bsfl(occ & mask[0] | (1ul << 63));
	res  = mask[0] & ((2ul << bit0)-1);
	auto bit1 = bsfl(occ & mask[1] | (1ul << 63));
	res |= mask[1] & ((2ul << bit1)-1);
*/

	// variant 2 (without bsf)
	// isolate lsbit to build cut mask
	auto lsbit0 = occ & mask[0];
	lsbit0 &= -lsbit0;
	res  = mask[0] & (lsbit0+lsbit0-1);
	auto lsbit1 = occ & mask[1];
	lsbit1 &= -lsbit1;
	res |= mask[1] & (lsbit1+lsbit1-1);

	auto bit2 = bsrl(occ & mask[2] | 1);
	res |= mask[2] & ~((1ul << bit2)-1);
	auto bit3 = bsrl(occ & mask[3] | 1);
	res |= mask[3] & ~((1ul << bit3)-1);
	return res;
}

//-----------------------------------------------------------------------------
// advance every bit one rank toward rank 8 (+8 in the A1 = bit 0 layout)
inline ulong shift_up(ulong bb)
{
	const ulong moved = bb << 8;
	return moved;
}

//-----------------------------------------------------------------------------
// advance every bit one rank toward rank 1 (-8 in the A1 = bit 0 layout)
inline ulong shift_down(ulong bb)
{
	const ulong moved = bb >> 8;
	return moved;
}

//-----------------------------------------------------------------------------
// shift one file toward bit 0; the mask clears bits that wrapped
// across the board edge
inline ulong shift_left(ulong bb)
{
	return (bb >> 1) & 0x7f7f7f7f7f7f7f7ful;
}

//-----------------------------------------------------------------------------
// shift one file toward bit 63; the mask clears bits that wrapped
// across the board edge
inline ulong shift_right(ulong bb)
{
	return (bb << 1) & 0xfefefefefefefefeul;
}

// 4kb table, cache aligned
// masks [0]/[1] hold rays growing toward higher bit indices (cut via
// lsbit isolation in line_attacks), [2]/[3] rays toward lower indices
// (cut via bsr)
struct alignas(64) at_table_t
{
	// diagonal masks
	ulong dmask[4];
	// orthogonal masks
	ulong omask[4];
};	// NOTE: the struct definition must end with a semicolon

// attack table
at_table_t at_table[64];

//-----------------------------------------------------------------------------
// Fill the four diagonal pseudo-attack rays for square sq into at_table.
// Rays 0/1 grow toward higher bit indices (cut with the lsbit trick in
// line_attacks), rays 2/3 toward lower ones (cut with bsr).
void compute_pseudo_diag(int sq)
{
	const ulong origin = 1ul << sq;
	ulong ray[4] = { origin, origin, origin, origin };

	// seven iterations are enough to cross the whole board
	for (int step = 0; step < 7; step++)
	{
		ray[0] |= shift_left(shift_up(ray[0]));
		ray[1] |= shift_right(shift_up(ray[1]));
		ray[2] |= shift_left(shift_down(ray[2]));
		ray[3] |= shift_right(shift_down(ray[3]));
	}

	// the origin square itself is never part of the attack set
	for (int i = 0; i < 4; i++)
		at_table[sq].dmask[i] = ray[i] & ~origin;
}

//-----------------------------------------------------------------------------
// Fill the four orthogonal pseudo-attack rays for square sq into at_table.
// Rays 0/1 grow toward higher bit indices, rays 2/3 toward lower ones.
void compute_pseudo_ortho(int sq)
{
	const ulong origin = 1ul << sq;
	ulong ray[4] = { origin, origin, origin, origin };

	// seven iterations are enough to cross the whole board
	for (int step = 0; step < 7; step++)
	{
		ray[0] |= shift_right(ray[0]);
		ray[1] |= shift_up(ray[1]);
		ray[2] |= shift_left(ray[2]);
		ray[3] |= shift_down(ray[3]);
	}

	// the origin square itself is never part of the attack set
	for (int i = 0; i < 4; i++)
		at_table[sq].omask[i] = ray[i] & ~origin;
}

//-----------------------------------------------------------------------------
// one-time setup: fill the per-square pseudo-attack tables
void compute_tables()
{
	for (int square = 0; square < 64; square++)
	{
		compute_pseudo_diag(square);
		compute_pseudo_ortho(square);
	}
}

//-----------------------------------------------------------------------------
// bishop-style attacks: feed the precomputed diagonal mask set for this
// square through the generic ray routine
inline ulong diag_attacks(int sq, ulong occ)
{
	const ulong *rays = at_table[sq].dmask;
	return line_attacks(occ, rays);
}

//-----------------------------------------------------------------------------
// rook-style attacks: feed the precomputed orthogonal mask set for this
// square through the generic ray routine
inline ulong ortho_attacks(int sq, ulong occ)
{
	const ulong *rays = at_table[sq].omask;
	return line_attacks(occ, rays);
}
Martin Sedlak
mar
Posts: 2559
Joined: Fri Nov 26, 2010 2:00 pm
Location: Czech Republic
Full name: Martin Sedlak

Re: scan-cut slider attack generation

Post by mar »

A more compact version, but not sure if it's more readable than the above due to use of constant tables to define the shifts

Code: Select all

//-----------------------------------------------------------------------------
// Attacks along two opposite ray pairs for one slider (diagonal or
// orthogonal, depending on the mask set passed in).
// mask[0], mask[1]: rays growing toward higher bit indices - the first
// blocker is the lsbit of (occ & mask); (2*lsbit - 1) keeps everything
// up to and including that blocker, and degenerates to all-ones when
// the ray is unblocked (lsbit == 0), keeping the full ray.
// mask[2], mask[3]: rays growing toward lower bit indices - the first
// blocker is the msbit, found with bsr after OR-ing in bit 0 so the
// scan input is never zero.
ulong line_attacks(ulong occ, const ulong mask[4])
{
	ulong res;

/*	// variant 1 (with bsf)
	auto bit0 = bsfl(occ & mask[0] | (1ul << 63));
	res  = mask[0] & ((2ul << bit0)-1);
	auto bit1 = bsfl(occ & mask[1] | (1ul << 63));
	res |= mask[1] & ((2ul << bit1)-1);
*/

	// variant 2 (without bsf)
	// isolate lsbit to build cut mask
	auto lsbit0 = occ & mask[0];
	lsbit0 &= -lsbit0;
	res  = mask[0] & (lsbit0+lsbit0-1);
	auto lsbit1 = occ & mask[1];
	lsbit1 &= -lsbit1;
	res |= mask[1] & (lsbit1+lsbit1-1);

	auto bit2 = bsrl(occ & mask[2] | 1);
	res |= mask[2] & ~((1ul << bit2)-1);
	auto bit3 = bsrl(occ & mask[3] | 1);
	res |= mask[3] & ~((1ul << bit3)-1);
	return res;
}

//-----------------------------------------------------------------------------
// advance every bit one rank toward rank 8 (+8 in the A1 = bit 0 layout)
inline ulong shift_up(ulong bb)
{
	const ulong moved = bb << 8;
	return moved;
}

//-----------------------------------------------------------------------------
// advance every bit one rank toward rank 1 (-8 in the A1 = bit 0 layout)
inline ulong shift_down(ulong bb)
{
	const ulong moved = bb >> 8;
	return moved;
}

//-----------------------------------------------------------------------------
// shift one file toward bit 0; the mask clears bits that wrapped
// across the board edge
inline ulong shift_left(ulong bb)
{
	return (bb >> 1) & 0x7f7f7f7f7f7f7f7ful;
}

//-----------------------------------------------------------------------------
// shift one file toward bit 63; the mask clears bits that wrapped
// across the board edge
inline ulong shift_right(ulong bb)
{
	return (bb << 1) & 0xfefefefefefefefeul;
}

//-----------------------------------------------------------------------------
// generic one-step shift driven by a direction pair; note that positive
// xdir maps to shift_left here, matching the direction tables used in
// compute_tables
inline ulong shift(ulong bb, int xdir, int ydir)
{
	// horizontal component
	if (xdir != 0)
		bb = xdir > 0 ? shift_left(bb) : shift_right(bb);

	// vertical component
	if (ydir != 0)
		bb = ydir > 0 ? shift_up(bb) : shift_down(bb);

	return bb;
}

// 4kb table, cache aligned
// masks [0]/[1] hold rays growing toward higher bit indices (cut via
// lsbit isolation in line_attacks), [2]/[3] rays toward lower indices
// (cut via bsr)
struct alignas(64) at_table_t
{
	// diagonal masks
	ulong dmask[4];
	// orthogonal masks
	ulong omask[4];
};	// NOTE: the struct definition must end with a semicolon

// attack table
at_table_t at_table[64];

//-----------------------------------------------------------------------------
// Build the four pseudo-attack rays for square sq from the direction
// table dir (four x,y pairs) and store them in mask[0..3].
// Rays 0/1 must point toward higher bit indices, rays 2/3 toward lower
// ones, to match the bsf/bsr split in line_attacks.
void compute_pseudo(int sq, const int dir[8], ulong mask[4])
{
	const ulong origin = 1ul << sq;
	ulong ray[4] = { origin, origin, origin, origin };

	// seven steps reach across the whole board from any square
	for (int step = 0; step < 7; step++)
		for (int r = 0; r < 4; r++)
			ray[r] |= shift(ray[r], dir[2*r], dir[2*r+1]);

	// the origin square itself is never part of the attack set
	for (int r = 0; r < 4; r++)
		mask[r] = ray[r] & ~origin;
}

//-----------------------------------------------------------------------------
void compute_tables()
{
	const int diag_dir [8] = { 1, 1, -1, 1, 1, -1, -1, -1};
	const int ortho_dir[8] = {-1, 0,  0, 1, 1,  0,  0, -1};

	// compute pseudo-attack masks
	for (int sq=0; sq<64; sq++)
	{
		auto &table = at_table[sq];
		compute_pseudo(sq, diag_dir,  table.dmask);
		compute_pseudo(sq, ortho_dir, table.omask);
	}
}

//-----------------------------------------------------------------------------
// bishop-style attacks: feed the precomputed diagonal mask set for this
// square through the generic ray routine
inline ulong diag_attacks(int sq, ulong occ)
{
	const ulong *rays = at_table[sq].dmask;
	return line_attacks(occ, rays);
}

//-----------------------------------------------------------------------------
// rook-style attacks: feed the precomputed orthogonal mask set for this
// square through the generic ray routine
inline ulong ortho_attacks(int sq, ulong occ)
{
	const ulong *rays = at_table[sq].omask;
	return line_attacks(occ, rays);
}
Martin Sedlak
Gerd Isenberg
Posts: 2250
Joined: Wed Mar 08, 2006 8:47 pm
Location: Hattingen, Germany

Re: scan-cut slider attack generation

Post by Gerd Isenberg »

Reminds me of Bitfoot's A/B Bitboards...

https://www.chessprogramming.org/Bitfoot#ABBitboards
mar
Posts: 2559
Joined: Fri Nov 26, 2010 2:00 pm
Location: Czech Republic
Full name: Martin Sedlak

Re: scan-cut slider attack generation

Post by mar »

Yes that's basically the same idea :) Bitfoot A/B is it then.

I've just realized that the last masking can be done using >> then << or a table lookup.
It's a pity that there's no trick to isolate MSBit btw.

I'll toy around it a bit to see if it can be improved, currently fighting with clang (or llvm) being unable to optimize 63^__builtin_clzl into bsr (it does bsr but two extra useless xors, gcc on the other hand doesn't vectorize using SSE but at least gets the intrinsic right)
Martin Sedlak
Michael Sherwin
Posts: 3196
Joined: Fri May 26, 2006 3:00 am
Location: WY, USA
Full name: Michael Sherwin

Re: scan-cut slider attack generation

Post by Michael Sherwin »

Michael inspired me to try something else as well
I always knew I was good for something. :D
If you are on a sidewalk and the covid goes beep beep
Just step aside or you might have a bit of heat
Covid covid runs through the town all day
Can the people ever change their ways
Sherwin the covid's after you
Sherwin if it catches you you're through
mar
Posts: 2559
Joined: Fri Nov 26, 2010 2:00 pm
Location: Czech Republic
Full name: Martin Sedlak

Re: scan-cut slider attack generation

Post by mar »

mar wrote: Thu Feb 13, 2020 10:58 pm It's a pity that there's no trick to isolate MSBit btw.
hmm, maybe I could compute a bit-reversed occupancy mask (once per position) and output two attack bitboards, one normal and one bit reversed
then there would be two iterations over two attack mask and the bit reversed one would simply flip the bit index during movegen phase (or simply index directly using lzcnt?)
but it seems a bit complicated with the real cost and complexity moved elsewhere... not sure I like it

@Michael: absolutely, I enjoy your posts here
Martin Sedlak
mar
Posts: 2559
Joined: Fri Nov 26, 2010 2:00 pm
Location: Czech Republic
Full name: Martin Sedlak

Re: scan-cut slider attack generation

Post by mar »

the good is that it works out of the box for both mappings (A1..H8 and A8..H1), the bad is the performance (as expected)
I tried this in cheng and got this (total slowdown, full search):

Code: Select all

9.4% slower vs fancy with gcc, total
7.6% slower vs fancy with msc, total
even if it's less than 10 elo, I've seen chessprogrammers do crazy things for less
this means that it's not terrible but still much slower than fancy magic

so out of the obscure slider attack methods, perhaps only Hyperbola Quintessence is competitive (might be worth trying)

here's the final C++ version with a table lookup:

Code: Select all

#include <stdint.h>

#ifdef _MSC_VER
#	include <intrin.h>
#endif

using ulong = uint64_t;
using slong = int64_t;

//-----------------------------------------------------------------------------
// Index of the most significant set bit (0..63), i.e. x86 BSR.
// Precondition: m != 0 - __builtin_clzll(0) is undefined behavior and
// _BitScanReverse64 leaves index unspecified when m == 0. All callers
// in line_attacks guarantee this by OR-ing a constant bit into m.
inline ulong bsrl(ulong m)
{
#ifdef _MSC_VER
	unsigned long index;
	_BitScanReverse64(&index, m);
	return (ulong)index;
#else
    // 63 ^ clz == 63 - clz for clz in 0..63; xor lets the compiler fold
    // it into a single bsr on x64
    return 63u ^ __builtin_clzll(m);
#endif
}

//-----------------------------------------------------------------------------
// advance every bit one rank toward rank 8 (+8 in the A1 = bit 0 layout)
inline ulong shift_up(ulong bb)
{
	const ulong moved = bb << 8;
	return moved;
}

//-----------------------------------------------------------------------------
// advance every bit one rank toward rank 1 (-8 in the A1 = bit 0 layout)
inline ulong shift_down(ulong bb)
{
	const ulong moved = bb >> 8;
	return moved;
}

//-----------------------------------------------------------------------------
// shift one file toward bit 0; the mask clears bits that wrapped
// across the board edge
inline ulong shift_left(ulong bb)
{
	return (bb >> 1) & 0x7f7f7f7f7f7f7f7full;
}

//-----------------------------------------------------------------------------
// shift one file toward bit 63; the mask clears bits that wrapped
// across the board edge
inline ulong shift_right(ulong bb)
{
	return (bb << 1) & 0xfefefefefefefefeull;
}

//-----------------------------------------------------------------------------
// generic one-step shift driven by a direction pair; note that positive
// xdir maps to shift_left here, matching the direction tables used in
// compute_tables
inline ulong shift(ulong bb, int xdir, int ydir)
{
	// horizontal component
	if (xdir != 0)
		bb = xdir > 0 ? shift_left(bb) : shift_right(bb);

	// vertical component
	if (ydir != 0)
		bb = ydir > 0 ? shift_up(bb) : shift_down(bb);

	return bb;
}

// 4kb table, cache aligned
// masks [0]/[1] hold rays growing toward higher bit indices (cut via
// lsbit isolation in line_attacks), [2]/[3] rays toward lower indices
// (cut via bsr)
struct alignas(64) at_table_t
{
	// diagonal masks
	ulong dmask[4];
	// orthogonal masks
	ulong omask[4];
};

// attack table (4k)
at_table_t at_table[64];
// ~((1ull << sq)-1)
// 0.5k
// cut_upper_mask[i] keeps bit i and everything above it; filled in
// compute_tables and indexed with the bsr result in line_attacks
ulong cut_upper_mask[64];
// => total 4.5k

//-----------------------------------------------------------------------------
// Attacks along two opposite ray pairs for one slider (diagonal or
// orthogonal, depending on the mask set passed in).
// mask[0], mask[1]: rays growing toward higher bit indices - the first
// blocker is the lsbit of (occ & mask); (2*lsbit - 1) keeps everything
// up to and including that blocker, and degenerates to all-ones when
// the ray is unblocked (lsbit == 0), keeping the full ray.
// mask[2], mask[3]: rays growing toward lower bit indices - the first
// blocker is the msbit, found with bsr via the cut_upper_mask lookup.
ulong line_attacks(ulong occ, const ulong mask[4])
{
	ulong res;

	auto m0 = mask[0];
	auto m1 = mask[1];
	auto m2 = mask[2];
	auto m3 = mask[3];

	// isolate lsbit to build cut mask; use unsigned negation (0 - x) -
	// negating after a cast to slong is signed-overflow UB when the
	// blocker sits on bit 63, which upward rays can reach (H8)
	auto lsbit0 = occ & m0;
	lsbit0 &= 0 - lsbit0;
	res  = m0 & (lsbit0+lsbit0-1);

	auto lsbit1 = occ & m1;
	lsbit1 &= 0 - lsbit1;
	res |= m1 & (lsbit1+lsbit1-1);

	// OR in bit 0 so bsrl never sees a zero argument on empty rays
	auto bit2 = bsrl((occ & m2) | 1);
	auto bit3 = bsrl((occ & m3) | 1);
	res |= m2 & cut_upper_mask[bit2];
	res |= m3 & cut_upper_mask[bit3];

	return res;
}

//-----------------------------------------------------------------------------
// Build the four pseudo-attack rays for square sq from the direction
// table dir (four x,y pairs) and store them in mask[0..3].
// Rays 0/1 must point toward higher bit indices, rays 2/3 toward lower
// ones, to match the bsf/bsr split in line_attacks.
void compute_pseudo(int sq, const int dir[8], ulong mask[4])
{
	const ulong origin = 1ull << sq;
	ulong ray[4] = { origin, origin, origin, origin };

	// seven steps reach across the whole board from any square
	for (int step = 0; step < 7; step++)
		for (int r = 0; r < 4; r++)
			ray[r] |= shift(ray[r], dir[2*r], dir[2*r+1]);

	// the origin square itself is never part of the attack set
	for (int r = 0; r < 4; r++)
		mask[r] = ray[r] & ~origin;
}

//-----------------------------------------------------------------------------
void compute_tables()
{
	const int diag_dir[8] = {1, 1, -1, 1, 1, -1, -1, -1};
	const int ortho_dir[8] = {-1, 0, 0, 1, 1, 0, 0, -1};

	// compute pseudo-attack masks
	for (int sq=0; sq<64; sq++)
	{
        cut_upper_mask[sq] = ~((1ull << sq)-1);
		auto &table = at_table[sq];
		compute_pseudo(sq, diag_dir,  table.dmask);
		compute_pseudo(sq, ortho_dir, table.omask);
	}
}

//-----------------------------------------------------------------------------
// bishop-style attacks: feed the precomputed diagonal mask set for this
// square through the generic ray routine
inline ulong diag_attacks(int sq, ulong occ)
{
	const ulong *rays = at_table[sq].dmask;
	return line_attacks(occ, rays);
}

//-----------------------------------------------------------------------------
// rook-style attacks: feed the precomputed orthogonal mask set for this
// square through the generic ray routine
inline ulong ortho_attacks(int sq, ulong occ)
{
	const ulong *rays = at_table[sq].omask;
	return line_attacks(occ, rays);
}

// demo entry point: build the lookup tables once; an engine would do
// this at startup before any attack queries
int main()
{
    compute_tables();

	return 0;
}
Martin Sedlak