dangi12012 wrote: ↑Mon Jan 23, 2023 8:15 pm
So since you have AVX512, you may want to take a closer look at this one - the native code is in the comments; I couldn't verify it. But your 13900K has the AVX512 + GFNI extensions needed to run it.
 
I couldn't figure out the correct steps to get AVX512 instructions working; however, we only need the 256-bit GFNI versions for Queens, which seem to work:
Code: Select all
Verify Engines...OK!
13th Gen Intel(R) Core(TM) i9-13900K
Million Lookups/s Random Squares, Random Occupation/s:
Name                               Performance [MQueens/s]       Tablesize           Dependencies             Template  Author                                       Reference
SBAMG o^(o-3cbn)                   332.169816                    576     [4kb]       countl_zero, bswap       yes       Syed Fahad                                   http://www.talkchess.com/forum3/viewtopic.php?t=59845
SBAMG Inline                       215.952793                    0       [0kb]       countl_zero, bswap       yes       Syed Fahad and Daniel Inführ                 http://www.talkchess.com/forum3/viewtopic.php?t=59845
GaloisField - AVX512               773.161004                    0       [0kb]       AVX512F_GFNI             no        Daniel Inführ (dangi12012)                   http://www.talkchess.com/forum3/viewtopic.php?f=7&t=81335
Hyperbola Quintessence o^(o-2r)    335.816510                    256     [2kb]       bswap                    no        Ryan Mack                                    https://www.chessprogramming.org/Hyperbola_Quintessence
Hyperbola Quintessence Inline      107.513876                    0       [0kb]       bswap                    yes       Ryan Mack                                    https://www.chessprogramming.org/Hyperbola_Quintessence
Genetic 8 Ray                      58.763729                     0       [0kb]       bswap                    no        Daniel Inführ (dangi12012)                   Abstract C++ Syntax Tree Sifter (c) Daniel Infuehr
Bitrotation                        52.830621                     0       [0kb]       ReverseBits              no        Daniel Inführ (dangi12012)                   http://www.talkchess.com/forum3/viewtopic.php?f=7&t=79078&start=20
Binary Neural Network              44.667767                     5852    [45kb]      pdep_u64, AVX2           no        Daniel Inführ (dangi12012)                   http://www.talkchess.com/forum3/viewtopic.php?f=7&t=79332
Exploding Bitboards                75.366895                     768     [6kb]       imul64                   no        Harald Lüßen                                  http://www.open-aurec.com/wbforum/viewtopic.php?f=4&t=4523&start=80
Reference (Switch Lookup)          57.220132                     0       [0kb]       none                     yes       Daniel Inführ (dangi12012)                   http://www.talkchess.com/forum3/viewtopic.php?f=7&t=78235&p=907362&hilit=espresso#p907362
AVX Branchless Shift               247.297757                    0       [0kb]       AVX2                     no        Daniel Inführ (dangi12012)                   http://www.talkchess.com/forum3/viewtopic.php?f=7&t=79005&start=60
Pext Emulated                      86.823393                     107904  [843kb]     none                     no        Zach Wegner                                  https://randombit.net/bitbashing/posts/haswell_bit_permutations.html
Dumb7 Fill                         90.561322                     0       [0kb]       none                     no        Gunnar Andersson                             https://www.chessprogramming.org/Dumb7Fill
Kogge-Stone                        143.259463                    0       [0kb]       none                     no        Peter M. Kogge, Harold S. Stone              https://www.chessprogramming.org/Kogge-Stone_Algorithm
Rotated Bitboards                  48.882147                     1848    [14kb]      none                     no        Robert Hyatt                                 https://www.chessprogramming.org/Rotated_Bitboards
QBBEngine                          245.401282                    0       [0kb]       countr_zero, countl_zero yes       Fabio Gobbato                                https://www.chessprogramming.org/QBBEngine
QBBEngine - Shifted Mask           248.537255                    0       [0kb]       countr_zero, countl_zero no        Fabio Gobbato                                http://www.talkchess.com/forum3/viewtopic.php?f=7&t=79005&start=90#p924623
Classical Bob-Mike                 317.352015                    1024    [8kb]       countr_zero, countl_zero yes       Robert Hyatt and Michael Sherwin             https://www.chessprogramming.org/Classical_Approach
Advanced Bob-Mike                  360.594259                    520     [4kb]       countr_zero, countl_zero no        Michael Sherwin and Daniel Inführ            http://www.talkchess.com/forum3/viewtopic.php?f=7&t=79078&start=50#p924653
Leorik                             317.960811                    128     [1kb]       countl_zero              no        Thomas Jahn (lithander)                      https://github.com/lithander/MinimalChessEngine
Leorik Inline                      125.383071                    0       [0kb]       countl_zero              no        Thomas Jahn (lithander)                      https://github.com/lithander/MinimalChessEngine
Obstruction Difference             348.109620                    768     [6kb]       countl_zero              no        Michael Hoffmann                             http://www.talkchess.com/forum3/viewtopic.php?t=29087
Obstruction Difference Inline      111.217179                    0       [0kb]       countl_zero              yes       Michael Hoffmann                             http://www.talkchess.com/forum3/viewtopic.php?t=29087
Genetic Obstruction Difference     335.078534                    384     [3kb]       countl_zero              no        Daniel Inführ and Michael Hoffmann           http://www.talkchess.com/forum3/viewtopic.php?f=7&t=79701
Genetic Obstruction Difference V2  388.473977                    768     [6kb]       countl_zero              no        Daniel Inführ                                http://www.talkchess.com/forum3/viewtopic.php?f=7&t=79701
Slide Arithmetic                   317.034250                    256     [2kb]       bzhi_u64, blsmsk_u64     no        Jakob Progsch and Daniel Inführ              http://www.talkchess.com/forum3/viewtopic.php?f=7&t=78693&p=914767&hilit=SlideArithm#p914767
Slide Arithmetic Inline            120.568965                    0       [0kb]       bzhi_u64, blsmsk_u64     no        Jakob Progsch and Daniel Inführ              http://www.talkchess.com/forum3/viewtopic.php?f=7&t=78693&p=914767&hilit=SlideArithm#p914767
Kindergarten                       644.520235                    16640   [130kb]     imul64                   no        Urban Koistinen                              https://www.chessprogramming.org/Kindergarten_Bitboards
SISSY Bitboards                    460.233876                    180416  [1409kb]    none                     no        Michael Sherwin                              http://www.talkchess.com/forum3/viewtopic.php?f=7&t=73083
Fancy Magic BB - Variable shift    669.429925                    93376   [729kb]     imul64                   yes       Pradu Kannan                                 https://www.chessprogramming.org/Magic_Bitboards#Fancy
FoldingHash - 4x fancy magic       313.984887                    6468    [50kb]      none                     no        Daniel Inführ                                tbd
Plain Magic BB                     729.709515                    295168  [2306kb]    imul64                   no        Lasse Hansen                                 https://www.chessprogramming.org/Magic_Bitboards#Plain
Black Magic BB - Fixed shift       911.632430                    88891   [694kb]     imul64                   no        Onno Garms and Volker Annuss                 https://www.chessprogramming.org/Magic_Bitboards#Fixed_shift_Fancy
Pext constexpr                     1392.418282                   107904  [843kb]     pext_u64                 yes       Zach Wegner                                  https://www.chessprogramming.org/BMI2#PEXTBitboards
HyperCube                          64.780195                     107680  [841kb]     none                     yes       Daniel Inführ (dangi12012)                   http://www.talkchess.com/forum3/viewtopic.php?f=7&t=79004&p=916723&hilit=hypercube#p916723
Here is the updated code:
Code: Select all
namespace Chess_Lookup::GaloisField
{
	constexpr auto Size = 0;
	template<uint64_t bb>
	constexpr uint64_t mask_shift(int ranks) {
		return ranks > 0 ? bb >> (ranks << 3) : bb << -(ranks << 3);
	}
#	define dir_HO(X) (0xFFull << (X & 56))
#	define dir_VE(X) (0x0101010101010101ull << (X & 7))
#	define dir_D1(X) (mask_shift<0x8040201008040201ull>((X & 7) - (X >> 3)))
#	define dir_D2(X) (mask_shift<0x0102040810204080ull>(7 - (X & 7) - (X >> 3)))
	static __m256i* boardMask = new __m256i[64];
	static void InitMask() {
		for (int square = 0; square < 64; ++square) {
			boardMask[square] = _mm256_set_epi64x(dir_HO(square) ^ (1ull << square), dir_VE(square) ^ (1ull << square), dir_D1(square) ^ (1ull << square), dir_D2(square) ^ (1ull << square));
		}
	}
	//Reverses bits in all 64 bytes at once 
	static __m256i bit_reverse(__m256i input) {
		
		__m256i b = _mm256_gf2p8affine_epi64_epi8(input, _mm256_set1_epi64x(0x8040201008040201), 0x00);
		const __m256i shuffle_mask = _mm256_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31, 16, 17, 18, 19, 20, 21, 22, 23);
		return _mm256_shuffle_epi8(b, shuffle_mask);
	}
	//This can solve 8 rays, so all moves of two queens at once or 4 (rooks, bishops)
	static __m256i attack8(uint64_t occ, int square, __m256i mask) {
		__m256i o = _mm256_and_epi32(_mm256_set1_epi64x(occ), mask);
		__m256i sq = _mm256_set1_epi64x((1ull << square));
		__m256i sqRev = _mm256_set1_epi64x((0x8000000000000000ull >> square));
		return _mm256_and_epi32(_mm256_xor_epi32(_mm256_sub_epi64(o, sq), bit_reverse(_mm256_sub_epi64(bit_reverse(o), sqRev))), mask);
	}
	static uint64_t Queen(int sq, uint64_t occ) {
		__m256i result = attack8(occ, sq, _mm256_loadu_si256(boardMask + sq));
		__m256i result2 = _mm256_or_epi32(result, _mm256_permute4x64_epi64(result, 0x4E));
		return _mm256_or_epi32(result2, _mm256_permute4x64_epi64(result2, 0x4D)).m256i_u64[0];
	}
#undef dir_HO
#undef dir_VE
#undef dir_D1
#undef dir_D2
}
and corresponding assembly:
Code: Select all
00007FF7D8BF7C60  movzx       edx,byte ptr [r8]  
00007FF7D8BF7C64  movsxd      rax,edx  
00007FF7D8BF7C67  shl         rax,5  
00007FF7D8BF7C6B  vmovdqu     ymm4,ymmword ptr [rax+r13]  
00007FF7D8BF7C71  vpand       ymm3,ymm7,ymm4  
00007FF7D8BF7C75  vgf2p8affineqb ymm0,ymm3,ymm8,0  
00007FF7D8BF7C7B  vpshufb     ymm1,ymm0,ymm9  
00007FF7D8BF7C80  shrx        rcx,r14,rdx  
00007FF7D8BF7C85  vmovq       xmm0,rcx  
00007FF7D8BF7C8A  vpbroadcastq ymm0,xmm0  
00007FF7D8BF7C8F  vpsubq      ymm0,ymm1,ymm0  
00007FF7D8BF7C93  vgf2p8affineqb ymm1,ymm0,ymm8,0  
00007FF7D8BF7C99  vpshufb     ymm2,ymm1,ymm9  
00007FF7D8BF7C9E  shlx        rax,r12,rdx  
00007FF7D8BF7CA3  vmovq       xmm0,rax  
00007FF7D8BF7CA8  vpbroadcastq ymm0,xmm0  
00007FF7D8BF7CAD  vpsubq      ymm0,ymm3,ymm0  
00007FF7D8BF7CB1  vpxor       ymm1,ymm0,ymm2  
00007FF7D8BF7CB5  vpand       ymm2,ymm1,ymm4  
00007FF7D8BF7CB9  vpermq      ymm0,ymm2,4Eh  
00007FF7D8BF7CBF  vpor        ymm3,ymm2,ymm0  
00007FF7D8BF7CC3  vpermq      ymm1,ymm3,4Dh  
00007FF7D8BF7CC9  vpor        ymm0,ymm3,ymm1  
00007FF7D8BF7CCD  vmovq       rax,xmm0  
00007FF7D8BF7CD2  xor         rdi,rax