Code: Select all
#include <emmintrin.h>
#define XMM_ALIGN __declspec(align(16))
#define C64(constantU64) constantU64##ULL
int popCount4max15(const Bitboard* bb) {
static const Bitboard XMM_ALIGN masks[4] = {
C64(0x5555555555555555),
C64(0x5555555555555555),
C64(0x3333333333333333),
C64(0x3333333333333333)
};
const __m128i* pM = (const __m128i*) masks;
const __m128i* pb = (const __m128i*) &bb;
__m128i v = pb[0];
__m128i w = pb[1];
v = _mm_sub_epi16(v, _mm_and_si128(_mm_srli_epi16(v, 1), pM[0]));
w = _mm_sub_epi16(w, _mm_and_si128(_mm_srli_epi16(w, 1), pM[0]));
v = _mm_add_epi16(_mm_and_si128(_mm_srli_epi16(v, 2), pM[1]), _mm_and_si128(v, pM[1]));
w = _mm_add_epi16(_mm_and_si128(_mm_srli_epi16(w, 2), pM[1]), _mm_and_si128(w, pM[1]));
v = _mm_sad_epu8 (v, _mm_setzero_si128 ()); // sum bytes 15..8:7..0
w = _mm_sad_epu8 (w, _mm_setzero_si128 ()); // sum bytes 15..8:7..0
return _mm_cvtsi128_si32(v)
+ (_mm_extract_epi16(v, 4) << 8)
+ (_mm_cvtsi128_si32(w) << 16)
+ (_mm_extract_epi16(w, 4) << 24);
}
void test()
{
Bitboard bb[4];
bb[0] = C64(0x0);
bb[1] = C64(0x1);
bb[2] = C64(0x3);
bb[3] = C64(0x7);
int res = popCount4max15(bb);
std::cout << res << std::endl;
}
But results are not correct. Also the produced assembly I am not sure is so much cheaper then the standard version:
Code: Select all
int popCount4max15(const Bitboard* bb) {
004100D0 push ebx
004100D1 mov ebx,esp
004100D3 sub esp,8
004100D6 and esp,0FFFFFFF0h
004100D9 add esp,4
004100DC push ebp
004100DD mov ebp,dword ptr [ebx+4]
004100E0 mov dword ptr [esp+4],ebp
004100E4 mov ebp,esp
004100E6 sub esp,150h
004100EC mov eax,dword ptr [___security_cookie (48E6DCh)]
004100F1 xor eax,ebp
004100F3 mov dword ptr [ebp-4],eax
static const Bitboard XMM_ALIGN masks[4] = {
C64(0x5555555555555555),
C64(0x5555555555555555),
C64(0x3333333333333333),
C64(0x3333333333333333)
};
const __m128i* pM = (const __m128i*) masks;
004100F6 mov dword ptr [ebp-8],offset masks (47B7E0h)
const __m128i* pb = (const __m128i*) &bb;
004100FD lea eax,[ebx+8]
00410100 mov dword ptr [ebp-0Ch],eax
__m128i v = pb[0];
00410103 mov ecx,dword ptr [ebp-0Ch]
00410106 movdqa xmm0,xmmword ptr [ecx]
0041010A movdqa xmmword ptr [ebp-20h],xmm0
__m128i w = pb[1];
0041010F mov edx,dword ptr [ebp-0Ch]
00410112 movdqa xmm0,xmmword ptr [edx+10h]
00410117 movdqa xmmword ptr [ebp-30h],xmm0
v = _mm_sub_epi16(v, _mm_and_si128(_mm_srli_epi16(v, 1), pM[0]));
0041011C movdqa xmm0,xmmword ptr [ebp-20h]
00410121 psrlw xmm0,1
00410126 movdqa xmmword ptr [ebp-40h],xmm0
0041012B mov eax,dword ptr [ebp-8]
0041012E movdqa xmm0,xmmword ptr [eax]
00410132 movdqa xmm1,xmmword ptr [ebp-40h]
00410137 pand xmm1,xmm0
0041013B movdqa xmmword ptr [ebp-50h],xmm1
00410140 movdqa xmm0,xmmword ptr [ebp-50h]
00410145 movdqa xmm1,xmmword ptr [ebp-20h]
0041014A psubw xmm1,xmm0
0041014E movdqa xmmword ptr [ebp-60h],xmm1
00410153 movdqa xmm0,xmmword ptr [ebp-60h]
00410158 movdqa xmmword ptr [ebp-20h],xmm0
w = _mm_sub_epi16(w, _mm_and_si128(_mm_srli_epi16(w, 1), pM[0]));
0041015D movdqa xmm0,xmmword ptr [ebp-30h]
00410162 psrlw xmm0,1
00410167 movdqa xmmword ptr [ebp-70h],xmm0
0041016C mov ecx,dword ptr [ebp-8]
0041016F movdqa xmm0,xmmword ptr [ecx]
00410173 movdqa xmm1,xmmword ptr [ebp-70h]
00410178 pand xmm1,xmm0
0041017C movdqa xmmword ptr [ebp-80h],xmm1
00410181 movdqa xmm0,xmmword ptr [ebp-80h]
00410186 movdqa xmm1,xmmword ptr [ebp-30h]
0041018B psubw xmm1,xmm0
0041018F movdqa xmmword ptr [ebp-90h],xmm1
00410197 movdqa xmm0,xmmword ptr [ebp-90h]
0041019F movdqa xmmword ptr [ebp-30h],xmm0
v = _mm_add_epi16(_mm_and_si128(_mm_srli_epi16(v, 2), pM[1]), _mm_and_si128(v, pM[1]));
004101A4 mov edx,dword ptr [ebp-8]
004101A7 movdqa xmm0,xmmword ptr [edx+10h]
004101AC movdqa xmm1,xmmword ptr [ebp-20h]
004101B1 pand xmm1,xmm0
004101B5 movdqa xmmword ptr [ebp-0A0h],xmm1
004101BD movdqa xmm0,xmmword ptr [ebp-20h]
004101C2 psrlw xmm0,2
004101C7 movdqa xmmword ptr [ebp-0B0h],xmm0
004101CF mov eax,dword ptr [ebp-8]
004101D2 movdqa xmm0,xmmword ptr [eax+10h]
004101D7 movdqa xmm1,xmmword ptr [ebp-0B0h]
004101DF pand xmm1,xmm0
004101E3 movdqa xmmword ptr [ebp-0C0h],xmm1
004101EB movdqa xmm0,xmmword ptr [ebp-0A0h]
004101F3 movdqa xmm1,xmmword ptr [ebp-0C0h]
004101FB paddw xmm1,xmm0
004101FF movdqa xmmword ptr [ebp-0D0h],xmm1
00410207 movdqa xmm0,xmmword ptr [ebp-0D0h]
0041020F movdqa xmmword ptr [ebp-20h],xmm0
w = _mm_add_epi16(_mm_and_si128(_mm_srli_epi16(w, 2), pM[1]), _mm_and_si128(w, pM[1]));
00410214 mov ecx,dword ptr [ebp-8]
00410217 movdqa xmm0,xmmword ptr [ecx+10h]
0041021C movdqa xmm1,xmmword ptr [ebp-30h]
00410221 pand xmm1,xmm0
00410225 movdqa xmmword ptr [ebp-0E0h],xmm1
0041022D movdqa xmm0,xmmword ptr [ebp-30h]
00410232 psrlw xmm0,2
00410237 movdqa xmmword ptr [ebp-0F0h],xmm0
0041023F mov edx,dword ptr [ebp-8]
00410242 movdqa xmm0,xmmword ptr [edx+10h]
00410247 movdqa xmm1,xmmword ptr [ebp-0F0h]
0041024F pand xmm1,xmm0
00410253 movdqa xmmword ptr [ebp-100h],xmm1
0041025B movdqa xmm0,xmmword ptr [ebp-0E0h]
00410263 movdqa xmm1,xmmword ptr [ebp-100h]
0041026B paddw xmm1,xmm0
0041026F movdqa xmmword ptr [ebp-110h],xmm1
00410277 movdqa xmm0,xmmword ptr [ebp-110h]
0041027F movdqa xmmword ptr [ebp-30h],xmm0
v = _mm_sad_epu8 (v, _mm_setzero_si128 ()); // sum bytes 15..8:7..0
00410284 pxor xmm0,xmm0
00410288 movdqa xmmword ptr [ebp-120h],xmm0
00410290 movdqa xmm0,xmmword ptr [ebp-120h]
00410298 movdqa xmm1,xmmword ptr [ebp-20h]
0041029D psadbw xmm1,xmm0
004102A1 movdqa xmmword ptr [ebp-130h],xmm1
004102A9 movdqa xmm0,xmmword ptr [ebp-130h]
004102B1 movdqa xmmword ptr [ebp-20h],xmm0
w = _mm_sad_epu8 (w, _mm_setzero_si128 ()); // sum bytes 15..8:7..0
004102B6 pxor xmm0,xmm0
004102BA movdqa xmmword ptr [ebp-140h],xmm0
004102C2 movdqa xmm0,xmmword ptr [ebp-140h]
004102CA movdqa xmm1,xmmword ptr [ebp-30h]
004102CF psadbw xmm1,xmm0
004102D3 movdqa xmmword ptr [ebp-150h],xmm1
004102DB movdqa xmm0,xmmword ptr [ebp-150h]
004102E3 movdqa xmmword ptr [ebp-30h],xmm0
return _mm_cvtsi128_si32(v)
+ (_mm_extract_epi16(v, 4) << 8)
+ (_mm_cvtsi128_si32(w) << 16)
+ (_mm_extract_epi16(w, 4) << 24);
004102E8 movdqa xmm0,xmmword ptr [ebp-20h]
004102ED movd eax,xmm0
004102F1 movdqa xmm0,xmmword ptr [ebp-20h]
004102F6 pextrw ecx,xmm0,4
004102FB shl ecx,8
004102FE add eax,ecx
00410300 movdqa xmm0,xmmword ptr [ebp-30h]
00410305 movd edx,xmm0
00410309 shl edx,10h
0041030C add eax,edx
0041030E movdqa xmm0,xmmword ptr [ebp-30h]
00410313 pextrw ecx,xmm0,4
00410318 shl ecx,18h
0041031B add eax,ecx
}
0041031D mov ecx,dword ptr [ebp-4]
00410320 xor ecx,ebp
00410322 call __security_check_cookie (460D30h)
00410327 mov esp,ebp
00410329 pop ebp
0041032A mov esp,ebx
0041032C pop ebx
0041032D ret