Page 1 of 4

Back To The Beginning

Posted: Sat Sep 07, 2019 8:29 am
by Michael Sherwin
The first move generator I ever wrote was in a combination of C and 32 bit assembler. It was a GNUChess 4 style generator with improvements. I had to learn C and 32 bit assembler as it was written. Besides being in assembler one of the improvements was that the next square and next direction tables are compressed to save memory which was important at the time. It is now working in my current project.

Here is the initialization code. Don't expect much in the way of style as it was the first C code I ever wrote. And it works! Also I'll show both some C code and assembler code for how to use it.

Code: Select all

u08 bns[1808]; // bishop next square
u08 bnd[1808]; // bishop next direction
u08 rns[3704];
u08 rnd[3704];
u08 qns[3872];
u08 qnd[3872];
u08 nns[890];
u08 kns[809];
u08 wpns[360];
u08 bpns[360];

u32 bof[64]; // bishop offsets
u32 rof[64];
u32 qof[64];
u32 nof[64];
u32 kof[64];
u32 pof[64];

void InitMovGen() {
  s08 lbrd[120] =
  { -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
    -1, 0, 1, 2, 3, 4, 5, 6, 7,-1,
    -1, 8, 9,10,11,12,13,14,15,-1,
    -1,16,17,18,19,20,21,22,23,-1,
    -1,24,25,26,27,28,29,30,31,-1,
    -1,32,33,34,35,36,37,38,39,-1,
    -1,40,41,42,43,44,45,46,47,-1,
    -1,48,49,50,51,52,53,54,55,-1,
    -1,56,57,58,59,60,61,62,63,-1,
    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1 };

  s08 pdir[7][8] =
  { {9,11,-9,-11,0,0,0,0},
   {1,10,-1,-10,0,0,0,0},
   {9,11,1,10,-9,-11,-1,-10},
   {8,12,19,21,-8,-12,-19,-21},
   {9,11,1,10,-9,-11,-1,-10},
   {10,9,11,0,0,0,0,0},
   {-10,-9,-11,0,0,0,0,0} };

  u08 maxs[7] =
  { 6,6,6,0,0,0,0 };

  u08 maxdir[7] =
  { 3,3,7,7,7,2,2 };

  s08 min = 64, max = 0;
  s08 ptyp;
  s08 plfs, lds, lts;
  s08 fs, ts, nds, pfs;
  s08 dir, ndir, ppdir, s;
  u08 * sbns = bns, * sbnd = bnd;
  u32 * pop = bof;
  s32 offset = 0;

  for (ptyp = 0; ptyp < 6; ptyp++)
  {
    switch (ptyp)
    {
    case 0:
      pop = bof;
      break;
    case 1:
      pop = rof;
      break;
    case 2:
      pop = qof;
      break;
    case 3:
      pop = nof;
      break;
    case 4:
      pop = kof;
      break;
    case 5:
      pop = pof;
      offset = 8;
    }
    for (plfs = 21; plfs < 99; plfs++)
      if ((pfs = fs = lbrd[plfs]) >= 0)
      {
        if ((ptyp > 0 && ptyp < 5) || pfs % 2 == 0) min = max = fs;
        for (dir = maxdir[ptyp]; dir >= 0; dir--)
          if ((ts = lbrd[(lts = plfs + pdir[ptyp][dir])]) >= 0)
          {
            for (s = maxs[ptyp]; s >= 0; s--)
            {
              if (ts < min)min = ts;
              if (ts > max)max = ts;
              if ((ts = lbrd[(lts = lts + pdir[ptyp][dir])]) < 0 || s == 0)
                break;
            }
          }
        if (ptyp != 0 && ptyp < 5)
        {
          if (ptyp < 3)
          {
            *(pop + pfs) = (offset - min);
            offset = offset + max - min + 1;
          }
          else
          {
            *(pop + pfs) = (offset - pfs);
            if (ptyp == 3)offset = offset + 13;
            else offset = offset + 11;
          }
        }
        else
        {
          if (pfs % 2 != 0)
          {
            *(pop + pfs) = (offset - min);
            *(pop + pfs - 1) = (offset - min);
            offset = offset + max - min + 2;
          }
        }
      }
    offset = 0;
  }

  for (ptyp = 0; ptyp < 7; ptyp++)
    for (plfs = 21; plfs < 99; plfs++)
      if ((pfs = fs = lbrd[plfs]) >= 0)
      {
        switch (ptyp)
        {
        case 0:
          sbns = bns + bof[pfs];
          sbnd = bnd + bof[pfs];
          break;
        case 1:
          sbns = rns + rof[pfs];
          sbnd = rnd + rof[pfs];
          break;
        case 2:
          sbns = qns + qof[pfs];
          sbnd = qnd + qof[pfs];
          break;
        case 3:
          sbns = nns + nof[pfs];
          break;
        case 4:
          sbns = kns + kof[pfs];
          break;
        case 5:
          sbns = wpns + pof[pfs];
          break;
        case 6:
          sbns = bpns + pof[pfs];
        }
        for (dir = maxdir[ptyp]; dir >= 0; dir--)
          if ((ts = lbrd[(lts = plfs + pdir[ptyp][dir])]) >= 0)
          {
            for (ndir = dir - 1; ndir >= 0; ndir--)
              if ((nds = lbrd[(lds = plfs + pdir[ptyp][ndir])]) >= 0)break;
            ppdir = 0;
            for (s = maxs[ptyp]; s >= 0; s--)
            {
              if (ptyp < 5)* (sbns + fs) = ts;
              if (ptyp == 5)
              {
                if (dir == 2 && ndir == 1)
                {
                  *(sbns + fs) = ts;
                  *(sbns + ts) = nds;
                  ppdir = 2;
                }
                if ((dir == 2 && ndir == 0) || (dir == 1))
                {
                  *(sbns + fs) = ts;
                  *(sbns + ts) = 65; if (pfs > 7 && pfs < 16)* (sbns + ts) = 66;
                  if (pfs > 31 && pfs < 40)* (sbns + ts) = 67;
                  if (pfs > 47 && pfs < 56)* (sbns + ts) = 68;
                  ppdir = 0;
                }
                if (dir == 1 && ppdir == 2)
                {
                  *(sbns + ts) = 65; if (pfs > 7 && pfs < 16)* (sbns + ts) = 66;
                  if (pfs > 31 && pfs < 40)* (sbns + ts) = 67;
                  if (pfs > 47 && pfs < 56)* (sbns + ts) = 68;
                }
              }
              if (ptyp == 6)
              {
                if (dir == 2 && ndir == 1)
                {
                  *(sbns + fs) = ts;
                  *(sbns + ts) = nds;
                  ppdir = 2;
                }
                if ((dir == 2 && ndir == 0) || (dir == 1))
                {
                  *(sbns + fs) = ts;
                  *(sbns + ts) = 65; if (pfs < 56 && pfs>47)* (sbns + ts) = 66;
                  if (pfs < 32 && pfs>23)* (sbns + ts) = 67;
                  if (pfs < 16 && pfs> 7)* (sbns + ts) = 68;
                  ppdir = 0;
                }
                if (dir == 1 && ppdir == 2)
                {
                  *(sbns + ts) = 65; if (pfs < 56 && pfs>47)* (sbns + ts) = 66;
                  if (pfs < 32 && pfs>23)* (sbns + ts) = 67;
                  if (pfs < 16 && pfs> 7)* (sbns + ts) = 68;
                }
              }
              if (ndir >= 0 && ptyp < 3)* (sbnd + ts) = nds;
              else if (ptyp < 3)* (sbnd + ts) = 64;
              fs = ts;
              if ((ts = lbrd[(lts = lts + pdir[ptyp][dir])]) < 0 || s == 0)
              {
                if (ndir >= 0 && ptyp < 5)
                {
                  *(sbns + fs) = nds;
                  if (ptyp < 3)* (sbnd + fs) = nds;
                }
                else
                  if (ptyp < 5)
                  {
                    *(sbns + fs) = 64;
                    if (ptyp < 3)* (sbnd + fs) = 64;
                  }
                break;
              }
            }
          }
      }

}

Code: Select all

case BISHOP:
             sbns=bns+bof[fs];
             sbnd=bnd+bof[fs];
             ts=*(sbns+fs);
             while(ts<64)
             {
               switch(wtrgt[brd[ts]])
               {
                 case VACANT:
                        link(MOVE);
                        ts=*(sbns+ts);
                        continue;
                 case FRIEND:             
                        ts=*(sbnd+ts);
                        continue;
                 case ENEMY:
                        link(CAPTURE);
                        ts=*(sbnd+ts);
                        continue;
                 default:
                        return FALSE;
               }                            
             }
             continue;

Code: Select all

.data

; Bishop Jump Table
; wbmf = white bishop move function
; wbmnd = white bishop move next direction
; wbmrm = white bishop move record move
; wbmrc =  white bishop move record capture
; wmcbki = white move captures black king illegally
; wnxtm = white next move
wbmf            dd          0
                dd          wbmnd,wbmnd,wbmnd,wbmnd,wbmnd,wbmnd,wbmnd,wbmnd
                dd          wbmnd,wbmnd,wbmnd,wbmnd,wbmnd,wbmnd,wbmnd,wbmnd
                dd          0,0,0
                dd          wbmrm
                dd          0
                dd          wbmrc,wbmrc,wbmrc,wbmrc,wbmrc,wbmrc,wbmrc,wbmrc
                dd          wbmrc,wbmrc,wbmrc,wbmrc,wbmrc,wbmrc,wbmrc,wmcbki
                dd          wnxtm
            
 .code ;32 bit code
  
  ; white move generator          
 wmg:
            push        ebp ; this gets the first piece in a list of linked pieces and jumps to the piece type                   
            push        edi
            push        esi
            mov         ebp,[ply]
            mov         eax,[first+ebp*4]
            mov         [lis+ebp*4],eax
            mov         edi,[nxt]
            jmp         [ptmf+edi*4]  
          
 ; white bishop move (gen)           
 wbm:  
            mov         ecx,[ps+edi*4] 
            mov         esi,[bol+ecx*4]
            movsx       ebx,[bns+esi+ecx]
            mov         edx,[brd+ebx*4]
            jmp         [wbmf+edx*4]
        
wbmrm:
             mov         [tree.fsq+eax*8],cl
             mov         [tree.tsq+eax*8],bl
             mov         [tree.typ+eax*8],BMOV
             inc         eax
             movsx       ebx,[bns+esi+ebx]
             mov         edx,[brd+ebx*4]
             jmp         [wbmf+edx*4]

wbmrc:
            mov         [tree.fsq+eax*8],cl
            mov         [tree.tsq+eax*8],dl
            mov         [tree.typ+eax*8],BCAP
            inc         eax
wbmnd:
             movsx       ebx,[bnd+esi+ebx]
             mov         edx,[brd+ebx*4]
             jmp         [wbmf+edx*4]                    

Re: Back To The Beginning

Posted: Sat Sep 07, 2019 9:10 am
by Ras
I'd not use assembly because porting that needs separate code paths for each CPU architecture. E.g. how would you make an Android version? It's also not particularly useful because the move generator doesn't take a significant amount of time anyway.

Re: Back To The Beginning

Posted: Sat Sep 07, 2019 9:31 am
by Michael Sherwin
Ras wrote:
Sat Sep 07, 2019 9:10 am
I'd not use assembly because porting that needs separate code paths for each CPU architecture. E.g. how would you make an Android version? It's also not particularly useful because the move generator doesn't take a significant amount of time anyway.
Depends what it is being used for. My new chess engine has no evaluation function and therefore move generation speed becomes more important. Also bitboards lose to this even in generating captures only. Once the bishop code is entered these three instructions check every square on an empty board until it finds a capture or it runs out of squares.

Code: Select all

wbcns: ; white bishop capture next square
            movsx       ebx,[bns+esi+ebx]
            mov         edx,[brd+ebx*4]
            jmp         [wbcf+edx*4]
And not everyone programs for multiple platforms. And maybe someone can use this code for whatever reason. Or maybe no one will care to uses it but might be interested for historic reasons. I don't understand why anyone would put this down just because it was posted? Strange, very strange!

Re: Back To The Beginning

Posted: Sat Sep 07, 2019 10:18 am
by Ras
Michael Sherwin wrote:
Sat Sep 07, 2019 9:31 am
My new chess engine has no evaluation function and therefore move generation speed becomes more important.
Not having an evaluation function? So it just generates moves, searches through trees, and then doesn't evaluate them? Did you actually benchmark how much time the move generation takes?
I don't understand why anyone would put this down just because it was posted?
This is a forum. It is meant for discussion. If you don't want discussion, use a blog without comment function instead.

Re: Back To The Beginning

Posted: Sat Sep 07, 2019 10:21 am
by fabianVDW
AFAIK, movegeneration speed is a good chunk of the runtime in modern engines. Atleast in FabChess, it is about 25-30% of the time used, being on par with the evaluation function (IIRC).

Re: Back To The Beginning

Posted: Sat Sep 07, 2019 10:42 am
by Michael Sherwin
Ras wrote:
Sat Sep 07, 2019 10:18 am
Michael Sherwin wrote:
Sat Sep 07, 2019 9:31 am
My new chess engine has no evaluation function and therefore move generation speed becomes more important.
Not having an evaluation function? So it just generates moves, searches through trees, and then doesn't evaluate them? Did you actually benchmark how much time the move generation takes?
I don't understand why anyone would put this down just because it was posted?
This is a forum. It is meant for discussion. If you don't want discussion, use a blog without comment function instead.
On an i7- 3930k overclocked to 4.4 GHz using only one thread in the opening position the assembler version generates 75 million positions per second. That includes all moves made and unmade and no hash counting. On an i9-9900k overclocked to 5GHz it would do double that.

My new engine has no evaluation function because it is a statistical driven engine. It is basically a material only searcher that accumulates statistics as it searches creating a winning percentage chance for each root move.

Re: Back To The Beginning

Posted: Sat Sep 07, 2019 10:57 am
by Joost Buijs
fabianVDW wrote:
Sat Sep 07, 2019 10:21 am
AFAIK, movegeneration speed is a good chunk of the runtime in modern engines. Atleast in FabChess, it is about 25-30% of the time used, being on par with the evaluation function (IIRC).
25% to 30% of the time used by the move generation is quite a lot. Maybe it depends upon staged move generation or not and whether you execute the hash move with or without move generation, when I profile my engine move generation takes considerably less than 10% of the total time used.

Re: Back To The Beginning

Posted: Sat Sep 07, 2019 11:15 am
by mar
Ras wrote:
Sat Sep 07, 2019 10:18 am
Not having an evaluation function? So it just generates moves, searches through trees, and then doesn't evaluate them? Did you actually benchmark how much time the move generation takes?
Move generator really is essential part of any chess engine. Most of us had to actually debug it ;)
(you don't need evaluation for perft to make sure your move generator works)

You don't start writing a chess program with evaluation function, you start with material only.
Only when everything is working as it should it makes sense to continue working on evaluation.

Re: Back To The Beginning

Posted: Sat Sep 07, 2019 12:12 pm
by zullil
Michael Sherwin wrote:
Sat Sep 07, 2019 10:42 am

On an i7- 3930k overclocked to 4.4 GHz using only one thread in the opening position the assembler version generates 75 million positions per second. That includes all moves made and unmade and no hash counting. On an i9-9900k overclocked to 5GHz it would do double that.
Why would going from 4.4 GHz to 5.0 Ghz double the speed of the move generator? What else does the i9 provide?

Re: Back To The Beginning

Posted: Sat Sep 07, 2019 2:46 pm
by Joost Buijs
I determined the time for a complete move-generation over a large number of random positions, for my engine it is on average 274 cycles on a Broadwell processor, at 4 GHz. this amounts to 68.5 ns. My evaluation function runs at approx. 680 cycles or ~170 ns. In practice though I use staged move generation, when the hash-move generates a beta cut-off (which happens often) I don't have to call the move generator at all, this is the reason that the efficiency of move generation has hardly any influence on the engine overall.