The mailbox trials

Discussion of chess software programming and technical issues.

Moderator: Ras

User avatar
hgm
Posts: 28353
Joined: Fri Mar 10, 2006 10:06 am
Location: Amsterdam
Full name: H G Muller

Re: The mailbox trials

Post by hgm »

OK, I added null-move pruning. This had the expected effect:

Code: Select all

 1  -16       725 0 e2a6 b4c3 b2c3 e6d5
 2  -16      9497 0 e2a6 b4c3 b2c3 e6d5
 3  -27     12797 0 e2a6 e6d5 g2h3 b4c3 d2c3 d5e4
 4  -27     71479 0 e2a6 e6d5 g2h3 b4c3 d2c3 d5e4
 5  -37    149317 0 e2a6 e6d5 c3d5 b6d5 e4d5 e7e5
 6  -32    399081 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
 7  -32    838855 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
 8  -32   4710223 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
t =  1.380 sec
   4710223 nodes (3.4 Mnps)
   4409258 QS (93.6%)
    905777 evals (20.5%)
    350015 stand-pats (7.9%)
   4134586 move gens
captures: 94.1%
The fraction of non-captures is strongly reduced; only 6% of the real moves is now a non-capture. And far less nodes are needed in total to get the same depth. We also see that QS on the average gets more difficult: there are nearly 15 times as many QS as other nodes now, meaning that each QS must have had on the average at least 15 nodes in its sub-tree. This is because NMP makes the search postpone captures when it is already ahead, preferring to null-move instead, and leaving the resolution of the position to QS.

The complete code I have now is:

Code: Select all

#include <stdio.h>
#include <time.h>
#include <ctype.h>

#define PATH 0 // ply==0 || path[0]==0x1450 && (ply==1 || path[1]==0x3122 && (ply==2 || path[2]==0x1122 && (ply==3 || path[3]==0x5443 && (ply==4))))

#define WHITE 16
#define BLACK 32
#define COLOR (WHITE|BLACK) // also used for edge guards

#define INF 8000

#define CAPTURED 255
#define MARGIN   30         // for futility pruning / lazy eval
#define STMKEY   123456789

#define KIWIPETE "r3k2r/p1ppqpb1/bn2pnp1/3PN3/1p2P3/2N2Q1p/PPPBBPPP/R3K2R w KQkq - 0 1"
#define FIDE     "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1"

#define PST(PIECE, SQR) pstData[offs[PIECE] + SQR]
#define KEY(PIECE, SQR) zobrist[offs[PIECE] + SQR]

int pstData[7*128];
long long int zobrist[7*128];


unsigned char rawBoard[12*16];  // 0x88 board with 2-wide rim of edge guards
#define board (rawBoard + 2*17)

int stm; // side to move (WHITE or BLACK)

int location[48], offs[48], mvv[48]; // piece list

int firstDir[32] = { // offset of move list in steps[] table, per piece
  1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 31, 31, 36, 36, 27, 18, // white pieces
  5, 5, 5, 5, 5, 5, 5, 5, 9, 9, 31, 31, 36, 36, 27, 18, // black pieces
};

signed char steps[] = {
  0,
  16, 15, 17, 0,                         //  1 wP
  -16, -15, -17, 0,                      //  5 bP
  31, -31, 33, -33, 18, -18, 14, -14, 0, //  9 N
  16, -16, 1, -1, 15, -15, 17, -17, 0,   // 18 K
  16, -16, 1, -1, 15, -15, 17, -17, 0,   // 27 Q   31 B
  16, -16, 1, -1, 0,                     // 36 R
};

unsigned int moveStack[10000];
int msp; // move stack pointer

int path[100], ply;
int pv[10000];
int *pvPtr = pv;
int followPV;

int nodeCnt, patCnt, genCnt, evalCnt, qsCnt, captCnt[2]; // for statistics

int pType[] = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 6 };
int pieceVal[] = { 0, 90, 325, 350, 500, 950, 0 };

void Init()
{
  int i, r, f;
  char *p;
  // board with rim
  for(i=0; i<12*16; i++) rawBoard[i] = COLOR;
  for(i=0; i<16; i++) {
    offs[i+WHITE] = 128*pType[i];
    offs[i+BLACK] = 128*pType[i] + 8;
    mvv[i+WHITE] = mvv[i+BLACK] = 16*pType[i] << 24;
  }
  for(r=0; r<8; r++) for(f=0; f<8; f++) {
    int s = 16*r+f;
    int d = 14 - (r-3.5)*(r-3.5) - (f-4)*(f-4);
    board[s] = 0;
    for(i=1; i<6; i++) {
      pstData[128*i+s] =
      pstData[128*i+(s^0x70)+8] = pieceVal[i] + d*(i < 4) + 9*(i == 1)*r;
    }
    pstData[128*6+s] = pstData[128*6+(s^0x70)+8] = -d;
  }
  p = (char*) (zobrist + 128);
  for(i=0; i<6*8*128; i++) *p++ = rand()*rand() >> 6;
}

char *Move2text(int move)
{
  static char buf[10];
  sprintf(buf, "%c%d%c%d", (move >> 8 & 7) + 'a', (move >> 12 & 7) + 1, (move &7) + 'a', (move >> 4 & 7) + 1);
  return buf;
}

#define MOVE(F, T) (256*(F) + (T))

int MoveGen()
{
  int i, first = msp;

  for(i=0; i<16; i++) {
    int piece = stm + i;
    int from = location[piece];
    if(from != CAPTURED) {
      int step, dir = firstDir[piece-16];
      while((step = steps[dir++])) {
        int to = from;
        do {
         int victim = board[to += step];
          if(victim) {                         // target square not empty
            if(victim & stm) break;            // own piece or edge guard
            if(!(piece & 8)) {                 // is Pawn
              if(!(step & 7)) break;           // straight
            }
            if((victim & 15) == 15) return 0;  // captures King
            moveStack[--first] = MOVE(from, to) + mvv[victim] - ((piece & 15) << 24);
            break;
          } else {                             // non-capture
            if(!(piece & 8)) {                 // is Pawn
              if(step & 7) break;              // diagonal
              if(from - 32 & 64 && !board[to + step])
                moveStack[msp++] = MOVE(from, to + step);
            }
            moveStack[msp++] = MOVE(from, to); // generate non-capture
          }
        } while(dir >= 27);
      }
    }
  }
  return first;
}

int Evaluate(int pstEval)
{
  return pstEval;
}

typedef struct {
  long long int hashKey, oldKey; // keys
  int pstEval, oldEval, curEval; // scores
  int alpha, beta;               // scores
  int from, to;                  // squares
  int piece, victim;             // pieces
  int depth;                     // depth
} UndoInfo;

int MakeMove(int move, UndoInfo *u)
{
  // decode the move
  u->to = move & 255;
  u->from = move >> 8 & 255;
  u->piece = board[u->from];
  u->victim = board[u->to];

  // update the incremental evaluation
  u->pstEval = -(u->oldEval + PST(u->piece, u->to) - PST(u->piece, u->from) + PST(u->victim, u->to));
//if(PATH) printf("     eval=%d beta=%d MARGIN=%d\n", u->pstEval, u->beta, MARGIN);
  if(u->depth <= 0 && u->pstEval > u->beta + MARGIN) return -INF-1; // futility (child will stand pat)

  // update hash key, and possibly abort on repetition
  u->hashKey =   u->oldKey  ^ KEY(u->piece, u->to) ^ KEY(u->piece, u->from) ^ KEY(u->victim, u->to);
  // if(REPEAT) return 0;

  // update board and piece list
  board[u->from] = 0;
  board[u->to]   = u->piece;
  location[u->piece]  = u->to;
  location[u->victim] = CAPTURED;

path[ply++] = move & 0xFFFF; captCnt[!u->victim]++;
  return INF+1; // kludge to indicate success
}

void UnMake(UndoInfo *u)
{
  // restore board and piece list
ply--;
  board[u->from] = u->piece;
  board[u->to]   = u->victim;
  location[u->piece]  = u->from;
  location[u->victim] = u->to;
}

int Search(UndoInfo *u) // pass all parameters in a struct
{
  UndoInfo undo;
  int first, curMove, noncapts, alpha = u->alpha;
  int *myPV = pvPtr;

  nodeCnt++;
  *pvPtr++ = 0; // empty PV

  // QS / stand pat
  if(u->depth <= 0) {
    qsCnt++;
    if(u->pstEval > alpha - MARGIN) { // don't bother with full eval if hopeless
      undo.curEval = Evaluate(u->pstEval); evalCnt++;
      if(undo.curEval > alpha) {
        if(undo.curEval >= u->beta) { patCnt++; return u->beta; }
        alpha = undo.curEval;
      }
    }
  } else if(u->pstEval > u->beta - MARGIN) { // null-move pruning
    int score;
    undo.hashKey =  u->hashKey ^ STMKEY;
    undo.pstEval = -u->pstEval;
    undo.alpha   = -u->beta;
    undo.beta    = 1 - u->beta;
    undo.depth   = (u->depth > 3 ? u->depth - 3 : 0);
    stm ^= COLOR;
    score = -Search(&undo);
    stm ^= COLOR;
    if(score >= u->beta) return u->beta;
  }

  // generate moves
  noncapts = msp += 70;          // reserve space for new move list
  first = MoveGen();
  genCnt++;
  if(!first) { alpha = INF; goto abort; }  // King capture detected
  if(followPV >= 0) {                      // first branch
    int i, m = pv[followPV++];             // get the move
    if(m) {                                // move to follow
      m |= 255<<24;                        // assign highest sort priority
      for(i=first; i<msp; i++) {           // run through move list
        if(!(m - moveStack[i] & 0xFFFF)) { // search it amongst generated moves
          moveStack[--first] = m;          // prepend to move list
          moveStack[i] = 0;                // zap PV move in list
          break;
        }
      }
    } else followPV = -1;
  }

  // set child & make-move parameters that are always the same
  undo.oldKey  =  u->hashKey ^ STMKEY;
  undo.oldEval =  u->pstEval;
  undo.depth   =  u->depth - 1;
  undo.alpha   = -u->beta;
  stm ^= COLOR;

  // move loop
  for(curMove = first; curMove < msp; curMove++) {
    int score;
    unsigned int move = moveStack[curMove];
    // move picker
    if(curMove < noncapts) { // still has to be sorted
      int i, j = curMove;
      for(i=curMove+1; i<noncapts; i++) { // extract best capture
        unsigned int m = moveStack[i];
        if(m > move) j = i, move = m;
      }
      moveStack[j] = moveStack[curMove]; moveStack[curMove] = move;
    } else if(u->depth <= 0) { // in QS no non-captures at all
      break;
    }
    if(!move) continue;        // skip zapped moves

    // recursion
    undo.beta = -alpha;
    score = MakeMove(move, &undo); // rejected moves get their score here
    if(score < -INF) break;        // move is futile, and so will be all others
    if(score > INF) { // move successfully made
      score = -Search(&undo);
      UnMake(&undo);
    }
if(PATH) printf("%2d:%d %3d. %08x %s %5d %5d\n", ply, u->depth, curMove, move, Move2text(move), score, alpha), fflush(stdout);
    // minimaxing
    if(score > alpha) {
      int *p;
      if(score >= u->beta) {
        alpha = u->beta;
        break;
      }
      alpha = score;
      p = pvPtr; pvPtr = myPV;    // pop old PV
      *pvPtr++ = move;            // push new PV, starting with this move
      while((*pvPtr++ = *p++)) {} // and append child PV
    }
  }
  stm ^= COLOR;
 abort:
  msp = noncapts - 70; // pop move list
  pvPtr = myPV;        // pop PV (but remains above stack top)
  return alpha;
}

void SearchRoot(UndoInfo *u, int maxDepth)
{
  nodeCnt = patCnt = evalCnt = genCnt = qsCnt = captCnt[0] = captCnt[1] = 0;
  u->alpha = -INF;
  u->beta  = INF;
  followPV = -1;   // nothing to follow at d=1
  for(u->depth=1; u->depth<=maxDepth; u->depth++) { // iterative deepening
    int i, score;
    score = Search(u);
    printf("%2d %4d %9d %d", u->depth, score, nodeCnt, 0);
    for(i=0; pv[i]; i++) printf(" %s", Move2text(pv[i]));
    printf("\n"), fflush(stdout);
    followPV = 0;  // follow this PV on next search
  }
}

int Setup(UndoInfo *u, char *fen)
{
  static char pieces[] = "PPPPPPPPNNBBRRQK";
  int r, f, i;
  for(i=WHITE; i<COLOR; i++) location[i] = CAPTURED;
  for(r=0; r<8; r++) for(f=0; f<8; f++) board[16*r+f] = 0;
  u->pstEval = 0; u->hashKey = 0;

  r = 7; f = 0;
  while(*fen) {
    if(*fen == '/') r--, f = 0;
    else if(*fen > '0' && *fen <= '9') f += *fen - '0'; // empties
    else if(*fen >= 'A') {                              // piece
      int color = (*fen >= 'a' ? BLACK : WHITE);
      for(i=0; i<16; i++) {
        if(location[color+i] == CAPTURED && !(*fen - pieces[i] & 31)) {
          location[color+i] = 16*r + f;
          board[16*r+f] = color + i;
          u->pstEval += PST(color + i, 16*r + f)*(color == WHITE ? 1 : -1);
          u->hashKey ^= KEY(color + i, 16*r + f);
          break;
        }
      }
      f++;
    } else break;
    fen++;
  }
  while(*fen == ' ') fen++;
  if(*fen == 'b') u->pstEval *= -1;
  return (*fen == 'b' ? BLACK : WHITE);
}

UndoInfo root;

int main()
{
  time_t t;
  Init();
  stm = Setup(&root, FIDE);
  stm = Setup(&root, KIWIPETE);
  t = clock();
  SearchRoot(&root, 8);
  t = clock() - t;
  printf("t = %6.3f sec\n", t * (1./CLOCKS_PER_SEC));
  printf("%10d nodes (%3.1f Mnps)\n%10d QS (%3.1f%%)\n", nodeCnt, nodeCnt*1e-6*CLOCKS_PER_SEC/t, qsCnt, qsCnt*100./nodeCnt);
  printf("%10d evals (%3.1f%%)\n%10d stand-pats (%3.1f%%)\n", evalCnt, evalCnt*100./qsCnt, patCnt, patCnt*100./qsCnt);
  printf("%10d move gens\ncaptures: %3.1f%%\n", genCnt, captCnt[0]*100./(captCnt[0] + captCnt[1]));
  return 0;
}
User avatar
hgm
Posts: 28353
Joined: Fri Mar 10, 2006 10:06 am
Location: Amsterdam
Full name: H G Muller

Re: The mailbox trials

Post by hgm »

Using a separate capture generator, which refrains from putting the non-captures in the move list, does give some 10% speedup:

Code: Select all

 1  -16       725 0 e2a6 b4c3 b2c3 e6d5
 2  -16      9497 0 e2a6 b4c3 b2c3 e6d5
 3  -27     12797 0 e2a6 e6d5 g2h3 b4c3 d2c3 d5e4
 4  -27     71479 0 e2a6 e6d5 g2h3 b4c3 d2c3 d5e4
 5  -37    149317 0 e2a6 e6d5 c3d5 b6d5 e4d5 e7e5
 6  -32    399081 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
 7  -32    838855 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
 8  -32   4710223 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
t =  1.230 sec
   4710223 nodes (3.8 Mnps)
   4409258 QS (93.6%)
    905777 evals (20.5%)
    350015 stand-pats (7.9%)
   4134586 move gens
captures: 94.1%
This was done by removing the else-part of the if(victim) clause in the inner loop of the move generator:

Code: Select all

int CaptGen()
{
  int i, first = msp;

  for(i=0; i<16; i++) {
    int piece = stm + i;
    int from = location[piece];
    if(from != CAPTURED) {
      int step, dir = firstDir[piece-16];
      while((step = steps[dir++])) {
        int to = from;
        do {
          int victim = board[to += step];
          if(victim) {                         // target square not empty
            if(victim & stm) break;            // own piece or edge guard
            if(!(piece & 8)) {                 // is Pawn
              if(!(step & 7)) break;           // straight
            }
            if((victim & 15) == 15) return 0;  // captures King
            moveStack[--first] = MOVE(from, to) + mvv[victim] - ((piece & 15) << 24);
            break;
          }
        } while(dir >= 27);
      }
    }
  }
  return first;
}
This CaptGen() is then called instead of MoveGen() in nodes with u->depth <= 0. This still misses the opportunity to also speed up d=1 nodes where the non-captures will be futile (i.e. u->pstEval < u->alpha - MARGIN). But, considering that 94% of the nodes is QS, and the speedup is only 10%, achieving that speedup in part of the remaining 6% of nodes seems hardly worth it.
User avatar
hgm
Posts: 28353
Joined: Fri Mar 10, 2006 10:06 am
Location: Amsterdam
Full name: H G Muller

Re: The mailbox trials

Post by hgm »

This could still be optimized a bit (a speedup of 25% (i.e. 1.25x) instead of 12% by a dedicated capture generator):

Code: Select all

 1  -16       725 0 e2a6 b4c3 b2c3 e6d5
 2  -16      9497 0 e2a6 b4c3 b2c3 e6d5
 3  -27     12797 0 e2a6 e6d5 g2h3 b4c3 d2c3 d5e4
 4  -27     71479 0 e2a6 e6d5 g2h3 b4c3 d2c3 d5e4
 5  -37    149317 0 e2a6 e6d5 c3d5 b6d5 e4d5 e7e5
 6  -32    399081 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
 7  -32    838855 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
 8  -32   4710223 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
t =  1.100 sec
   4710223 nodes (4.3 Mnps)
   4409258 QS (93.6%)
    905777 evals (20.5%)
    350015 stand-pats (7.9%)
   4134586 move gens
captures: 94.1%
This is done by using only the part of the list of move steps for the Pawn that has the diagonal steps. So that the test for whether the piece is a Pawn and the move is straight when it hits an enemy can be omitted:

Code: Select all

int firstDir[32] = { // offset of move list in steps[] table, per piece
  1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 31, 31, 36, 36, 27, 18, // white pieces
  5, 5, 5, 5, 5, 5, 5, 5, 9, 9, 31, 31, 36, 36, 27, 18, // black pieces
};

int firstCapt[32] = { // offset of move list in steps[] table, per piece (captures)
  2, 2, 2, 2, 2, 2, 2, 2, 9, 9, 31, 31, 36, 36, 27, 18, // white pieces
  6, 6, 6, 6, 6, 6, 6, 6, 9, 9, 31, 31, 36, 36, 27, 18, // black pieces
};

signed char steps[] = {
  0,
  16, 15, 17, 0,                         //  1 wP
  -16, -15, -17, 0,                      //  5 bP
  31, -31, 33, -33, 18, -18, 14, -14, 0, //  9 N
  16, -16, 1, -1, 15, -15, 17, -17, 0,   // 18 K
  16, -16, 1, -1, 15, -15, 17, -17, 0,   // 27 Q   31 B
  16, -16, 1, -1, 0,                     // 36 R
};

int CaptGen()
{
  int i, first = msp;

  for(i=0; i<16; i++) {
    int piece = stm + i;
    int from = location[piece];
    if(from != CAPTURED) {
      int step, dir = firstCapt[piece-16];
      while((step = steps[dir++])) {
        int to = from;
        do {
          int victim = board[to += step];
          if(victim) {                         // target square is occupied
            if(victim & stm) break;            // own piece or edge guard
            if((victim & 15) == 15) return 0;  // captures King
            moveStack[--first] = MOVE(from, to) + mvv[victim] - ((piece & 15) << 24);
            break;
          }
        } while(dir >= 27);
      }
    }
  }
  return first;
}
Mike Sherwin
Posts: 965
Joined: Fri Aug 21, 2020 1:25 am
Location: Planet Earth, Sol system
Full name: Michael J Sherwin

Re: The mailbox trials

Post by Mike Sherwin »

MSVS 2019 Community, AMD 3950x 4.2 GHz, ram 3600MHz

x86 compile:

1 -16 725 0 e2a6 b4c3 b2c3 e6d5
2 -16 9497 0 e2a6 b4c3 b2c3 e6d5
3 -27 12797 0 e2a6 e6d5 g2h3 b4c3 d2c3 d5e4
4 -27 71479 0 e2a6 e6d5 g2h3 b4c3 d2c3 d5e4
5 -37 149317 0 e2a6 e6d5 c3d5 b6d5 e4d5 e7e5
6 -32 399081 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
7 -32 838855 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
8 -32 4710223 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
t = 0.938 sec
4710223 nodes (5.0 Mnps)
4409258 QS (93.6%)
905777 evals (20.5%)
350015 stand-pats (7.9%)
4134586 move gens
captures: 94.1%

x64 compile:

1 -16 725 0 e2a6 b4c3 b2c3 e6d5
2 -16 9497 0 e2a6 b4c3 b2c3 e6d5
3 -27 12797 0 e2a6 e6d5 g2h3 b4c3 d2c3 d5e4
4 -27 71479 0 e2a6 e6d5 g2h3 b4c3 d2c3 d5e4
5 -37 149317 0 e2a6 e6d5 c3d5 b6d5 e4d5 e7e5
6 -32 399081 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
7 -32 838855 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
8 -32 4710223 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
t = 0.875 sec
4710223 nodes (5.4 Mnps)
4409258 QS (93.6%)
905777 evals (20.5%)
350015 stand-pats (7.9%)
4134586 move gens
captures: 94.1%
User avatar
hgm
Posts: 28353
Joined: Fri Mar 10, 2006 10:06 am
Location: Amsterdam
Full name: H G Muller

Re: The mailbox trials

Post by hgm »

As generating the captures obviously is the critical task here, I did some further optimizations: the loop over all pieces is partly unrolled, broken in 3 sections: leapers (P & N), sliders (B, R, Q) and King. Each of those can then be optimized for the kind of piece they address. The King part obviously does not need a loop around it, as there is only one King. And it should be always present, so no need to test whether it is CAPTURED. And we don't have to look up where its list of steps starts, we know that.

Furthermore the King, as well as the other leapers, don't need a loop over distance. This also makes the test for the target simpler, as there are only two outcomes: capture the target, or go on with the next direction. For sliders there is a third option, continue in same direction, so that two tests were needed to distinguish them.

In the slider loop we can make the loop over distance unconditional. All together this gives a significant improvement, and the dedicated capture generation has bought us a 47% speed increase, to 5Mnps.

Code: Select all

 1  -16       725 0 e2a6 b4c3 b2c3 e6d5
 2  -16      9497 0 e2a6 b4c3 b2c3 e6d5
 3  -27     12797 0 e2a6 e6d5 g2h3 b4c3 d2c3 d5e4
 4  -27     71479 0 e2a6 e6d5 g2h3 b4c3 d2c3 d5e4
 5  -37    149317 0 e2a6 e6d5 c3d5 b6d5 e4d5 e7e5
 6  -32    399081 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
 7  -32    838855 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
 8  -32   4710223 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
t =  0.940 sec
   4710223 nodes (5.0 Mnps)
   4409258 QS (93.6%)
    905777 evals (20.5%)
    350015 stand-pats (7.9%)
   4134586 move gens
captures: 94.1%
The code for the capture generator is shown below. It uses a not-so-obvious trick for distinguishing foes from anything else (i.e. empty, friend or edge guard): XOR with stm. If the victim is a foe, it then gets both color bits set. OTOH, a friend gets all its color bits cleared, an edge guard (which already had both bits set) becomes an opponent, and an empty square becomes a friend. Only the case with both color bits set is >= COLOR.

Code: Select all

int CaptGen2()
{
  int i, first = msp;

  for(i=0; i<10; i++) {
    int piece = stm + i;
    int from = location[piece];
    if(from != CAPTURED) {
      int step, dir = firstCapt[piece-16];
      while((step = steps[dir++])) {
        int to = from + step;
        int victim = board[to];
        int h = victim ^ stm;          // gets >= COLOR if colr bits complement each other
        if(h >= COLOR) {               // target is foe
          if(h == COLOR+15) return 0;  // captures King
          moveStack[--first] = MOVE(from, to) + mvv[victim] - ((piece & 15) << 24);
        }
      }
    }
  }
  for(i=10; i<15; i++) {
    int piece = stm + i;
    int from = location[piece];
    if(from != CAPTURED) {
      int step, dir = firstCapt[piece-16];
      while((step = steps[dir++])) {
        int to = from;
        do {
          int victim = board[to += step];
          if(victim) {                         // target square is occupied
            if(victim & stm) break;            // own piece or edge guard
            if((victim & 15) == 15) return 0;  // captures King
            moveStack[--first] = MOVE(from, to) + mvv[victim] - ((piece & 15) << 24);
            break;
          }
        } while(dir >= 27);
      }
    }
  }
  if(1) {
    int piece = stm + 15;
    int from = location[piece];
    int step, dir = 18;
    while((step = steps[dir++])) {
      int to = from + step;
      int victim = board[to];
      int h = victim ^ stm;          // gets >= COLOR if colr bits complement each other
      if(h >= COLOR) {               // target is foe
        if(h == COLOR+15) return 0;  // captures King
        moveStack[--first] = MOVE(from, to) + mvv[victim] - (15 << 24);
      }
    }
  }
  return first;
}
Having the sliders separated from the leapers is also good preparation for an implementation where the sliders use a neighbor table to get at their targets, instead of a ray scan.
User avatar
hgm
Posts: 28353
Joined: Fri Mar 10, 2006 10:06 am
Location: Amsterdam
Full name: H G Muller

Re: The mailbox trials

Post by hgm »

OK, the first real algorithmic improvement! I implemented a neighbor table, to avoid the ray scans that were used to generate slider captures. The slider loop in the capture generator has been changed to:

Code: Select all

  for(i=10; i<15; i++) {
    int piece = stm + i;
    int from = location[piece];
    if(from != CAPTURED) {
      int d, dir = firstCapt[piece-16];
      while((d = steps[dir++ + 14])) {      // direction nr (instead of board step) stored 14 further
        int to = neighbor[from].d[d-1];     // the directions were offset by 1 to avoid the 0 sentinel
        int victim = board[to];
        int h = victim ^ stm;
        if(h >= COLOR) {                         // target square is occupied
          if((victim & 15) == 15) return 0; // captures King
          moveStack[--first] = MOVE(from, to) + mvv[victim] - ((piece & 15) << 24);
        }
      }
    }
  }
This required two extra lines in the steps[] table, which are not really steps, but the number of the direction in which the step goes. (One was added, because 0 is a valid direction number, which would clash with the use of 0 as sentinel to terminate the list.) Like this:

Code: Select all

signed char steps[] = {
  0,
  16, 15, 17, 0,                         //  1 wP
  -16, -15, -17, 0,                      //  5 bP
  31, -31, 33, -33, 18, -18, 14, -14, 0, //  9 N
  16, -16, 1, -1, 15, -15, 17, -17, 0,   // 18 K
  16, -16, 1, -1, 15, -15, 17, -17, 0,   // 27 Q   31 B
  16, -16, 1, -1, 0,                     // 36 R
   1, 5, 3, 7, 8, 4, 2, 6, 0,            // Q & B slides in terms of direction number
   1, 5, 3, 7, 0,                        // R slides in terms of direction number
};
Of course the neighbor table needs to be (incrementally) updated. The following routines were added to do that:

Code: Select all

// Neighbor table

typedef union {
  long long int all;
  unsigned char d[8];
} Neighbor;

Neighbor neighbor[256];

int vec[8] = { 16, 17, 1, -15, -16, -17, -1, 15 };

void Evacuate(int sqr)
{
  int d;
  for(d=0; d<4; d++) {
    int up = neighbor[sqr].d[d];
    int dn = neighbor[sqr].d[d+4];
    neighbor[up].d[d+4] = dn;
    neighbor[dn].d[d] = up;
  }
}

void Reoccupy(int sqr)
{
  int d;
  for(d=0; d<4; d++) {
    int up = neighbor[sqr].d[d];
    int dn = neighbor[sqr].d[d+4];
    neighbor[up].d[d+4] = neighbor[dn].d[d] = sqr;
  }
}

void Occupy(int sqr)
{
  int d;
  for(d=0; d<4; d++) {
    int dn, up = sqr, v = vec[d];
    while(!board[up+=v]) {}
    up &= 255; // Yegh!
    dn = neighbor[up].d[d+4];
    neighbor[up].d[d+4] = sqr;
    neighbor[dn].d[d] = sqr;
    neighbor[sqr].d[d+4] = dn;
    neighbor[sqr].d[d] = up;
  }
}
There are two routines for occupying a square. Reoccupy() is used for a square that was evacuated before, and still holds the neighbor data. This information can then be used to quickly get at these neighbors (which now all see the occupied square as a neighbor, instead of each other) for updating. This typically happens during UnMake(), when we put the moved piece back on the from-square.

More troublesome is the case where we land on an empty square in MakeMove() due to a noncapture (Occupy()). In that case we know nothing. The data for that square in the neighbor table will not be valid. And we must even be careful not to alter it, because a move before, which we still want to take back, could have evacuated it. And the UnMake() through Reoccupy() relies on that data still being there. So we have to save and restore. And collect the current neighbor data from scratch. For this we do have to make ray scans over the board. So we haven't got rid of those completely.

But we only have to do 4 ray scans, as much work as during generation of the captures by a Rook: once the scan hits an obstacle (could be an edge guard; the neighbors of these are in the table too), we can use the information there to immediately get to the neighbor in the opposit directon. And only 6% of the moves are non-captures. The routines for updating the neighbor table are now called from MakeMove() and UnMake() like:

Code: Select all

typedef struct {
  long long int hashKey, oldKey; // keys
  Neighbor savedNeighbor;        // 8 neighbors
  int pstEval, oldEval, curEval; // scores
  int alpha, beta;               // scores
  int from, to;                  // squares
  int piece, victim;             // pieces
  int depth;                     // depth
} UndoInfo;

int MakeMove(int move, UndoInfo *u)
{
  // decode the move
  u->to = move & 255;
  u->from = move >> 8 & 255;
  u->piece = board[u->from];
  u->victim = board[u->to];

  // update the incremental evaluation
  u->pstEval = -(u->oldEval + PST(u->piece, u->to) - PST(u->piece, u->from) + PST(u->victim, u->to));
//if(PATH) printf("     eval=%d beta=%d MARGIN=%d\n", u->pstEval, u->beta, MARGIN);
  if(u->depth <= 0 && u->pstEval > u->beta + MARGIN) return -INF-1; // futility (child will stand pat)

  // update hash key, and possibly abort on repetition
  u->hashKey =   u->oldKey  ^ KEY(u->piece, u->to) ^ KEY(u->piece, u->from) ^ KEY(u->victim, u->to);
  // if(REPEAT) return 0;

  // update board and piece list
  board[u->from] = 0;
  board[u->to]   = u->piece;
  location[u->piece]  = u->to;
  location[u->victim] = CAPTURED;

  Evacuate(u->from);
  if(!u->victim) u->savedNeighbor = neighbor[u->to], Occupy(u->to);
  return INF+1; // kludge to indicate success
}

void UnMake(UndoInfo *u)
{
  // restore board and piece list
  board[u->from] = u->piece;
  board[u->to]   = u->victim;
  location[u->piece]  = u->from;
  location[u->victim] = u->to;
  if(!u->victim) Evacuate(u->to), neighbor[u->to] = u->savedNeighbor;
  Reoccupy(u->from);
}
There is one problem that makes me regret I chose 0x88 board layout. I want the 8 neighbors in the different directions from a given square to all reside in 64-bit word, for easy saving and restoring. But since some of the neighbors are rim squares, the numbers can get both negative (0th rank) and larger than 128 (9th rank). So neither a signed nor an unsigned char can hold all the required values. They could if I stored offsetted values, but then I would have to translate those every time I want to apply the square numbers to the board. Unless I use offsetted numbering there too. I currently solved this in a kudgy way. Make both the board and the neighbor table 16x16, and use unsigned char in the neighbor table. In the latter the rim squares on rank 0 then wrap to numbers just below 256, and when used on the board these would hit edge guards too, (but not on 0th rank but on 16th rank) in the excessively large rim of the board. Ugly, but it works. And it does indeed give a speedup:

Code: Select all

 1  -16       725 0 e2a6 b4c3 b2c3 e6d5
 2  -16      9497 0 e2a6 b4c3 b2c3 e6d5
 3  -27     12797 0 e2a6 e6d5 g2h3 b4c3 d2c3 d5e4
 4  -27     71479 0 e2a6 e6d5 g2h3 b4c3 d2c3 d5e4
 5  -37    149317 0 e2a6 e6d5 c3d5 b6d5 e4d5 e7e5
 6  -32    399081 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
 7  -32    838855 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
 8  -32   4710223 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
t =  0.820 sec
   4710223 nodes (5.7 Mnps)
   4409258 QS (93.6%)
    905777 evals (20.5%)
    350015 stand-pats (7.9%)
   1239812 leaves (28.1%)
   4134586 move gens
captures: 94.1%
I also collect a new statistic here: the number of leaf nodes. These are defined as nodes where move generation was done (so no stand-pat cutoff), but no move was searched (all futile or repetitions). As expected there are lots of those, although not as many as I had hoped. Together with the stand-pats they make 36% of all QS nodes. So the QS, which on average have 15 nodes, apparently have a pretty low branching factor, and are more deep than wide.

The reason I am interested in that statistic is that the relatively expensive incremental updating of the auxiliary data structures (like the neighbor table) can largely be avoided in leaf nodes. (Where you would have to immediately unmake the changes without having had much benefit from them.) So far the neighbor table is used only during move gen, and doing a move gen with a stale table would erroneously have some slider moves capture to the square that was evacuated, rather than to the target behind it. This could be solved during move generation as well, by elongating such moves to the next target. The neighbor update could then be deferred to just before searching of the first move. Which might be never, as we might be in a leaf.
User avatar
hgm
Posts: 28353
Joined: Fri Mar 10, 2006 10:06 am
Location: Amsterdam
Full name: H G Muller

Re: The mailbox trials

Post by hgm »

Mike Sherwin wrote: Sun Mar 28, 2021 4:47 pm MSVS 2019 Community, AMD 3950x 4.2 GHz, ram 3600MHz
...
t = 0.875 sec
4710223 nodes (5.4 Mnps)
4409258 QS (93.6%)
905777 evals (20.5%)
350015 stand-pats (7.9%)
4134586 move gens
captures: 94.1%
One question: this was a compilation of the 'complete code' I posted somewhat earlier (which did 3.4Mnps for me)? When I scale my execution time with the ratio of our clock frequencies, you are quite a bit faster (0.875 sec vs 1.051 sec). So your compiler seems faster. (My compile is i386.)
Mike Sherwin
Posts: 965
Joined: Fri Aug 21, 2020 1:25 am
Location: Planet Earth, Sol system
Full name: Michael J Sherwin

Re: The mailbox trials

Post by Mike Sherwin »

hgm wrote: Sun Mar 28, 2021 11:17 pm
Mike Sherwin wrote: Sun Mar 28, 2021 4:47 pm MSVS 2019 Community, AMD 3950x 4.2 GHz, ram 3600MHz
...
t = 0.875 sec
4710223 nodes (5.4 Mnps)
4409258 QS (93.6%)
905777 evals (20.5%)
350015 stand-pats (7.9%)
4134586 move gens
captures: 94.1%
One question: this was a compilation of the 'complete code' I posted somewhat earlier (which did 3.4Mnps for me)? When I scale my execution time with the ratio of our clock frequencies, you are quite a bit faster (0.875 sec vs 1.051 sec). So your compiler seems faster. (My compile is i386.)
Yes. That was a 64 bit compile. Had to make 3 small changes to get it to compile but nothing that would impact speed. And no special compiler settings, just release x64. I could try a few compiler optimizations and report back tomorrow. In the meantime if you want a newer version compiled either post it or send me a pm.
User avatar
hgm
Posts: 28353
Joined: Fri Mar 10, 2006 10:06 am
Location: Amsterdam
Full name: H G Muller

Re: The mailbox trials

Post by hgm »

Mike Sherwin wrote: Mon Mar 29, 2021 2:28 amIn the meantime if you want a newer version compiled either post it or send me a pm.
When I have reached a new milestone I will do that.

First a few more 'micro-optimalizations'. So far the move generation depended on piece type, but not on the piece location. With as a result that a Rook always tries to move in 4 directions. While two of these immediately slam into the edge when the Rook is in a corner. This can be prevented by making the list of directions a piece moves in not just dependent on the piece type, but also on the piece's location. The slider loop of CaptGen() was therefore adapted to get the start of the 'direction list' in a new array slides[] from a new piece-square-like table firstSlide:

Code: Select all

  for(i=10; i<15; i++) {
    int piece = stm + i;
    int from = location[piece];
    if(from != CAPTURED) {
      int d, dir = firstSlide[slideOffs[piece-16] + from]; // list of directions now location dependent!
      while((d = slides[dir++])) {          // direction nr (+1)
        int to = neighbor[from].d[d-1];     // the directions were offset by 1 to avoid the 0 sentinel
        int victim = board[to];
        int h = victim ^ stm;
        if(h >= COLOR) {                         // target square is occupied
          if((victim & 15) == 15) return 0; // captures King
          moveStack[--first] = MOVE(from, to) + mvv[victim] - ((piece & 15) << 24);
        }
      }
    }
  }
The new tables this involves are:

Code: Select all

unsigned char slideOffs[] = { // offset of board-size table in firstSlide[], per piece
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 128, 8, 8, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 128, 8, 8, 0, 0,
};

unsigned char firstSlide[] = {
  // Queen                             Rook
  45, 21, 21, 21, 21, 21, 21, 41,      69, 58, 58, 58, 58, 58, 58, 66,
  27,  0,  0,  0,  0,  0,  0, 15,      62, 49, 49, 49, 49, 49, 49, 54,
  27,  0,  0,  0,  0,  0,  0, 15,      62, 49, 49, 49, 49, 49, 49, 54,
  27,  0,  0,  0,  0,  0,  0, 15,      62, 49, 49, 49, 49, 49, 49, 54,
  27,  0,  0,  0,  0,  0,  0, 15,      62, 49, 49, 49, 49, 49, 49, 54,
  27,  0,  0,  0,  0,  0,  0, 15,      62, 49, 49, 49, 49, 49, 49, 54,
  27,  0,  0,  0,  0,  0,  0, 15,      62, 49, 49, 49, 49, 49, 49, 54,
  33,  9,  9,  9,  9,  9,  9, 37,      63, 50, 50, 50, 50, 50, 50, 55,
  // Bishop                             unused
  84, 83, 83, 83, 83, 83, 83, 91,       0,  0,  0,  0,  0,  0,  0,  0,
  86, 72, 72, 72, 72, 72, 72, 80,       0,  0,  0,  0,  0,  0,  0,  0,
  86, 72, 72, 72, 72, 72, 72, 80,       0,  0,  0,  0,  0,  0,  0,  0,
  86, 72, 72, 72, 72, 72, 72, 80,       0,  0,  0,  0,  0,  0,  0,  0,
  86, 72, 72, 72, 72, 72, 72, 80,       0,  0,  0,  0,  0,  0,  0,  0,
  86, 72, 72, 72, 72, 72, 72, 80,       0,  0,  0,  0,  0,  0,  0,  0,
  86, 72, 72, 72, 72, 72, 72, 80,       0,  0,  0,  0,  0,  0,  0,  0,
  89, 77, 77, 77, 77, 77, 77, 81,       0,  0,  0,  0,  0,  0,  0,  0,
};

signed char slides[] = { // 0-terminated sets of directions sliders can move in
   1, 5, 3, 7, 8, 4, 2, 6, 0,   // 0   Q
   5, 3, 7, 4, 6, 0,            // 9
   1, 5, 7, 8, 6, 0,            // 15
   1, 3, 7, 8, 2, 0,            // 21
   1, 5, 3, 4, 2, 0,            // 27
   5, 3, 4, 0,                  // 33
   5, 7, 6, 0,                  // 37
   1, 7, 8, 0,                  // 41
   1, 3, 2, 0,                  // 45
   1, 5, 3, 7, 0,               // 49, 50  R
   1, 5, 7, 0,                  // 54, 55
   1, 3, 7, 0,                  // 58
   1, 5, 3, 0,                  // 62, 63
   1, 7, 0,                     // 66
   1, 3, 0,                     // 69
   8, 4, 2, 6, 0,               // 72  B
   4, 6, 0,                     // 77
   8, 6, 0,                     // 80, 81
   8, 2, 0,                     // 83, 84
   4, 2, 0,                     // 86
   4, 0,                        // 89
   8, 0,                        // 91
};
The slides[] table is a bit bigger than really needed, as smarter orddering of the elements in the lists would make it possible to use the tail of longer lists for the shorter ones. (Which now is done in only 5 places.) I did not want to mess with the order, however, to make sure the captures will be generated in exactly the same order as before. So the node counts stay exactly the same. That makes it easier to detect bugs, to which this method is rather prone. So I just deleted the diections that slam into an edge from the longest list (used for the central squares). The array slides[] isn't very big anyway.

Of course this technique can also be used for the leapers, and for the non-captures. But the number of nodes with non-capture generation is too small to give us worthwhile gains at this stage. The leapers I will try next. But just implementing it for the sliders like above gave another 5% speedup, bringing us to 6 Mnps:

Code: Select all

 1  -16       725 0 e2a6 b4c3 b2c3 e6d5
 2  -16      9497 0 e2a6 b4c3 b2c3 e6d5
 3  -27     12797 0 e2a6 e6d5 g2h3 b4c3 d2c3 d5e4
 4  -27     71479 0 e2a6 e6d5 g2h3 b4c3 d2c3 d5e4
 5  -37    149317 0 e2a6 e6d5 c3d5 b6d5 e4d5 e7e5
 6  -32    399081 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
 7  -32    838855 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
 8  -32   4710223 0 e2a6 e6d5 c3d5 b6d5 a6b7 a8b8 b7d5 f6d5 e5f7 e7f7 e4d5 g7b2 f3f7 e8f7
t =  0.780 sec
   4710223 nodes (6.0 Mnps)
   4409258 QS (93.6%)
    905777 evals (20.5%)
    350015 stand-pats (7.9%)
   1239812 leaves (28.1%)
   4134586 move gens
captures: 94.1%
User avatar
hgm
Posts: 28353
Joined: Fri Mar 10, 2006 10:06 am
Location: Amsterdam
Full name: H G Muller

Re: The mailbox trials

Post by hgm »

Strange enough a similar trick for the leapers just slows the thing down.