compiler dependent scores!

Discussion of chess software programming and technical issues.

Moderator: Ras

jswaff

Some assembly

Post by jswaff »

Below I've posted some assembly, starting with the line "for (c=0;c<64;c++)". (in this version I'm declaring c and t at the top of Initialize() so I could compile it as a C program)

I'm wondering if the problem is the wrong target architecture. Notice the line at the top of the assembly - "Machine type IA32." I'm compiling on a Pentium 4. The compiler was downloaded/installed by Gentoo's portage system (and I do have -march=pentium4 in my system's default CCFLAGS).

Just out of curiousity, I compiled with -O2 -march=pentium4 and the program worked. With -O2 -march=i686 and -O2 -march=i386 it was busted.

--
James

Code: Select all

# -- Machine type IA32
# mark_description "Intel(R) C++ Compiler for applications running on IA-32, Version 10.0    Build 20070809 %s";
# mark_description "-O2 -S -c";
	.file "main.c"
	.text

<snip>

..B3.3:                         # Preds ..B3.21
        xorl      %edx, %edx                                    #34.7
        xorl      %eax, %eax                                    #34.7
                                # LOE eax edx
..B3.4:                         # Preds ..B3.8 ..B3.3
        movl      %eax, 4(%esp)                                 #36.8
        movl      %eax, (%esp)                                  #36.8
        movl      %edx, 16(%esp)                                #36.8
        xorl      %esi, %esi                                    #35.3
        xorl      %ebp, %ebp                                    #35.3
        xorl      %ebx, %ebx                                    #35.3
        xorl      %ecx, %ecx                                    #35.3
        movl      $8, %edi                                      #36.8
        movl      %edi, 8(%esp)                                 #36.8
        movl      %edi, %edx                                    #36.8
        lea       -448(%eax), %edi                              #36.8
        movl      4(%esp), %eax                                 #36.8
        movl      %edi, 12(%esp)                                #36.8
                                # LOE eax edx ecx ebx ebp esi
..B3.5:                         # Preds ..B3.6 ..B3.4
        movl      16(%esp), %edi                                #38.13
        subl      %edx, %edi                                    #38.13
        js        ..B3.18       # Prob 1%                       #38.16
                                # LOE eax edx ecx ebx ebp esi
..B3.6:                         # Preds ..B3.5
        orl       -64+bm_mask(%eax), %esi                       #39.4
        movl      %esi, %ebx                                    #39.4
        movl      12(%esp), %edi                                #36.14
        addl      $8, %edx                                      #36.16
        orl       -60+bm_mask(%eax), %ebp                       #39.4
        movl      %ebp, %ecx                                    #39.4
        addl      $-64, %eax                                    #36.16
        cmpl      %eax, %edi                                    #36.14
        jb        ..B3.5        # Prob 85%                      #36.14
                                # LOE eax edx ecx ebx ebp esi
..B3.7:                         # Preds ..B3.6
        movl      (%esp), %eax                                  #
        movl      16(%esp), %edx                                #
        movl      %esi, to_boundary(%eax)                       #39.4
        movl      %ebp, 4+to_boundary(%eax)                     #39.4
                                # LOE eax edx al dl ah dh
..B3.8:                         # Preds ..B3.7 ..B3.18
        addl      $8, %eax                                      #34.16
        addl      $1, %edx                                      #34.16
        cmpl      $64, %edx                                     #34.13
        jl        ..B3.4        # Prob 98%                      #34.13
                                # LOE eax edx
..B3.9:                         # Preds ..B3.8
        movl      288+to_boundary, %ecx                         #44.13
        movl      292+to_boundary, %edx                         #44.13
        xorl      %eax, %eax                                    #44.2
        movl      %eax, %ebp                                    #44.2
        movl      %edx, %esi                                    #44.2
        movl      %ecx, %edi                                    #44.2
                                # LOE ebx ebp esi edi
..B3.10:                        # Preds ..B3.14 ..B3.9
        movl      bm_mask(,%ebp,8), %edx                        #44.2
        movl      4+bm_mask(,%ebp,8), %eax                      #44.2
        andl      %edi, %edx                                    #44.2
        andl      %esi, %eax                                    #44.2
        orl       %eax, %edx                                    #44.2
        je        ..B3.12       # Prob 50%                      #44.2
                                # LOE ebx ebp esi edi
..B3.11:                        # Preds ..B3.10
        pushl     $_2__STRING.0.0                               #44.2
        call      printf                                        #44.2
        jmp       ..B3.27       # Prob 100%                     #44.2
                                # LOE ebx ebp esi edi
..B3.12:                        # Preds ..B3.10
        pushl     $_2__STRING.1.0                               #44.2
        call      printf                                        #44.2
                                # LOE ebx ebp esi edi
..B3.27:                        # Preds ..B3.11 ..B3.12
        popl      %ecx                                          #44.2
                                # LOE ebx ebp esi edi
..B3.13:                        # Preds ..B3.27
        movl      %ebp, %eax                                    #44.2
        andl      $-2147483641, %eax                            #44.2
        jge       ..B3.24       # Prob 50%                      #44.2
                                # LOE eax ebx ebp esi edi
..B3.25:                        # Preds ..B3.13
        decl      %eax                                          #44.2
        orl       $-8, %eax                                     #44.2
        incl      %eax                                          #44.2
                                # LOE eax ebx ebp esi edi
..B3.24:                        # Preds ..B3.13 ..B3.25
        cmpl      $7, %eax                                      #44.2
        je        ..B3.17       # Prob 5%                       #44.2
                                # LOE ebx ebp esi edi
..B3.14:                        # Preds ..B3.29 ..B3.24
        addl      $1, %ebp                                      #44.2
        cmpl      $64, %ebp                                     #44.2
        jl        ..B3.10       # Prob 98%                      #44.2
                                # LOE ebx ebp esi edi
..B3.15:                        # Preds ..B3.14
        pushl     $_2__STRING.2.0                               #44.2
        call      printf                                        #44.2
                                # LOE ebx ebp esi edi
..B3.16:                        # Preds ..B3.15
        addl      $24, %esp                                     #46.1
        popl      %ebx                                          #46.1
        popl      %ebp                                          #46.1
        popl      %esi                                          #46.1
        popl      %edi                                          #46.1
        ret                                                     #46.1
                                # LOE
..B3.17:                        # Preds ..B3.24                 # Infreq
        pushl     $_2__STRING.2.0                               #44.2
        call      printf                                        #44.2
                                # LOE ebx ebp esi edi
..B3.29:                        # Preds ..B3.17                 # Infreq
        popl      %ecx                                          #44.2
        jmp       ..B3.14       # Prob 100%                     #44.2
                                # LOE ebx ebp esi edi
..B3.18:                        # Preds ..B3.5                  # Infreq
        movl      (%esp), %eax                                  #
        movl      16(%esp), %edx                                #
        movl      %ebx, to_boundary(%eax)                       #39.4
        movl      %ecx, 4+to_boundary(%eax)                     #39.4
        jmp       ..B3.8        # Prob 100%                     #39.4
        .align    2,0x90
                                # LOE eax edx al dl ah dh
# mark_end;
	.type	Initialize,@function
	.size	Initialize,.-Initialize
	.data
# -- End  Initialize
	.section .rodata.str1.4, "aMS",@progbits,1
	.align 4
	.align 4
_2__STRING.1.0:
	.byte	48
	.byte	0
	.type	_2__STRING.1.0,@object
	.size	_2__STRING.1.0,2
	.space 2	# pad
_2__STRING.0.0:
	.byte	49
	.byte	0
	.type	_2__STRING.0.0,@object
	.size	_2__STRING.0.0,2
	.space 2	# pad
_2__STRING.2.0:
	.byte	10
	.byte	0
	.type	_2__STRING.2.0,@object
	.size	_2__STRING.2.0,2
	.data
	.comm bm_mask,512,32
	.comm to_boundary,512,32
	.section .note.GNU-stack, ""
# End
jswaff

Re: A complete and broken program

Post by jswaff »

Gerd Isenberg wrote:
jswaff wrote:

Code: Select all

        for (int c=0;c<64;c++) {
                to_boundary[c]=0;
                for (int t=1;t<8;t++) {
//                      if (c==E4) printf("considering: %d\n",t);
                        if (c-(t*8)<0) break;
                        to_boundary[c]|=bm_mask[c-(t*8)];
                }
        }
Just another try to avoid redundant repeat/break condition of the inner loop as a source of possible compiler confusion...

Code: Select all

   
   for (int c=0; c<64; c++) {
      to_boundary[c]=0;
      for (int t=8; t <= c; t += 8) {
         // if (c==E4) printf("considering: %d\n",t>>3);
         to_boundary[c] |= bm_mask[c-t];
      }
   }
I regocnized your big-endian rank mapping, thus the initialization code I posted before has to be adapted, since it relies on a1 == 0.
Your code works. I get a couple of remarks from the compiler about the partial loop being vectorized, but it works correctly.

That said, despite the fact that you don't like the break in the inner loop, it IS legal as far as I know. And it's more obvious (to me) what's going on.


--
James
bob
Posts: 20943
Joined: Mon Feb 27, 2006 7:30 pm
Location: Birmingham, AL

Re: A complete and broken program

Post by bob »

jswaff wrote:
Harald Johnsen wrote: Don't do a break in the loop, the compiler does not execute the loop in the order you think (ie 1 to 8).

HJ.
Shouldn't it though? The for-loop SHOULD be a sequential construct. If the compiler is optimizing in a way that isn't sequentially consistent with the unoptimized version, then that's a bug.

--
James
Yes, but it is also a bit sloppy to declare variables in a for statement. It often produces more problems because it can hide the variable you meant to use, etc. I declare everything at the top, explicitly making sure I know what I am doing...

BTW how the loop is executed is up to the compiler, so long as it doesn't violate normal programming rules. If each iteration is independent, it might be that the loop is completely unrolled. And then things re-ordered to give a better parallel instruction stream for super-scalar execution.
jswaff

Re: A complete and broken program

Post by jswaff »

bob wrote:
jswaff wrote:
Harald Johnsen wrote: Don't do a break in the loop, the compiler does not execute the loop in the order you think (ie 1 to 8).

HJ.
Shouldn't it though? The for-loop SHOULD be a sequential construct. If the compiler is optimizing in a way that isn't sequentially consistent with the unoptimized version, then that's a bug.

--
James
Yes, but it is also a bit sloppy to declare variables in a for statement. It often produces more problems because it can hide the variable you meant to use, etc. I declare everything at the top, explicitly making sure I know what I am doing...

BTW how the loop is executed is up to the compiler, so long as it doesn't violate normal programming rules. If each iteration is independent, it might be that the loop is completely unrolled. And then things re-ordered to give a better parallel instruction stream for super-scalar execution.
A bit sloppy?? It is a "best practice" to minimize the scope of variables.
It improves readability, and more importantly, limits visibility of the variable. True, in C, you *have* to declare local variables at the head of the block, but that is definitely a habit worth breaking.

I agree about the FOR loop. I assumed the loop was iterated sequentially t=1..7, so I guess that's the root problem.

--
James
jswaff

Re: A complete and broken program

Post by jswaff »

Harald Johnsen wrote:

Code: Select all

...
void Initialize() {

        Bitmap b=1;
        for (int c=0;c<64;c++)
                bm_mask[c]=(b<<c);


        for (int c=0;c<64;c++) {
                to_boundary[c]=0;
                for (int t=1;t<8;t++) {
                        if (!(c-(t*8)<0))
                                to_boundary[c]|=bm_mask[c-(t*8)];
                }
        }

        DrawBitmap(to_boundary[E4]);

}
Don't do a break in the loop, the compiler does not execute the loop in the order you think (ie 1 to 8).

HJ.
I see now... I think you nailed it. Thanks. :)

--
James
bob
Posts: 20943
Joined: Mon Feb 27, 2006 7:30 pm
Location: Birmingham, AL

Re: A complete and broken program

Post by bob »

jswaff wrote:
bob wrote:
jswaff wrote:
Harald Johnsen wrote: Don't do a break in the loop, the compiler does not execute the loop in the order you think (ie 1 to 8).

HJ.
Shouldn't it though? The for-loop SHOULD be a sequential construct. If the compiler is optimizing in a way that isn't sequentially consistent with the unoptimized version, then that's a bug.

--
James
Yes, but it is also a bit sloppy to declare variables in a for statement. It often produces more problems because it can hide the variable you meant to use, etc. I declare everything at the top, explicitly making sure I know what I am doing...

BTW how the loop is executed is up to the compiler, so long as it doesn't violate normal programming rules. If each iteration is independent, it might be that the loop is completely unrolled. And then things re-ordered to give a better parallel instruction stream for super-scalar execution.
A bit sloppy?? It is a "best practice" to minimize the scope of variables.
It improves readability, and more importantly, limits visibility of the variable. True, in C, you *have* to declare local variables at the head of the block, but that is definitely a habit worth breaking.

I agree about the FOR loop. I assumed the loop was iterated sequentially t=1..7, so I guess that's the root problem.

--
James
I have found it introduces as many bugs as it prevents. You think you are modifying "wtm" that is used somewhere above, yet in reality you are modifying a local wtm that is not visible outside this sloop.

I don't mind declaring variables inside loops, but in the for itself it is pretty well hidden, which I don't like.
wgarvin
Posts: 838
Joined: Thu Jul 05, 2007 5:03 pm
Location: British Columbia, Canada

Re: A complete and broken program

Post by wgarvin »

Harald Johnsen wrote:

Code: Select all

...
void Initialize() {

        Bitmap b=1;
        for (int c=0;c<64;c++)
                bm_mask[c]=(b<<c);


        for (int c=0;c<64;c++) {
                to_boundary[c]=0;
                for (int t=1;t<8;t++) {
                        if (!(c-(t*8)<0))
                                to_boundary[c]|=bm_mask[c-(t*8)];
                }
        }

        DrawBitmap(to_boundary[E4]);

}
Don't do a break in the loop, the compiler does not execute the loop in the order you think (ie 1 to 8).

HJ.
"Don't write the code that causes the compiler bug" is not really a solution. His source code is perfectly legit-looking. The compiler is generating code from it that doesn't work. It sure looks like a genuine compiler bug. All bets are off when you compile code with a buggy compiler.

If compiling it for -march=pentium4 works fine but -march=i686 does not work, then the problem is probably in the code generation for i386 and i686 in that compiler. Probably that mode does not get as much testing as pentium4 or something?

Anything that you change---removing break statements, commenting or not commenting this printf---that just changes the circumstances inside the compiler so that *maybe* it hides the bug. But fundamentally if the compiler can generate broken code some of the time, then you don't want to touch it with a 10-foot pole. This compiler apparently can do that when using -march=i686.

If using -march=pentium4 is the right thing anyway, and doesn't generate buggy code, then I'd say go ahead and use that.
jswaff

Re: A complete and broken program

Post by jswaff »

wgarvin wrote:
Harald Johnsen wrote:

Code: Select all

...
void Initialize() {

        Bitmap b=1;
        for (int c=0;c<64;c++)
                bm_mask[c]=(b<<c);


        for (int c=0;c<64;c++) {
                to_boundary[c]=0;
                for (int t=1;t<8;t++) {
                        if (!(c-(t*8)<0))
                                to_boundary[c]|=bm_mask[c-(t*8)];
                }
        }

        DrawBitmap(to_boundary[E4]);

}
Don't do a break in the loop, the compiler does not execute the loop in the order you think (ie 1 to 8).

HJ.
"Don't write the code that causes the compiler bug" is not really a solution. His source code is perfectly legit-looking. The compiler is generating code from it that doesn't work. It sure looks like a genuine compiler bug. All bets are off when you compile code with a buggy compiler.

If compiling it for -march=pentium4 works fine but -march=i686 does not work, then the problem is probably in the code generation for i386 and i686 in that compiler. Probably that mode does not get as much testing as pentium4 or something?

Anything that you change---removing break statements, commenting or not commenting this printf---that just changes the circumstances inside the compiler so that *maybe* it hides the bug. But fundamentally if the compiler can generate broken code some of the time, then you don't want to touch it with a 10-foot pole. This compiler apparently can do that when using -march=i686.

If using -march=pentium4 is the right thing anyway, and doesn't generate buggy code, then I'd say go ahead and use that.
I'm not so sure it's the compilers fault anymore. The question is: what guarantees do we have regarding the order of execution of the loop bodies? I'm sure the statements *within* the loop body have to be performed sequentially (or at least must be sequentially consistent), but does the C language specify that the blocks themselves must be executed in the "intuitive" order (as specified by the loop counter)?

Some languages differentiate between "for" and "forall", where "for" is guaranteed to be performed in order and "forall" is not.

--
James
bob
Posts: 20943
Joined: Mon Feb 27, 2006 7:30 pm
Location: Birmingham, AL

Re: A complete and broken program

Post by bob »

jswaff wrote:
wgarvin wrote:
Harald Johnsen wrote:

Code: Select all

...
void Initialize() {

        Bitmap b=1;
        for (int c=0;c<64;c++)
                bm_mask[c]=(b<<c);


        for (int c=0;c<64;c++) {
                to_boundary[c]=0;
                for (int t=1;t<8;t++) {
                        if (!(c-(t*8)<0))
                                to_boundary[c]|=bm_mask[c-(t*8)];
                }
        }

        DrawBitmap(to_boundary[E4]);

}
Don't do a break in the loop, the compiler does not execute the loop in the order you think (ie 1 to 8).

HJ.
"Don't write the code that causes the compiler bug" is not really a solution. His source code is perfectly legit-looking. The compiler is generating code from it that doesn't work. It sure looks like a genuine compiler bug. All bets are off when you compile code with a buggy compiler.

If compiling it for -march=pentium4 works fine but -march=i686 does not work, then the problem is probably in the code generation for i386 and i686 in that compiler. Probably that mode does not get as much testing as pentium4 or something?

Anything that you change---removing break statements, commenting or not commenting this printf---that just changes the circumstances inside the compiler so that *maybe* it hides the bug. But fundamentally if the compiler can generate broken code some of the time, then you don't want to touch it with a 10-foot pole. This compiler apparently can do that when using -march=i686.

If using -march=pentium4 is the right thing anyway, and doesn't generate buggy code, then I'd say go ahead and use that.
I'm not so sure it's the compilers fault anymore. The question is: what guarantees do we have regarding the order of execution of the loop bodies? I'm sure the statements *within* the loop body have to be performed sequentially (or at least must be sequentially consistent), but does the C language specify that the blocks themselves must be executed in the "intuitive" order (as specified by the loop counter)?

Some languages differentiate between "for" and "forall", where "for" is guaranteed to be performed in order and "forall" is not.

--
James
Every good compiler builds a dependency graph before it starts to muck around with loops. And then they ask the question "for all loop iterations i, does any single iteration i[n] depend on the results of any previous iteration completing first?" If the answer is no, then it does not matter whether the loop is executed first-to-last, last-to-first, or in completely random order.

The dependency graph will detect inter-loop dependencies and prevent this from occurring if it would cause a problem.
bob
Posts: 20943
Joined: Mon Feb 27, 2006 7:30 pm
Location: Birmingham, AL

Re: A complete and broken program

Post by bob »

I didn't have time to completely respond earlier. But here is a question:

Here are two pieces of code that do the _identically_ same computations:

Code: Select all

int EvaluateKingsFile(TREE * RESTRICT tree, int whichfile, int side)
{
  register int defects = 0, file;
  register int enemy = Flip(side);
  static const int a2a7[2] = { A7, A2 }; 
  static const int a3a6[2] = { A6, A3 };
    
  for (file = whichfile - 1; file <= whichfile + 1; file++) {
    if (!(file_mask[file] & tree->all_pawns))
      defects += open_file[file];
    else {
      if (!(file_mask[file] & Pawns(enemy)))
        defects += half_open_file[file] / 2;
      else
        defects +=
            pawn_defects[side][Rank(Advanced(enemy,
                    file_mask[file] & Pawns(enemy)))];
      if (!(file_mask[file] & Pawns(side)))
        defects += half_open_file[file];
      else {
        if (!(Pawns(side) & SetMask(a2a7[side] + file))) {
          defects++;
          if (!(Pawns(side) & SetMask(a3a6[side] + file)))
            defects++;
        }
      }
    }
  }
  return (defects);
}


and then

Code: Select all

int EvaluateKingsFile(TREE * RESTRICT tree, int whichfile, int side) { register int defects = 0, file; register int enemy = Flip(side); static const int a2a7[2] = { A7, A2 }; static const int a3a6[2] = { A6, A3 }; for (file = whichfile - 1; file <= whichfile + 1; file++) { if (!(file_mask[file] & tree->all_pawns)) defects += open_file[file]; else { if (!(file_mask[file] & Pawns(enemy))) defects += half_open_file[file] / 2; else defects += pawn_defects[side][Rank(Advanced(enemy, file_mask[file] & Pawns(enemy)))]; if (!(file_mask[file] & Pawns(side))) defects += half_open_file[file]; else { if (!(Pawns(side) & SetMask(a2a7[side] + file))) { defects++; if (!(Pawns(side) & SetMask(a3a6[side] + file))) defects++; } } } } return (defects); }


Which one is easier to read? that is what I don't like about the

for (int i=0; i<n; i++)

syntax. that "int i" is buried inside a common statement and it is easy to overlook the fact that i is local to the loop only, and if you modify i inside the loop, or break out of the loop, the value of i that is left is not the one from the loop counter, it is the i that has greater scope.

Some compilers will complain, some will not, when that happens, just like some complain when a formal parameter shadows a global variable. I think that it makes it easier to explicitly declare where it is visible, rather than using declarations that are difficult to pick out. That was my point... anything to eliminate bugs is good. That's why we don't cram everything onto one line...