Speed comparison of five routines: my original proposal, Steffan's proposal (but with a bsf at the end to get the index), Marco's proposal, and a combination of all three. I also tested Steffan's without the bitscan, and with a bb &= ~s; in the loop.
Code: Select all
int bitscan_pri_z(bb_t bb)
{
int bit, res = 0;
bb_t r;
/* Unroll this! */
for (bit = 5; bit >= 0; bit--) {
r = (bb & pri_mask[bit]) != 0;
bb &= pri_mask[bit] | (r - 1);
res = res << 1 | r;
}
return rev_priority[res];
}
bb_t bitscan_pri_so(bb_t bb)
{
int bit;
bb_t r;
/* Unroll this! */
for (bit = 5; bit >= 0; bit--) {
r = (bb & pri_mask[bit]) != 0;
bb &= pri_mask[bit] | (r - 1);
}
return (bb);
}
int bitscan_pri_s(bb_t bb)
{
int bit;
bb_t r;
/* Unroll this! */
for (bit = 5; bit >= 0; bit--) {
r = (bb & pri_mask[bit]) != 0;
bb &= pri_mask[bit] | (r - 1);
}
return bsfq(bb);
}
int bitscan_pri_m(bb_t bb)
{
int bit, res = 0, r;
/* Unroll this! */
for (bit = 5; bit >= 0; bit--) {
r = (bb & pri_mask_a[bit][1]) != 0;
bb &= pri_mask_a[bit][r];
res = res << 1 | r;
}
return rev_priority[res];
}
int bitscan_pri_sm(bb_t bb)
{
int bit, r;
/* Unroll this! */
for (bit = 5; bit >= 0; bit--) {
r = (bb & pri_mask_a[bit][1]) != 0;
bb &= pri_mask_a[bit][r];
}
return bsfq(bb);
}
Each one was measured in a dumb loop:
Code: Select all
t = clock();
for (i = 0; i < 10000000; i++) {
bb_t b = -1;
x = 0;
while (b) {
b &= ~(1ull << x);
x++;
}
}
int overhead = clock() - t;
int time;
t = clock();
for (i = 0; i < 10000000; i++) {
bb_t b = -1;
while (b) {
int s = bitscan_pri_z(b);
b &= ~(1ull << s);
}
}
time = clock() - t - overhead;
printf("zach: overhead=%i, time=%i\n",overhead, time);
And the results (in centiseconds):
Code: Select all
zach: overhead=54, time=1037
steffan orig: overhead=54, time=873
steffan: overhead=54, time=903
marco: overhead=54, time=1389
s/m: overhead=54, time=1769
Surprisingly, the bsfq in Steffan's is very cheap, and the combination of Steffan and Marco's proposals was much slower than the original. Steffan looks like the winner here though.