Puzzle: Fast Bit Counting

 Explaination and Analysis

  1.  
  2. /*
  3.    Bit Counting routines
  4.  
  5.    Author: Gurmeet Singh Manku    (manku@cs.stanford.edu)
  6.    Date:   27 Aug 2002
  7.   */
  8.  
  9.  
  10. #include <stdlib.h>
  11. #include <stdio.h>
  12. #include <limits.h>
  13.  
  14. /* Iterated bitcount iterates over each bit. The while condition sometimes helps
  15.    terminates the loop earlier */
  16. int iterated_bitcount (unsigned int n)
  17. {
  18.     int count=0;   
  19.     while (n)
  20.     {
  21.         count += n & 0x1u ;   
  22.         n >>= 1 ;
  23.     }
  24.     return count ;
  25. }
  26.  
  27.  
  28. /* Sparse Ones runs proportional to the number of ones in n.
  29.    The line   n &= (n-1)   simply sets the last 1 bit in n to zero. */
  30. int sparse_ones_bitcount (unsigned int n)
  31. {
  32.     int count=0 ;
  33.     while (n)
  34.     {
  35.         count++ ;
  36.         n &= (n - 1) ;     
  37.     }
  38.     return count ;
  39. }
  40.  
  41.  
  42. /* Dense Ones runs proportional to the number of zeros in n.
  43.    It first toggles all bits in n, then diminishes count repeatedly */
  44. int dense_ones_bitcount (unsigned int n)
  45. {
  46.     int count = 8 * sizeof(int) ;   
  47.     n ^= (unsigned int) -1 ;
  48.     while (n)
  49.     {
  50.         count-- ;
  51.         n &= (n - 1) ;   
  52.     }
  53.     return count ;
  54. }
  55.  
  56.  
  57. /* Precomputed bitcount uses a precomputed array that stores the number of ones
  58.    in each char. */
  59. static int bits_in_char [256] ;
  60.  
  61. void compute_bits_in_char (void)
  62. {
  63.     unsigned int i ;   
  64.     for (i = 0; i < 256; i++)
  65.         bits_in_char [i] = iterated_bitcount (i) ;
  66.     return ;
  67. }
  68.  
  69. int precomputed_bitcount (unsigned int n)
  70. {
  71.     // works only for 32-bit ints
  72.    
  73.     return bits_in_char [n         & 0xffu]
  74.         +  bits_in_char [(n >>  8) & 0xffu]
  75.         +  bits_in_char [(n >> 16) & 0xffu]
  76.         +  bits_in_char [(n >> 24) & 0xffu] ;
  77. }
  78.  
  79.  
  80. /* Here is another version of precomputed bitcount that uses a precomputed array
  81.    that stores the number of ones in each short. */
  82.  
  83. static char bits_in_16bits [0x1u << 16] ;
  84.  
  85. void compute_bits_in_16bits (void)
  86. {
  87.     unsigned int i ;   
  88.     for (i = 0; i < (0x1u<<16); i++)
  89.         bits_in_16bits [i] = iterated_bitcount (i) ;
  90.     return ;
  91. }
  92.  
  93. int precomputed16_bitcount (unsigned int n)
  94. {
  95.     // works only for 32-bit int
  96.    
  97.     return bits_in_16bits [n         & 0xffffu]
  98.         +  bits_in_16bits [(n >> 16) & 0xffffu] ;
  99. }
  100.  
  101.  
  102. /* Parallel   Count   carries   out    bit   counting   in   a   parallel
  103.    fashion.   Consider   n   after    the   first   line   has   finished
  104.    executing. Imagine splitting n into  pairs of bits. Each pair contains
  105.    the <em>number of ones</em> in those two bit positions in the original
  106.    n.  After the second line has finished executing, each nibble contains
  107.    the  <em>number of  ones</em>  in  those four  bits  positions in  the
  108.    original n. Continuing  this for five iterations, the  64 bits contain
  109.    the  number  of ones  among  these  sixty-four  bit positions  in  the
  110.    original n. That is what we wanted to compute. */
  111.  
  112. #define TWO(c) (0x1u << (c))
  113. #define MASK(c) (((unsigned int)(-1)) / (TWO(TWO(c)) + 1u))
  114. #define COUNT(x,c) ((x) & MASK(c)) + (((x) >> (TWO(c))) & MASK(c))
  115.  
  116. int parallel_bitcount (unsigned int n)
  117. {
  118.     n = COUNT(n, 0) ;
  119.     n = COUNT(n, 1) ;
  120.     n = COUNT(n, 2) ;
  121.     n = COUNT(n, 3) ;
  122.     n = COUNT(n, 4) ;
  123.     /* n = COUNT(n, 5) ;    for 64-bit integers */
  124.     return n ;
  125. }
  126.  
  127.  
  128. /* Nifty  Parallel Count works  the same  way as  Parallel Count  for the
  129.    first three iterations. At the end  of the third line (just before the
  130.    return), each byte of n contains the number of ones in those eight bit
  131.    positions in  the original n. A  little thought then  explains why the
  132.    remainder modulo 255 works. */
  133.  
  134. #define MASK_01010101 (((unsigned int)(-1))/3)
  135. #define MASK_00110011 (((unsigned int)(-1))/5)
  136. #define MASK_00001111 (((unsigned int)(-1))/17)
  137.  
  138. int nifty_bitcount (unsigned int n)
  139. {
  140.     n = (n & MASK_01010101) + ((n >> 1) & MASK_01010101) ;
  141.     n = (n & MASK_00110011) + ((n >> 2) & MASK_00110011) ;
  142.     n = (n & MASK_00001111) + ((n >> 4) & MASK_00001111) ;       
  143.     return n % 255 ;
  144. }
  145.  
  146. /* MIT Bitcount
  147.  
  148.    Consider a 3 bit number as being
  149.         4a+2b+c
  150.    if we shift it right 1 bit, we have
  151.         2a+b
  152.   subtracting this from the original gives
  153.         2a+b+c
  154.   if we shift the original 2 bits right we get
  155.         a
  156.   and so with another subtraction we have
  157.         a+b+c
  158.   which is the number of bits in the original number.
  159.  
  160.   Suitable masking  allows the sums of  the octal digits  in a 32 bit  number to
  161.   appear in  each octal digit.  This  isn't much help  unless we can get  all of
  162.   them summed together.   This can be done by modulo  arithmetic (sum the digits
  163.   in a number by  molulo the base of the number minus  one) the old "casting out
  164.   nines" trick  they taught  in school before  calculators were  invented.  Now,
  165.   using mod 7 wont help us, because our number will very likely have more than 7
  166.   bits set.   So add  the octal digits  together to  get base64 digits,  and use
  167.   modulo 63.   (Those of you  with 64  bit machines need  to add 3  octal digits
  168.   together to get base512 digits, and use mod 511.)
  169.  
  170.   This is HACKMEM 169, as used in X11 sources.
  171.   Source: MIT AI Lab memo, late 1970's.
  172. */
  173.  
  174. int mit_bitcount(unsigned int n)
  175. {
  176.     /* works for 32-bit numbers only */
  177.     register unsigned int tmp;
  178.    
  179.     tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
  180.     return ((tmp + (tmp >> 3)) & 030707070707) % 63;
  181. }
  182.  
  183. void verify_bitcounts (unsigned int x)
  184. {
  185.     int iterated_ones, sparse_ones, dense_ones ;
  186.     int precomputed_ones, precomputed16_ones ;
  187.     int parallel_ones, nifty_ones ;
  188.     int mit_ones ;
  189.    
  190.     iterated_ones      = iterated_bitcount      (x) ;
  191.     sparse_ones        = sparse_ones_bitcount   (x) ;
  192.     dense_ones         = dense_ones_bitcount    (x) ;
  193.     precomputed_ones   = precomputed_bitcount   (x) ;
  194.     precomputed16_ones = precomputed16_bitcount (x) ;
  195.     parallel_ones      = parallel_bitcount      (x) ;
  196.     nifty_ones         = nifty_bitcount         (x) ;
  197.     mit_ones           = mit_bitcount           (x) ;
  198.  
  199.     if (iterated_ones != sparse_ones)
  200.     {
  201.         printf ("ERROR: sparse_bitcount (0x%x) not okay!\n", x) ;
  202.         exit (0) ;
  203.     }
  204.    
  205.     if (iterated_ones != dense_ones)
  206.     {
  207.         printf ("ERROR: dense_bitcount (0x%x) not okay!\n", x) ;
  208.         exit (0) ;
  209.     }
  210.  
  211.     if (iterated_ones != precomputed_ones)
  212.     {
  213.         printf ("ERROR: precomputed_bitcount (0x%x) not okay!\n", x) ;
  214.         exit (0) ;
  215.     }
  216.        
  217.     if (iterated_ones != precomputed16_ones)
  218.     {
  219.         printf ("ERROR: precomputed16_bitcount (0x%x) not okay!\n", x) ;
  220.         exit (0) ;
  221.     }
  222.    
  223.     if (iterated_ones != parallel_ones)
  224.     {
  225.         printf ("ERROR: parallel_bitcount (0x%x) not okay!\n", x) ;
  226.         exit (0) ;
  227.     }
  228.  
  229.     if (iterated_ones != nifty_ones)
  230.     {
  231.         printf ("ERROR: nifty_bitcount (0x%x) not okay!\n", x) ;
  232.         exit (0) ;
  233.     }
  234.  
  235.     if (mit_ones != nifty_ones)
  236.     {
  237.         printf ("ERROR: mit_bitcount (0x%x) not okay!\n", x) ;
  238.         exit (0) ;
  239.     }
  240.    
  241.     return ;
  242. }
  243.  
  244.  
  245. int main (void)
  246. {
  247.     int i ;
  248.    
  249.     compute_bits_in_char () ;
  250.     compute_bits_in_16bits () ;
  251.  
  252.     verify_bitcounts (UINT_MAX) ;
  253.     verify_bitcounts (0) ;
  254.    
  255.     for (i = 0 ; i < 100000 ; i++)
  256.         verify_bitcounts (lrand48 ()) ;
  257.    
  258.     printf ("All bitcounts seem okay!\n") ;
  259.    
  260.     return 0 ;
  261. }