>>>> "Marcus" == Marcus Crestani
<crestani(a)informatik.uni-tuebingen.de> writes:
>>>>> "JR" == Jan Rychter <jan(a)rychter.com> writes: 
JR> I'll probably send the patch to Marcus Crestani, as it isn't
 JR> production-ready yet, it's closer to a proof-of-concept.
 Marcus> You should definitely send me your patch, improvements are
 Marcus> welcome!
Hi Marcus,
Well, I was really hoping to finish the work, but it seems I will not
have any time to work on this during the next few weeks. So, I'm sending
you what I have right now, at least you'll know where I've been going
with this work.
The diff is attached at the end of the E-mail. It works for me and is
and pretty well tested on my machine. It makes my XEmacs about 5% faster
overall on byte-compilation tasks. It is *not* production-ready, though,
and needs work.
The speedup comes from three things:
1. Processing bits 32 at a time.
2. Replacing the integer division in get_mark_bit_index by a multiply,
   two lookups and a shift. Surprisingly, on a P4, the special
   bsf/bsr instruction is slower than a lookup.
3. The gc_checking_assert in get_mark_bit really hurts. I'd suggest that
   it be removed as soon as we know it doesn't get hit under normal
   circumstances.
Caveats:
The patch will only work on 32-bit architectures, since it makes use of
wraparound in integer multiply.
It will only work for one mark bit per object. Do you actually plan to
use more? We will likely lose a lot of the efficiency if we
conditionalize this.
After the optimizations, the worst performance loss occurs in
kkcc_marking, and that's because of pipeline stalls due to mispredicted
branches. I don't know what we could do about this, other than rewrite
the switch as a series of conditionals ordered so that branch prediction
can do a better job.
--J.
Index: mc-alloc.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/mc-alloc.c,v
retrieving revision 1.5
diff -u -3 -p -u -r1.5 mc-alloc.c
--- mc-alloc.c	2005/10/14 01:22:01	1.5
+++ mc-alloc.c	2005/11/23 11:13:23
@@ -581,15 +581,72 @@ get_page_header (void *ptr)
 }
 
 
+
+/* We use an algorithm for division by constants. This is taken from
+   "Hacker's Delight". --Jan Rychter <jan(a)rychter.com> */
+
+/* minv_table is used to hold the multiplier constants, while
+   shift_table holds the shift amounts for even dividers --jwr */
+unsigned int *minv_table;
+unsigned char *shift_table;
+
+unsigned mulinv_even(unsigned d) {           // d must be odd. 
+  unsigned x1, v1, x2, v2, x3, v3, q; 
+
+  x1 = 0xFFFFFFFF;     v1 = -d; 
+  x2 = 1;              v2 = d; 
+  while (v2 != 0) {
+    q = v1/v2; 
+    x3 = x1 - q*x2;   v3 = v1 - q*v2; 
+    x1 = x2;          v1 = v2; 
+    x2 = x3;          v2 = v3; 
+  } 
+  return(x1); 
+}
+
+
+/* Theoretically one might get an improvement by aligning minv_table and shift_table
+   on cache line boundaries --jwr */
+void compute_mulinv_table(unsigned int start, 
+			  unsigned int end, 
+			  unsigned int step) {
+  unsigned int minv, shiftamount, i, div;
+  minv_table = (unsigned int*)(malloc(sizeof(unsigned int) * (end + step)));
+  shift_table = (unsigned char*)(malloc(sizeof(unsigned char) * (end + step)));
+  for(i = start; i <= end; i += step) {
+    minv = mulinv_even(i);
+    minv_table[i] = minv;
+    div = i;
+    shiftamount = 0;
+    while(~div & 1) { div >>= 1; shiftamount++; };
+    shift_table[i] = (unsigned char)shiftamount;
+  }
+  minv_table[0] = 0;
+  shift_table[0] = 0;
+}
+
+/* Surprisingly enough, using this assembly instruction is slower on a
+   P4 than using a lookup table for the shifts. This might not be true
+   on other CPUs, for example bsf/bsr is known to be much faster on
+   Pentium-3 and Pentium-M. Benchmark! --jwr */
+static inline unsigned int trailing_0_bits(unsigned int word) {
+  __asm__("bsf %1,%0":"=r" (word):"r" (word));
+  return word;
+}
+
+static inline unsigned int mulinv_divide(unsigned int a, unsigned int b) {
+#if 1
+   return (unsigned int)(a * minv_table[b]) >> shift_table[b];
+#else
+   return (unsigned int)(a * minv_table[b]) >> trailing_0_bits(b);
+#endif
+}
+
 /* Returns the mark bit index of a given heap address. */
 static EMACS_INT
 get_mark_bit_index (void *ptr, page_header *ph)
 {
-  EMACS_INT cell_size = PH_CELL_SIZE (ph);
-  if (cell_size)
-    return (((EMACS_INT) ptr - (EMACS_INT)(PH_HEAP_SPACE (ph))) / cell_size);
-  else /* only one object on page */
-    return 0;
+  return mulinv_divide(((EMACS_INT) ptr - (EMACS_INT)(PH_HEAP_SPACE (ph))), PH_CELL_SIZE
(ph));
 }
 
 
@@ -664,6 +721,14 @@ get_bit (char *bit_array, EMACS_INT pos)
 }
 
 
+/* Returns 32 bits at pos. */
+static EMACS_INT
+get_32bits (char *bit_array, EMACS_INT pos)
+{
+  return (*(unsigned int*)(bit_array + (pos>>3)));
+}
+
+
 /* Bit_Arrays bit at pos to val. */
 static void
 set_bit(char *bit_array, EMACS_INT pos, EMACS_INT val)
@@ -730,6 +795,13 @@ do {						\
     ZERO_MARK_BITS_WORD (ph);			\
 } while (0)
 
+#define GET_32BITS(bit, ph, p)			\
+do {						\
+  if (USE_PNTR_MARK_BITS (ph))			\
+    bit = get_32bits(PH_MARK_BITS (ph), p);	\
+  else						\
+    bit = PH_MARK_BITS (ph);	\
+} while (0)
 
 /* Allocates mark-bit space either from a free list or from the OS
    for the given page header. */
@@ -801,11 +873,8 @@ get_mark_bit (void *ptr)
 {
   EMACS_INT bit = 0;
   page_header *ph = get_page_header (ptr);
-  gc_checking_assert (ph && PH_ON_USED_LIST_P (ph));
-  if (ph)
-    {
-      GET_BIT (bit, ph, get_mark_bit_index (ptr, ph));
-    }
+  //gc_checking_assert (ph && PH_ON_USED_LIST_P (ph));
+  GET_BIT (bit, ph, get_mark_bit_index (ptr, ph));
   return bit;
 }
 
@@ -815,11 +884,8 @@ void
 set_mark_bit (void *ptr, EMACS_INT value)
 {
   page_header *ph = get_page_header (ptr);
-  assert (ph && PH_ON_USED_LIST_P (ph));
-  if (ph)
-    {
-      SET_BIT (ph, get_mark_bit_index (ptr, ph), value);
-    }
+  //assert (ph && PH_ON_USED_LIST_P (ph));
+  SET_BIT (ph, get_mark_bit_index (ptr, ph), value);
 }
 
 
@@ -1529,22 +1595,35 @@ finalize_page (page_header *ph)
   EMACS_INT heap_space_step = PH_CELL_SIZE (ph);
   EMACS_INT mark_bit = 0;
   EMACS_INT mark_bit_max_index = PH_CELLS_ON_PAGE (ph);
-  int bit = 0;
+  unsigned int bit = 0;
+  unsigned int bit_words;
+  int i, j;
 
   mark_free_list (ph);
-
-  for (mark_bit = 0; mark_bit < mark_bit_max_index; mark_bit++)
-    {
-      GET_BIT (bit, ph, mark_bit);
-      if (!bit) 
-        {
-	  EMACS_INT ptr = (heap_space + (heap_space_step * mark_bit));
-	  MC_ALLOC_CALL_FINALIZER ((void *) ptr);
-        }
+  
+  bit_words = mark_bit_max_index >> 5;
+  
+  for(i=0; i < bit_words; i++) {
+    GET_32BITS (bit, ph, mark_bit);
+    for(j=0; j < 32; j++) {
+      if (!(bit&1)) {
+	EMACS_INT ptr = (heap_space + (heap_space_step * mark_bit));
+	MC_ALLOC_CALL_FINALIZER ((void *) ptr);
+      }
+      mark_bit++;
+      bit >>= 1;
+    }
+  }
+  while(mark_bit < mark_bit_max_index) {
+    GET_BIT (bit, ph, mark_bit);
+    if (!bit) {
+      EMACS_INT ptr = (heap_space + (heap_space_step * mark_bit));
+      MC_ALLOC_CALL_FINALIZER ((void *) ptr);
     }
+    mark_bit++;
+  }
 }
 
-
 /* Finalize a page for disksave. XEmacs calls this routine before it
    dumps the heap image. You have to tell mc-alloc how to call your
    object's finalizer for disksave. Therefore, you have to define the
@@ -1595,18 +1674,32 @@ sweep_page (page_header *ph)
   EMACS_INT heap_space_step = PH_CELL_SIZE (ph);
   EMACS_INT mark_bit = 0;
   EMACS_INT mark_bit_max_index = PH_CELLS_ON_PAGE (ph);
-  int bit = 0;
+  unsigned int bit = 0;
+  unsigned int bit_words;
+  int i, j;
 
   mark_free_list (ph);
 
-  for (mark_bit = 0; mark_bit < mark_bit_max_index; mark_bit++)
-    {
+  bit_words = mark_bit_max_index >> 5;
+
+  for(i=0; i < bit_words; i++) {
+    GET_32BITS (bit, ph, mark_bit);
+    for(j=0; j < 32; j++) {
+      if (!(bit&1)) {
+	remove_cell (heap_space + (heap_space_step * mark_bit), ph);
+      }
+      mark_bit++;
+      bit >>= 1;
+    }
+  }
+  while(mark_bit < mark_bit_max_index) {
       GET_BIT (bit, ph, mark_bit);
-      if (!bit) 
-	{
-	  remove_cell (heap_space + (heap_space_step * mark_bit), ph);
-	}
+      if (!bit) {
+	remove_cell (heap_space + (heap_space_step * mark_bit), ph);
+      }
+      mark_bit++;
     }
+
   zero_mark_bits (ph);
   if (PH_CELLS_USED (ph) == 0)
     remove_page_from_used_list (ph);
@@ -1750,6 +1843,9 @@ init_mc_allocator (void)
 #endif
 
   init_lookup_table ();
+  compute_mulinv_table(USED_LIST_MIN_OBJECT_SIZE,
+		       USED_LIST_UPPER_THRESHOLD,
+		       USED_LIST_LIN_STEP);
 }