carbon2-commit: Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.

Saturday, 5 April 2008


        changeset:   4426:4ee73bbe4f8e814fb02c11cb586a6ea3e539e720
parent:      4366:cc293ef846d240af187a523bb32eb5e26a083531
user:        Aidan Kehoe <kehoea(a)parhasard.net>
date:        Wed Dec 26 17:30:16 2007 +0100
files:       src/ChangeLog src/casetab.c src/search.c
description:
Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.

2007-12-26  Aidan Kehoe  <kehoea(a)parhasard.net>

	* casetab.c:
	Extend and correct some case table documentation.
	* search.c (search_buffer):
	Correct a bug where only the first entry for a character in the
	case equivalence table was examined in determining if the
	Boyer-Moore search algorithm is appropriate.

	If there are case mappings outside of the charset and row of the
	characters specified in the search string, those case mappings can
	be safely ignored (and Boyer-Moore search can be used) if we know
	from the buffer statistics that the corresponding characters cannot
	occur.

	* search.c (boyer_moore):
	Assert that we haven't been passed a string with varying
	character sets or rows within character sets. That's what
	simple_search is for.

	In the very rare event that a character in the search string has a
	canonical case mapping that is not in the same character set and
	row, don't try to search for the canonical character, search for
	some other character that is in the desired character set and
	row. Assert that the case table isn't corrupt.

	Do not search for any character case mappings that cannot possibly
	occur in the buffer, given the buffer metadata about its
	contents.


diff -r cc293ef846d240af187a523bb32eb5e26a083531 -r 4ee73bbe4f8e814fb02c11cb586a6ea3e539e720 src/ChangeLog
--- a/src/ChangeLog	Mon Dec 24 14:00:51 2007 +0100
+++ b/src/ChangeLog	Wed Dec 26 17:30:16 2007 +0100
@@ -1,3 +1,33 @@ 2007-12-24  Aidan Kehoe  <kehoea＠parhasa
+2007-12-26  Aidan Kehoe  <kehoea(a)parhasard.net>
+
+	* casetab.c:
+	Extend and correct some case table documentation. 
+	* search.c (search_buffer):
+	Correct a bug where only the first entry for a character in the
+	case equivalence table was examined in determining if the
+	Boyer-Moore search algorithm is appropriate.
+
+	If there are case mappings outside of the charset and row of the
+	characters specified in the search string, those case mappings can
+	be safely ignored (and Boyer-Moore search can be used) if we know
+	from the buffer statistics that the corresponding characters cannot
+	occur.
+
+	* search.c (boyer_moore):
+	Assert that we haven't been passed a string with varying
+	character sets or rows within character sets. That's what
+	simple_search is for. 
+
+	In the very rare event that a character in the search string has a
+	canonical case mapping that is not in the same character set and
+	row, don't try to search for the canonical character, search for
+	some other character that is in the desired character set and
+	row. Assert that the case table isn't corrupt.
+
+	Do not search for any character case mappings that cannot possibly
+	occur in the buffer, given the buffer metadata about its
+	contents. 
+
 2007-12-24  Aidan Kehoe  <kehoea(a)parhasard.net>
 
 	* symbols.c (Fintern_soft): 
diff -r cc293ef846d240af187a523bb32eb5e26a083531 -r 4ee73bbe4f8e814fb02c11cb586a6ea3e539e720 src/casetab.c
--- a/src/casetab.c	Mon Dec 24 14:00:51 2007 +0100
+++ b/src/casetab.c	Wed Dec 26 17:30:16 2007 +0100
@@ -48,13 +48,28 @@ Boston, MA 02111-1307, USA.  */
    or vice versa, both characters will have the same entry in the canon
    table.
 
-   (4) `equiv' lists the "equivalence classes" defined by `canon'.  Imagine
+   (4) `eqv' lists the "equivalence classes" defined by `canon'.  Imagine
    that all characters are divided into groups having the same `canon'
-   entry; these groups are called "equivalence classes" and `equiv' lists
-   them by linking the characters in each equivalence class together in a
-   circular list.
-
-   `canon' is used when doing case-insensitive comparisons.  `equiv' is
+   entry; these groups are called "equivalence classes" and `eqv' lists them
+   by linking the characters in each equivalence class together in a
+   circular list. That is, to find out all the members of a given char's
+   equivalence class, you need something like the following code:
+
+    (let* ((char ?i)
+           (original-char char)
+           (standard-case-eqv (case-table-eqv (standard-case-table))))
+      (loop
+        with res = (list char)
+        until (eq (setq char (get-char-table char standard-case-eqv))
+                  original-char)
+        do (push char res)
+        finally return res))
+
+   (Where #'case-table-eqv doesn't yet exist, and probably never will, given
+   that the C code needs to keep it in a consistent state so Lisp can't mess
+   around with it.)
+
+   `canon' is used when doing case-insensitive comparisons.  `eqv' is
    used in the Boyer-Moore search code.
    */
 
diff -r cc293ef846d240af187a523bb32eb5e26a083531 -r 4ee73bbe4f8e814fb02c11cb586a6ea3e539e720 src/search.c
--- a/src/search.c	Mon Dec 24 14:00:51 2007 +0100
+++ b/src/search.c	Wed Dec 26 17:30:16 2007 +0100
@@ -1340,11 +1340,14 @@ search_buffer (struct buffer *buf, Lisp_
     {
       int charset_base = -1;
       int boyer_moore_ok = 1;
-      Ibyte *pat = 0;
       Ibyte *patbuf = alloca_ibytes (len * MAX_ICHAR_LEN);
-      pat = patbuf;
+      Ibyte *pat = patbuf;
+
 #ifdef MULE
-      /* &&#### needs some 8-bit work here */
+      int entirely_one_byte_p = buf->text->entirely_one_byte_p;
+      int nothing_greater_than_0xff =
+        buf->text->num_8_bit_fixed_chars == BUF_Z(buf) - BUF_BEG (buf);
+
       while (len > 0)
 	{
 	  Ibyte tmp_str[MAX_ICHAR_LEN];
@@ -1367,23 +1370,68 @@ search_buffer (struct buffer *buf, Lisp_
 	  inv_bytelen = set_itext_ichar (tmp_str, inverse);
 	  new_bytelen = set_itext_ichar (tmp_str, translated);
 
-	  if (new_bytelen != orig_bytelen || inv_bytelen != orig_bytelen)
-	    boyer_moore_ok = 0;
-	  if (translated != c || inverse != c)
-	    {
-	      /* Keep track of which charset and character set row
-		 contains the characters that need translation.
-		 Zero out the bits corresponding to the last byte.
-	      */
-	      int charset_base_code = c & ~ICHAR_FIELD3_MASK;
-	      if (charset_base == -1)
-		charset_base = charset_base_code;
-	      else if (charset_base != charset_base_code)
-		/* If two different rows appear, needing translation, then
-		   we cannot use boyer_moore search.  See the comment at the
-		   head of boyer_moore(). */
-		boyer_moore_ok = 0;
-	    }
+          if (-1 == charset_base)
+            {
+              /* Keep track of which charset and character set row
+                 contains the characters that need translation.
+
+                 Zero out the bits corresponding to the last byte. */
+              charset_base = c & ~ICHAR_FIELD3_MASK;
+            }
+
+          if (boyer_moore_ok && (translated != c || inverse != c))
+            {
+	      Ichar starting_c = c;
+	      int charset_base_code;
+
+	      do 
+		{
+		  c = TRANSLATE (inverse_trt, c);
+
+                  /* If a character cannot occur in the buffer, ignore
+                     it. */
+                  if (c > 0x7F && entirely_one_byte_p)
+                    continue;
+
+                  if (c > 0xFF && nothing_greater_than_0xff)
+                    continue;
+
+                  charset_base_code = c & ~ICHAR_FIELD3_MASK;
+
+                  if (charset_base_code != charset_base)
+                    {
+                      /* If two different rows, or two different charsets,
+                         appear, needing translation, then we cannot use
+                         boyer_moore search.  See the comment at the head of
+                         boyer_moore(). */
+                      boyer_moore_ok = 0;
+                      break;
+                    }
+                } while (c != starting_c);
+
+              if (boyer_moore_ok && (charset_base != 
+                                     (translated & ~ICHAR_FIELD3_MASK)))
+                {
+                  /* In the rare event that the CANON entry for this
+                     character is not in the desired set, choose one
+                     that is, from the equivalence set. It doesn't much
+                     matter which. */
+                  Ichar starting_ch = translated;
+                  do
+                    {
+                      translated = TRANSLATE (inverse_trt, translated);
+
+                      if (charset_base == (translated & ~ICHAR_FIELD3_MASK))
+                        break;
+
+                    } while (starting_ch != translated);
+
+                  assert (starting_ch != translated);
+
+                  new_bytelen = set_itext_ichar (tmp_str, translated);
+                }
+            }
+
 	  memcpy (pat, tmp_str, new_bytelen);
 	  pat += new_bytelen;
 	  base_pat += orig_bytelen;
@@ -1551,14 +1599,14 @@ simple_search (struct buffer *buf, Ibyte
    makes it possible to translate just the last byte of a character,
    and do so after just a simple test of the context.
 
-   If that criterion is not satisfied, do not call this function.  */
+   If that criterion is not satisfied, do not call this function.  You will
+   get an assertion failure. */
 	    
 static Charbpos
 boyer_moore (struct buffer *buf, Ibyte *base_pat, Bytecount len,
 	     Bytebpos pos, Bytebpos lim, EMACS_INT n, Lisp_Object trt,
 	     Lisp_Object inverse_trt, int USED_IF_MULE (charset_base))
 {
-  /* &&#### needs some 8-bit work here */
   /* #### Someone really really really needs to comment the workings
      of this junk somewhat better.
 
@@ -1602,6 +1650,13 @@ boyer_moore (struct buffer *buf, Ibyte *
 #ifdef MULE
   Ibyte translate_prev_byte = 0;
   Ibyte translate_anteprev_byte = 0;
+  /* These need to be rethought in the event that the internal format
+     changes, or in the event that num_8_bit_fixed_chars disappears
+     (entirely_one_byte_p can be trivially worked out by checking is the
+     byte count equal to the char count.)  */
+  int buffer_entirely_one_byte_p = buf->text->entirely_one_byte_p;
+  int buffer_nothing_greater_than_0xff =
+    buf->text->num_8_bit_fixed_chars == BUF_Z(buf) - BUF_BEG (buf);
 #endif
 #ifdef C_ALLOCA
   EMACS_INT BM_tab_space[0400];
@@ -1684,20 +1739,47 @@ boyer_moore (struct buffer *buf, Ibyte *
 	      while (!ibyte_first_byte_p (*charstart))
 		charstart--;
 	      untranslated = itext_ichar (charstart);
-	      if (charset_base == (untranslated & ~ICHAR_FIELD3_MASK))
-		{
-		  ch = TRANSLATE (trt, untranslated);
-		  if (!ibyte_first_byte_p (*ptr))
-		    {
-		      translate_prev_byte = ptr[-1];
-		      if (!ibyte_first_byte_p (translate_prev_byte))
-			translate_anteprev_byte = ptr[-2];
-		    }
-		}
-	      else
-		{
-		  this_translated = 0;
-		  ch = *ptr;
+
+              /* We shouldn't have been passed a string with varying
+                 character sets or rows. That's what simple_search is
+                 for.  */
+              assert (charset_base == (untranslated & ~ICHAR_FIELD3_MASK));
+
+              ch = TRANSLATE (trt, untranslated);
+              if (!ibyte_first_byte_p (*ptr))
+                {
+                  translate_prev_byte = ptr[-1];
+                  if (!ibyte_first_byte_p (translate_prev_byte))
+                    translate_anteprev_byte = ptr[-2];
+                }
+
+              if (charset_base != (ch & ~ICHAR_FIELD3_MASK))
+                {
+                  /* In the very rare event that the CANON entry for this
+                     character is not in the desired set, choose one that
+                     is, from the equivalence set. It doesn't much matter
+                     which, since we're building our own cheesy equivalence
+                     table instead of using that belonging to the case
+                     table directly.
+
+                     We can get here if search_buffer has worked out that
+                     the buffer is entirely single width. */
+                  Ichar starting_ch = ch;
+                  do
+                    {
+                      ch = TRANSLATE (inverse_trt, ch);
+                      if (charset_base == (ch & ~ICHAR_FIELD3_MASK))
+                        break;
+
+                    } while (starting_ch != ch);
+
+                  /* If starting_ch is equal to ch, the case table is
+                     corrupt. (Any mapping in the canon table should be
+                     reflected in the equivalence table, and we know from
+                     the canon table that untranslated maps to starting_ch
+                     and that untranslated has the correct value for
+                     charset_base.) */
+                  assert (starting_ch != ch);
 		}
 	    }
 	  else
@@ -1713,28 +1795,34 @@ boyer_moore (struct buffer *buf, Ibyte *
 	  if (i == infinity)
 	    stride_for_teases = BM_tab[j];
 	  BM_tab[j] = dirlen - i;
-	  /* A translation table is accompanied by its inverse --
-	     see comment in casetab.c. */
+	  /* A translation table is accompanied by its inverse -- see
+	     comment in casetab.c. */
 	  if (this_translated)
 	    {
 	      Ichar starting_ch = ch;
 	      EMACS_INT starting_j = j;
-	      while (1)
+	      do
 		{
 		  ch = TRANSLATE (inverse_trt, ch);
-		  if (ch > 0400)
-		    j = ((unsigned char) ch | 0200);
-		  else
-		    j = (unsigned char) ch;
-
-		  /* For all the characters that map into CH,
-		     set up simple_translate to map the last byte
-		     into STARTING_J.  */
-		  simple_translate[j] = (Ibyte) starting_j;
-		  if (ch == starting_ch)
-		    break;
-		  BM_tab[j] = dirlen - i;
-		}
+
+                  if (ch > 0x7F && buffer_entirely_one_byte_p)
+                    continue;
+
+                  if (ch > 0xFF && buffer_nothing_greater_than_0xff)
+                    continue;
+
+                  if (ch > 0400)
+                    j = ((unsigned char) ch | 0200);
+                  else
+                    j = (unsigned char) ch;
+
+                  /* For all the characters that map into CH, set up
+                     simple_translate to map the last byte into
+                     STARTING_J.  */
+                  simple_translate[j] = (Ibyte) starting_j;
+                  BM_tab[j] = dirlen - i;
+
+		} while (ch != starting_ch);
 	    }
 #else
 	  EMACS_INT k;


_______________________________________________
XEmacs-Patches mailing list
XEmacs-Patches(a)xemacs.org
http://calypso.tux.org/cgi-bin/mailman/listinfo/xemacs-patches

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

2008

2007

2006

2005

2004

2003

carbon2-commit: Always use boyer_moore in ASCII or Latin-1 buffers with ASCII search strings.