changeset: 4414:df576f30c1d86395a584eef1558bb945483af1ab
tag: tip
user: Aidan Kehoe <kehoea(a)parhasard.net>
date: Wed Jan 30 09:26:59 2008 +0100
files: src/ChangeLog src/search.c tests/ChangeLog tests/automated/case-tests.el
description:
Correct case-insensitive search for non-case, non-ASCII chars. Add tests.
2008-01-30 Aidan Kehoe <kehoea(a)parhasard.net>
* automated/case-tests.el:
Check for a bug Mike Sperber reported; check algorithms used, if
available.
2008-01-30 Aidan Kehoe <kehoea(a)parhasard.net>
* search.c (debug-xemacs-searches):
New variable, available on debug builds. Used in
tests/automated/case-tests.el.
(search_buffer): Only store the charset_base for characters with
translations. Correct some comments, correct some checks. If
debug_xemacs_searches is non-zero, record which search was used.
(boyer_moore): Remove an assertion that was incorrect. Remove its
documentation. Correct an assertion dealing with equivalence
tables; we may end up looking through the equivalence table if a
non-ASCII non-case character was searched for.
diff -r dc84ec90b4633f49250715ff879aa66a7b6f8f99 -r df576f30c1d86395a584eef1558bb945483af1ab src/ChangeLog
--- a/src/ChangeLog Mon Jan 28 08:13:37 2008 +0100
+++ b/src/ChangeLog Wed Jan 30 09:26:59 2008 +0100
@@ -1,3 +1,16 @@ 2008-01-24 Mike Sperber <mike(a)xemacs.o
+2008-01-30 Aidan Kehoe <kehoea(a)parhasard.net>
+
+ * search.c (debug-xemacs-searches):
+ New variable, available on debug builds. Used in
+ tests/automated/case-tests.el.
+ (search_buffer): Only store the charset_base for characters with
+ translations. Correct some comments, correct some checks. If
+ debug_xemacs_searches is non-zero, record which search was used.
+ (boyer_moore): Remove an assertion that was incorrect. Remove its
+ documentation. Correct an assertion dealing with equivalence
+ tables; we may end up looking through the equivalence table if a
+ non-ASCII non-case character was searched for.
+
2008-01-24 Mike Sperber <mike(a)xemacs.org>
* make-src-depend (PrintDeps): Fix Perl code that no longer works
diff -r dc84ec90b4633f49250715ff879aa66a7b6f8f99 -r df576f30c1d86395a584eef1558bb945483af1ab src/search.c
--- a/src/search.c Mon Jan 28 08:13:37 2008 +0100
+++ b/src/search.c Wed Jan 30 09:26:59 2008 +0100
@@ -46,6 +46,15 @@ Boston, MA 02111-1307, USA. */
(!NILP (table) ? TRT_TABLE_OF (table, (Ichar) pos) : pos)
#define REGEXP_CACHE_SIZE 20
+
+#ifdef DEBUG_XEMACS
+
+/* Used in tests/automated/case-tests.el if available. */
+Fixnum debug_xemacs_searches;
+
+Lisp_Object Qsearch_algorithm_used, Qboyer_moore, Qsimple_search;
+
+#endif
/* If the regexp is non-nil, then the buffer contains the compiled form
of that regexp, suitable for searching. */
@@ -1370,16 +1379,10 @@ search_buffer (struct buffer *buf, Lisp_
inv_bytelen = set_itext_ichar (tmp_str, inverse);
new_bytelen = set_itext_ichar (tmp_str, translated);
- if (-1 == charset_base)
- {
- /* Keep track of which charset and character set row
- contains the characters that need translation.
-
- Zero out the bits corresponding to the last byte. */
- charset_base = c & ~ICHAR_FIELD3_MASK;
- }
-
- if (boyer_moore_ok && (translated != c || inverse != c))
+ if (boyer_moore_ok
+ /* Only do the Boyer-Moore check for characters needing
+ translation. */
+ && (translated != c || inverse != c))
{
Ichar starting_c = c;
int charset_base_code;
@@ -1396,21 +1399,34 @@ search_buffer (struct buffer *buf, Lisp_
if (c > 0xFF && nothing_greater_than_0xff)
continue;
- charset_base_code = c & ~ICHAR_FIELD3_MASK;
-
- if (charset_base_code != charset_base)
+ if (-1 == charset_base) /* No charset yet specified. */
{
- /* If two different rows, or two different charsets,
- appear, needing translation, then we cannot use
- boyer_moore search. See the comment at the head of
- boyer_moore(). */
- boyer_moore_ok = 0;
- break;
+ /* Keep track of which charset and character set row
+ contains the characters that need translation.
+
+ Zero out the bits corresponding to the last
+ byte. */
+ charset_base = c & ~ICHAR_FIELD3_MASK;
+ }
+ else
+ {
+ charset_base_code = c & ~ICHAR_FIELD3_MASK;
+
+ if (charset_base_code != charset_base)
+ {
+ /* If two different rows, or two different
+ charsets, appear, needing non-ASCII
+ translation, then we cannot use boyer_moore
+ search. See the comment at the head of
+ boyer_moore(). */
+ boyer_moore_ok = 0;
+ break;
+ }
}
} while (c != starting_c);
- if (boyer_moore_ok && (charset_base !=
- (translated & ~ICHAR_FIELD3_MASK)))
+ if (boyer_moore_ok && charset_base != -1 &&
+ charset_base != (translated & ~ICHAR_FIELD3_MASK))
{
/* In the rare event that the CANON entry for this
character is not in the desired set, choose one
@@ -1437,6 +1453,12 @@ search_buffer (struct buffer *buf, Lisp_
base_pat += orig_bytelen;
len -= orig_bytelen;
}
+
+ if (-1 == charset_base)
+ {
+ charset_base = 'a' & ~ICHAR_FIELD3_MASK; /* Default to ASCII. */
+ }
+
#else /* not MULE */
while (--len >= 0)
{
@@ -1453,6 +1475,15 @@ search_buffer (struct buffer *buf, Lisp_
#endif /* MULE */
len = pat - patbuf;
pat = base_pat = patbuf;
+
+#ifdef DEBUG_XEMACS
+ if (debug_xemacs_searches)
+ {
+ Lisp_Symbol *sym = XSYMBOL (Qsearch_algorithm_used);
+ sym->value = boyer_moore_ok ? Qboyer_moore : Qsimple_search;
+ }
+#endif
+
if (boyer_moore_ok)
return boyer_moore (buf, base_pat, len, pos, lim, n,
trt, inverse_trt, charset_base);
@@ -1595,9 +1626,9 @@ simple_search (struct buffer *buf, Ibyte
TRT and INVERSE_TRT are translation tables.
This kind of search works if all the characters in PAT that have
- nontrivial translation are the same aside from the last byte. This
- makes it possible to translate just the last byte of a character,
- and do so after just a simple test of the context.
+ (non-ASCII) translation are the same aside from the last byte. This
+ makes it possible to translate just the last byte of a character, and do
+ so after just a simple test of the context.
If that criterion is not satisfied, do not call this function. You will
get an assertion failure. */
@@ -1740,11 +1771,6 @@ boyer_moore (struct buffer *buf, Ibyte *
charstart--;
untranslated = itext_ichar (charstart);
- /* We shouldn't have been passed a string with varying
- character sets or rows. That's what simple_search is
- for. */
- assert (charset_base == (untranslated & ~ICHAR_FIELD3_MASK));
-
ch = TRANSLATE (trt, untranslated);
if (!ibyte_first_byte_p (*ptr))
{
@@ -1753,7 +1779,8 @@ boyer_moore (struct buffer *buf, Ibyte *
translate_anteprev_byte = ptr[-2];
}
- if (charset_base != (ch & ~ICHAR_FIELD3_MASK))
+ if (ch != untranslated && /* Was translation done? */
+ charset_base != (ch & ~ICHAR_FIELD3_MASK))
{
/* In the very rare event that the CANON entry for this
character is not in the desired set, choose one that
@@ -1765,21 +1792,23 @@ boyer_moore (struct buffer *buf, Ibyte *
We can get here if search_buffer has worked out that
the buffer is entirely single width. */
Ichar starting_ch = ch;
+ int count = 0;
do
{
ch = TRANSLATE (inverse_trt, ch);
if (charset_base == (ch & ~ICHAR_FIELD3_MASK))
break;
-
+ ++count;
} while (starting_ch != ch);
- /* If starting_ch is equal to ch, the case table is
- corrupt. (Any mapping in the canon table should be
- reflected in the equivalence table, and we know from
- the canon table that untranslated maps to starting_ch
- and that untranslated has the correct value for
- charset_base.) */
- assert (starting_ch != ch);
+ /* If starting_ch is equal to ch (and count is not one,
+ which means no translation is necessary), the case
+ table is corrupt. (Any mapping in the canon table
+ should be reflected in the equivalence table, and we
+ know from the canon table that untranslated maps to
+ starting_ch and that untranslated has the correct value
+ for charset_base.) */
+ assert (1 == count || starting_ch != ch);
}
}
else
@@ -3320,4 +3349,15 @@ occur and a back reference to one of the
Vskip_chars_range_table = Fmake_range_table (Qstart_closed_end_closed);
staticpro (&Vskip_chars_range_table);
-}
+#ifdef DEBUG_XEMACS
+ DEFSYMBOL (Qsearch_algorithm_used);
+ DEFSYMBOL (Qboyer_moore);
+ DEFSYMBOL (Qsimple_search);
+
+ DEFVAR_INT ("debug-xemacs-searches", &debug_xemacs_searches /*
+If non-zero, bind `search-algorithm-used' to `boyer-moore' or `simple-search',
+depending on the algorithm used for each search. Used for testing.
+*/ );
+ debug_xemacs_searches = 0;
+#endif
+}
diff -r dc84ec90b4633f49250715ff879aa66a7b6f8f99 -r df576f30c1d86395a584eef1558bb945483af1ab tests/ChangeLog
--- a/tests/ChangeLog Mon Jan 28 08:13:37 2008 +0100
+++ b/tests/ChangeLog Wed Jan 30 09:26:59 2008 +0100
@@ -1,3 +1,9 @@ 2008-01-16 Aidan Kehoe <kehoea@parhasa
+2008-01-30 Aidan Kehoe <kehoea(a)parhasard.net>
+
+ * automated/case-tests.el:
+ Check for a bug Mike Sperber reported; check algorithms used, if
+ available.
+
2008-01-16 Aidan Kehoe <kehoea(a)parhasard.net>
* automated/mule-tests.el (test-file-name):
diff -r dc84ec90b4633f49250715ff879aa66a7b6f8f99 -r df576f30c1d86395a584eef1558bb945483af1ab tests/automated/case-tests.el
--- a/tests/automated/case-tests.el Mon Jan 28 08:13:37 2008 +0100
+++ b/tests/automated/case-tests.el Wed Jan 30 09:26:59 2008 +0100
@@ -268,3 +268,37 @@
(goto-char (point-max))
(Assert (not (search-backward string nil t 6))))))
+;; Bug reported in http://mid.gmane.org/y9lk5lu5orq.fsf@deinprogramm.de from
+;; Michael Sperber. Fixed 2008-01-29.
+(with-string-as-buffer-contents "\n\nDer beruhmte deutsche Flei\xdf\n\n"
+ (goto-char (point-min))
+ (Assert (search-forward "Flei\xdf")))
+
+(Skip-Test-Unless
+ (boundp 'debug-xemacs-searches) ; normal when we have DEBUG_XEMACS
+ "not a DEBUG_XEMACS build"
+ (let ((debug-xemacs-searches 1))
+ (with-temp-buffer
+ (insert "\n\nDer beruhmte deutsche Fleiss\n\n")
+ (goto-char (point-min))
+ (search-forward "Fleiss")
+ (delete-region (point-min) (point-max))
+ (insert "\n\nDer beruhmte deutsche Flei\xdf\n\n")
+ (goto-char (point-min))
+ (search-forward "Flei\xdf")
+ (Assert (eq 'boyer-moore search-algorithm-used))
+ (delete-region (point-min) (point-max))
+ (when (featurep 'mule)
+ (insert "\n\nDer beruhmte deutsche Flei\xdf\n\n")
+ (goto-char (point-min))
+ (Assert
+ (search-forward (format "Fle%c\xdf"
+ (make-char 'latin-iso8859-9 #xfd))))
+ (Assert (eq 'boyer-moore search-algorithm-used))
+ (insert (make-char 'latin-iso8859-9 #xfd))
+ (goto-char (point-min))
+ (Assert
+ (search-forward (format "Fle%c\xdf"
+ (make-char 'latin-iso8859-9 #xfd))))
+ (Assert (eq 'simple-search search-algorithm-used))))))
+
_______________________________________________
XEmacs-Patches mailing list
XEmacs-Patches(a)xemacs.org
http://calypso.tux.org/cgi-bin/mailman/listinfo/xemacs-patches