i remember hitting this bug before, and ever since then i've been very leery of
shy grouping.
but i got so annoyed to find out about this bug and the horrendously messed up
pseudo-implementation of shy groups that we had that i spent a few hours tonight
that should have gone elsewhere fixing the bug.
attached is the patch. the expressions below work, and the reegxp test suite
passes [that's not saying much though].
everyone, please try out this patch and see if you notice *anything* different,
esp. wrt regexps. i feel pretty good about the patch but i may have done
something stupid ... if all goes well, perhaps we can maybe consider this for
21.4 ...
ben
Brady Montz wrote:
"Stefan Monnier" <monnier+lists.xemacs.beta/news/(a)RUM.cs.yale.edu>
writes:
> >>>>> "Brady" == Brady Montz <bradym(a)balestra.org>
writes:
> > Can anyone explain to me why:
> > (string-match "\\( *a+\\)+" "a a")
> > returns 0 and
> > (string-match "\\(?: *a+\\)+" "a a")
> > returns nil
>
> Because the regexp engine has several bugs in its handling of loops.
> Some of them are due to assumptions about the shape of compiled regexps
> which don't hold any more since (?:...) was introduced.
Oh, that's unfortunate. I'll see if I can get a version to work for me
without the shy grouping.
--
Brady Montz
bradym(a)balestra.org
--
ben
I'm sometimes slow in getting around to reading my mail, so if you
want to reach me faster, call 520-661-6661.
See
http://www.666.com/ben/chronic-pain/ for the hell I've been
through.
Index: regex.c
===================================================================
RCS file: /usr/CVSroot/XEmacs/xemacs/src/regex.c,v
retrieving revision 1.25
diff -u -r1.25 regex.c
--- src/regex.c 2001/04/12 18:24:15 1.25
+++ src/regex.c 2001/04/25 11:47:21
@@ -415,7 +415,7 @@
/* Start remembering the text that is matched, for storing in a
register. Followed by one byte with the register number, in
- the range 0 to one less than the pattern buffer's re_nsub
+ the range 1 to the pattern buffer's re_ngroups
field. Then followed by one byte with the number of groups
inner to this one. (This last has to be part of the
start_memory only because we need it in the on_failure_jump
@@ -424,7 +424,7 @@
/* Stop remembering the text that is matched and store it in a
memory register. Followed by one byte with the register
- number, in the range 0 to one less than `re_nsub' in the
+ number, in the range 1 to `re_ngroups' in the
pattern buffer, and one byte with the number of inner groups,
just like `start_memory'. (We need the number of inner
groups here because we don't have any easy way of finding the
@@ -971,6 +971,7 @@
}
printf ("re_nsub: %ld\t", (long)bufp->re_nsub);
+ printf ("re_ngroups: %ld\t", (long)bufp->re_ngroups);
printf ("regs_alloc: %d\t", bufp->regs_allocated);
printf ("can_be_null: %d\t", bufp->can_be_null);
printf ("newline_anchor: %d\n", bufp->newline_anchor);
@@ -980,6 +981,20 @@
printf ("syntax: %d\n", bufp->syntax);
/* Perhaps we should print the translate table? */
/* and maybe the category table? */
+
+ if (bufp->external_to_internal_register)
+ {
+ int i;
+
+ printf ("external_to_internal_register:\n");
+ for (i = 0; i <= bufp->re_nsub; i++)
+ {
+ if (i > 0)
+ printf (", ");
+ printf ("%d -> %d", i, bufp->external_to_internal_register[i]);
+ }
+ printf ("\n");
+ }
}
@@ -1694,9 +1709,13 @@
#define MAX_REGNUM 255
/* But patterns can have more than `MAX_REGNUM' registers. We just
- ignore the excess. */
+ ignore the excess.
+ #### not true! groups past this will fail in lots of ways, if we
+ ever have to backtrack.
+ */
typedef unsigned regnum_t;
+#define INIT_REG_TRANSLATE_SIZE 5
/* Macros for the compile stack. */
@@ -1880,7 +1899,9 @@
`syntax' is set to SYNTAX;
`used' is set to the length of the compiled pattern;
`fastmap_accurate' is zero;
- `re_nsub' is the number of subexpressions in PATTERN;
+ `re_ngroups' is the number of groups/subexpressions (including shy
+ groups) in PATTERN;
+ `re_nsub' is the number of non-shy groups in PATTERN;
`not_bol' and `not_eol' are zero;
The `fastmap' and `newline_anchor' fields are neither
@@ -1978,7 +1999,24 @@
/* Always count groups, whether or not bufp->no_sub is set. */
bufp->re_nsub = 0;
+ bufp->re_ngroups = 0;
+
+ if (bufp->external_to_internal_register == 0)
+ {
+ bufp->external_to_internal_register_size = INIT_REG_TRANSLATE_SIZE;
+ RETALLOC (bufp->external_to_internal_register,
+ bufp->external_to_internal_register_size,
+ int);
+ }
+
+ {
+ int i;
+ bufp->external_to_internal_register[0] = 0;
+ for (i = 1; i < bufp->external_to_internal_register_size; i++)
+ bufp->external_to_internal_register[i] = (int) 0xDEADBEEF;
+ }
+
#if !defined (emacs) && !defined (SYNTAX_TABLE)
/* Initialize the syntax table. */
init_syntax_once ();
@@ -2560,6 +2598,7 @@
handle_open:
{
regnum_t r;
+ int shy = 0;
if (!(syntax & RE_NO_SHY_GROUPS)
&& p != pend
@@ -2570,20 +2609,41 @@
switch (c)
{
case ':': /* shy groups */
- r = MAX_REGNUM + 1;
+ shy = 1;
break;
/* All others are reserved for future constructs. */
default:
FREE_STACK_RETURN (REG_BADPAT);
}
- }
- else
- {
- bufp->re_nsub++;
- r = ++regnum;
}
+ r = ++regnum;
+ bufp->re_ngroups++;
+ if (!shy)
+ {
+ bufp->re_nsub++;
+ while (bufp->external_to_internal_register_size <=
+ bufp->re_nsub)
+ {
+ int i;
+ int old_size =
+ bufp->external_to_internal_register_size;
+ bufp->external_to_internal_register_size += 5;
+ RETALLOC (bufp->external_to_internal_register,
+ bufp->external_to_internal_register_size,
+ int);
+ /* debugging */
+ for (i = old_size;
+ i < bufp->external_to_internal_register_size; i++)
+ bufp->external_to_internal_register[i] =
+ (int) 0xDEADBEEF;
+ }
+
+ bufp->external_to_internal_register[bufp->re_nsub] =
+ bufp->re_ngroups;
+ }
+
if (COMPILE_STACK_FULL)
{
RETALLOC (compile_stack.stack, compile_stack.size << 1,
@@ -2606,7 +2666,10 @@
/* We will eventually replace the 0 with the number of
groups inner to this one. But do not push a
start_memory for groups beyond the last one we can
- represent in the compiled pattern. */
+ represent in the compiled pattern.
+ #### bad bad bad. this will fail in lots of ways, if we
+ ever have to backtrack for these groups.
+ */
if (r <= MAX_REGNUM)
{
COMPILE_STACK_TOP.inner_group_offset
@@ -2997,17 +3060,44 @@
case '6': case '7': case '8': case '9':
{
regnum_t reg;
+ int may_need_to_unfetch = 0;
if (syntax & RE_NO_BK_REFS)
goto normal_char;
+ /* This only goes up to 99. It could be extended to work
+ up to 255 (the maximum number of registers that can be
+ handled by the current regexp engine, because it stores
+ its register numbers in the compiled pattern as one byte,
+ ugh). Doing that's a bit trickier, because you might
+ have the case where \25 a back-ref but \255 is not, ... */
reg = c - '0';
-
- if (reg > regnum)
+ if (p < pend)
+ {
+ PATFETCH (c);
+ if (c >= '0' && c <= '9')
+ {
+ regnum_t new_reg = reg * 10 + c - '0';
+ if (new_reg <= bufp->re_nsub)
+ {
+ reg = new_reg;
+ may_need_to_unfetch = 1;
+ }
+ else
+ PATUNFETCH;
+ }
+ }
+
+ if (reg > bufp->re_nsub)
FREE_STACK_RETURN (REG_ESUBREG);
+ reg = bufp->external_to_internal_register[reg];
/* Can't back reference to a subexpression if inside of it. */
if (group_in_compile_stack (compile_stack, reg))
- goto normal_char;
+ {
+ if (may_need_to_unfetch)
+ PATUNFETCH;
+ goto normal_char;
+ }
laststart = buf_end;
BUF_PUSH_2 (duplicate, reg);
@@ -3125,7 +3215,7 @@
isn't necessary unless we're trying to avoid calling alloca in
the search and match routines. */
{
- int num_regs = bufp->re_nsub + 1;
+ int num_regs = bufp->re_ngroups + 1;
/* Since DOUBLE_FAIL_STACK refuses to double only if the current size
is strictly greater than re_max_failures, the largest possible stack
@@ -4386,7 +4476,7 @@
/* We fill all the registers internally, independent of what we
return, for use in backreferences. The number here includes
an element for register zero. */
- unsigned num_regs = bufp->re_nsub + 1;
+ unsigned num_regs = bufp->re_ngroups + 1;
/* The currently active registers. */
unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG;
@@ -4472,7 +4562,7 @@
there are groups, we include space for register 0 (the whole
pattern), even though we never use it, since it simplifies the
array indexing. We should fix this. */
- if (bufp->re_nsub)
+ if (bufp->re_ngroups)
{
regstart = REGEX_TALLOC (num_regs, re_char *);
regend = REGEX_TALLOC (num_regs, re_char *);
@@ -4650,12 +4740,13 @@
/* If caller wants register contents data back, do it. */
if (regs && !bufp->no_sub)
{
+ int num_nonshy_regs = bufp->re_nsub + 1;
/* Have the register data arrays been allocated? */
if (bufp->regs_allocated == REGS_UNALLOCATED)
{ /* No. So allocate them with malloc. We need one
extra element beyond `num_regs' for the `-1' marker
GNU code uses. */
- regs->num_regs = MAX (RE_NREGS, num_regs + 1);
+ regs->num_regs = MAX (RE_NREGS, num_nonshy_regs + 1);
regs->start = TALLOC (regs->num_regs, regoff_t);
regs->end = TALLOC (regs->num_regs, regoff_t);
if (regs->start == NULL || regs->end == NULL)
@@ -4669,9 +4760,9 @@
{ /* Yes. If we need more elements than were already
allocated, reallocate them. If we need fewer, just
leave it alone. */
- if (regs->num_regs < num_regs + 1)
+ if (regs->num_regs < num_nonshy_regs + 1)
{
- regs->num_regs = num_regs + 1;
+ regs->num_regs = num_nonshy_regs + 1;
RETALLOC (regs->start, regs->num_regs, regoff_t);
RETALLOC (regs->end, regs->num_regs, regoff_t);
if (regs->start == NULL || regs->end == NULL)
@@ -4701,16 +4792,19 @@
/* Go through the first `min (num_regs, regs->num_regs)'
registers, since that is all we initialized. */
- for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
+ for (mcnt = 1; mcnt < MIN (num_nonshy_regs, regs->num_regs);
+ mcnt++)
{
- if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
+ int internal_reg = bufp->external_to_internal_register[mcnt];
+ if (REG_UNSET (regstart[internal_reg]) ||
+ REG_UNSET (regend[internal_reg]))
regs->start[mcnt] = regs->end[mcnt] = -1;
else
{
- regs->start[mcnt]
- = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
- regs->end[mcnt]
- = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
+ regs->start[mcnt] =
+ (regoff_t) POINTER_TO_OFFSET (regstart[internal_reg]);
+ regs->end[mcnt] =
+ (regoff_t) POINTER_TO_OFFSET (regend[internal_reg]);
}
}
@@ -4719,7 +4813,7 @@
we (re)allocated the registers, this is the case,
because we always allocate enough to have at least one
-1 at the end. */
- for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
+ for (mcnt = num_nonshy_regs; mcnt < regs->num_regs; mcnt++)
regs->start[mcnt] = regs->end[mcnt] = -1;
} /* regs && !bufp->no_sub */
@@ -5065,11 +5159,15 @@
/* \<digit> has been turned into a `duplicate' command which is
- followed by the numeric value of <digit> as the register number. */
+ followed by the numeric value of <digit> as the register number.
+ (Already passed through external-to-internal-register mapping,
+ so it refers to the actual group number, not the non-shy-only
+ numbering used in the external world.) */
case duplicate:
{
REGISTER re_char *d2, *dend2;
- int regno = *p++; /* Get which register to match against. */
+ /* Get which register to match against. */
+ int regno = *p++;
DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
/* Can't back reference a group which we've never matched. */
@@ -6222,6 +6320,8 @@
`newline_anchor' to REG_NEWLINE being set in CFLAGS;
`fastmap' and `fastmap_accurate' to zero;
`re_nsub' to the number of subexpressions in PATTERN.
+ (non-shy of course. POSIX probably doesn't know about
+ shy ones, and in any case they should be invisible.)
PATTERN is the address of the pattern string.
Index: regex.h
===================================================================
RCS file: /usr/CVSroot/XEmacs/xemacs/src/regex.h,v
retrieving revision 1.7
diff -u -r1.7 regex.h
--- src/regex.h 2001/04/12 18:24:16 1.7
+++ src/regex.h 2001/04/25 11:47:25
@@ -153,6 +153,12 @@
If not set, then an unmatched ) is invalid. */
#define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_SHY_GROUPS << 1)
+/* If this bit is set, then \22 will read as a back reference,
+ provided at least 22 non-shy groups have been seen so far. In all
+ other cases (bit not set, not 22 non-shy groups seen so far), it
+ reads as a back reference \2 followed by a digit 2. */
+#define RE_NO_MULTI_DIGIT_BK_REFS (RE_UNMATCHED_RIGHT_PAREN_ORD << 1)
+
/* This global variable defines the particular regexp syntax to use (for
some interfaces). When a regexp is compiled, the syntax used is
stored in the pattern buffer, so changing this does not affect
@@ -170,7 +176,7 @@
| RE_NO_BK_PARENS | RE_NO_BK_REFS \
| RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \
| RE_UNMATCHED_RIGHT_PAREN_ORD | RE_NO_SHY_GROUPS \
- | RE_NO_MINIMAL_MATCHING)
+ | RE_NO_MINIMAL_MATCHING | RE_NO_MULTI_DIGIT_BK_REFS)
#define RE_SYNTAX_POSIX_AWK \
(RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS)
@@ -179,17 +185,18 @@
(RE_BK_PLUS_QM | RE_CHAR_CLASSES \
| RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \
| RE_NEWLINE_ALT | RE_NO_SHY_GROUPS \
- | RE_NO_MINIMAL_MATCHING)
+ | RE_NO_MINIMAL_MATCHING | RE_NO_MULTI_DIGIT_BK_REFS)
#define RE_SYNTAX_EGREP \
(RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \
| RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \
| RE_NEWLINE_ALT | RE_NO_BK_PARENS \
| RE_NO_BK_VBAR | RE_NO_SHY_GROUPS \
- | RE_NO_MINIMAL_MATCHING)
+ | RE_NO_MINIMAL_MATCHING | RE_NO_MULTI_DIGIT_BK_REFS)
#define RE_SYNTAX_POSIX_EGREP \
- (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES)
+ (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES | \
+ RE_NO_MULTI_DIGIT_BK_REFS)
/* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */
#define RE_SYNTAX_ED RE_SYNTAX_POSIX_BASIC
@@ -200,7 +207,7 @@
#define _RE_SYNTAX_POSIX_COMMON \
(RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \
| RE_INTERVALS | RE_NO_EMPTY_RANGES | RE_NO_SHY_GROUPS \
- | RE_NO_MINIMAL_MATCHING)
+ | RE_NO_MINIMAL_MATCHING | RE_NO_MULTI_DIGIT_BK_REFS)
#define RE_SYNTAX_POSIX_BASIC \
(_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM)
@@ -337,9 +344,14 @@
when it is matched. */
RE_TRANSLATE_TYPE translate;
- /* Number of subexpressions found by the compiler. */
+ /* Number of returnable groups found by the compiler. (This does
+ not count shy groups.) */
size_t re_nsub;
+ /* Total number of groups found by the compiler. (Including
+ shy ones.) */
+ int re_ngroups;
+
/* Zero if this pattern cannot match the empty string, one else.
Well, in truth it's used only in `re_search_2', to see
whether or not we should use the fastmap, so we don't set
@@ -373,6 +385,12 @@
/* If true, an anchor at a newline matches. */
unsigned newline_anchor : 1;
+
+ /* Mapping between back references and groups (may not be
+ equivalent with shy groups). */
+ int *external_to_internal_register;
+
+ int external_to_internal_register_size;
/* [[[end pattern_buffer]]] */
};