APPROVE COMMIT
NOTE: This patch has been committed.
# HG changeset patch
# User Aidan Kehoe <kehoea(a)parhasard.net>
# Date 1487457620 0
# Sat Feb 18 22:40:20 2017 +0000
# Node ID 1b46b9bed2b920005232158a29a3082ce41046b0
# Parent 7b5e4836946734e0c1c38b2a4a98638f44d76ff2
Further bug fixing, regexp interval handling code.
src/ChangeLog addition:
2017-02-18 Aidan Kehoe <kehoea(a)parhasard.net>
Further bug fixing in the regexp interval handling code, thank you
for the bug report, Stephen Turnbull.
* regex.c:
* regex.c (itext_ichar_eql): Provide this for the external
programs that use regex.c
* regex.c (GET_UNSIGNED_NUMBER):
Revise this not to eat the next character after a number, even
with a zero-length number. Avoids a specific error seen with VM
reported by Stephen Turnbull, which I finally manage to provoke
today.
* regex.c (BAD_INTERVAL): New, macro to give up on encountering an
invalid macro or to fall back to parsing as a literal.
* regex.c (regex_compile):
1. Default LOWER_BOUND to 0 when parsing intervals, as does GNU
since 2000.
2. Error explicitly if an interval starts with +.
3. Use BAD_INTERVAL() as appropriate.
4. Adapt uses of GET_UNSIGNED_NUMBER() to its new calling
convention.
tests/ChangeLog addition:
2017-02-18 Aidan Kehoe <kehoea(a)parhasard.net>
* automated/regexp-tests.el:
Test several bugs (/ GNU incompatibilities) with the interval code
fixed today.
diff -r 7b5e48369467 -r 1b46b9bed2b9 src/ChangeLog
--- a/src/ChangeLog Tue Feb 14 23:41:49 2017 +0000
+++ b/src/ChangeLog Sat Feb 18 22:40:20 2017 +0000
@@ -1,3 +1,26 @@
+2017-02-18 Aidan Kehoe <kehoea(a)parhasard.net>
+
+ Further bug fixing in the regexp interval handling code, thank you
+ for the bug report, Stephen Turnbull.
+
+ * regex.c:
+ * regex.c (itext_ichar_eql): Provide this for the external
+ programs that use regex.c
+ * regex.c (GET_UNSIGNED_NUMBER):
+ Revise this not to eat the next character after a number, even
+ with a zero-length number. Avoids a specific error seen with VM
+ reported by Stephen Turnbull, which I finally manage to provoke
+ today.
+ * regex.c (BAD_INTERVAL): New, macro to give up on encountering an
+ invalid macro or to fall back to parsing as a literal.
+ * regex.c (regex_compile):
+ 1. Default LOWER_BOUND to 0 when parsing intervals, as does GNU
+ since 2000.
+ 2. Error explicitly if an interval starts with +.
+ 3. Use BAD_INTERVAL() as appropriate.
+ 4. Adapt uses of GET_UNSIGNED_NUMBER() to its new calling
+ convention.
+
2017-02-14 Aidan Kehoe <kehoea(a)parhasard.net>
* data.c (Faref):
diff -r 7b5e48369467 -r 1b46b9bed2b9 src/regex.c
--- a/src/regex.c Tue Feb 14 23:41:49 2017 +0000
+++ b/src/regex.c Sat Feb 18 22:40:20 2017 +0000
@@ -98,6 +98,7 @@
#define itext_ichar(str) ((Ichar) (str)[0])
#define itext_ichar_fmt(str, fmt, object) ((Ichar) (str)[0])
#define itext_ichar_ascii_fmt(str, fmt, object) ((Ichar) (str)[0])
+#define itext_ichar_eql(str, ch) (((Ichar) (str)[0]) == (ch))
#if (LONGBITS > INTBITS)
# define EMACS_INT long
@@ -2143,7 +2144,7 @@
code. Also avoid the silent overflow issues of the non-emacs code below.
If the string at P is not exhausted, leave P pointing at the next
(probable-)non-digit byte encountered. */
-#define GET_UNSIGNED_NUMBER_1(num) do \
+#define GET_UNSIGNED_NUMBER(num) do \
{ \
Ibyte *_gus_numend = NULL; \
Lisp_Object _gus_numno; \
@@ -2159,8 +2160,7 @@
code that parses regexps is not aware of this. */ \
_gus_numno = parse_integer (p, &_gus_numend, limit, 10, 1, \
Vdigit_fixnum_ascii); \
- PATFETCH (c); \
- if (c != '-' && FIXNUMP (_gus_numno)) \
+ if (FIXNUMP (_gus_numno) && XREALFIXNUM (_gus_numno) >= 0) \
{ \
num = XREALFIXNUM (_gus_numno); \
p = _gus_numend; \
@@ -2168,7 +2168,7 @@
} while (0)
#else
/* Get the next unsigned number in the uncompiled pattern. */
-#define GET_UNSIGNED_NUMBER_1(num) \
+#define GET_UNSIGNED_NUMBER(num) \
{ if (p != pend) \
{ \
int _gun_do_unfetch = 1; \
@@ -2194,16 +2194,6 @@
}
#endif
-#define GET_UNSIGNED_NUMBER(num) do \
- { \
- GET_UNSIGNED_NUMBER_1 (num); \
- if (p != pend) \
- { \
- PATFETCH (c); \
- } \
- } while (0)
-
-
/* Map a string to the char class it names (if any). BEG points to the string
to be parsed and LIMIT is the length, in bytes, of that string.
@@ -3476,41 +3466,55 @@
|| (p - 2 == pattern && p == pend))
goto normal_backslash;
+#define BAD_INTERVAL() \
+ do \
+ { \
+ if (syntax & RE_NO_BK_BRACES) \
+ { \
+ goto unfetch_interval; \
+ } \
+ else \
+ { \
+ FREE_STACK_RETURN (REG_EBRACE); \
+ } \
+ } while (0)
+
handle_interval:
{
/* If got here, then the syntax allows intervals. */
/* At least (most) this many matches must be made. */
- int lower_bound = -1, upper_bound = -1;
+ int lower_bound = 0, upper_bound = -1;
beg_interval = p - 1;
- if (p == pend)
- {
- if (syntax & RE_NO_BK_BRACES)
- goto unfetch_interval;
- else
- FREE_STACK_RETURN (REG_EBRACE);
- }
+ if (p == pend || itext_ichar_eql (p, '+')) BAD_INTERVAL ();
GET_UNSIGNED_NUMBER (lower_bound);
+ if (p == pend) BAD_INTERVAL ();
+ PATFETCH (c);
+
if (c == ',')
{
+ if (p == pend || itext_ichar_eql (p, '+'))
+ BAD_INTERVAL ();
+
GET_UNSIGNED_NUMBER (upper_bound);
if (upper_bound < 0) upper_bound = RE_DUP_MAX;
+
+ if (p == pend) BAD_INTERVAL ();
+ PATFETCH (c);
}
else
- /* Interval such as `{1}' => match exactly once. */
- upper_bound = lower_bound;
-
- if (lower_bound < 0 || upper_bound > RE_DUP_MAX
- || lower_bound > upper_bound)
{
- if (syntax & RE_NO_BK_BRACES)
- goto unfetch_interval;
- else
- FREE_STACK_RETURN (REG_BADBR);
+ /* Interval such as `{1}' => match exactly once. */
+ upper_bound = lower_bound;
+ }
+
+ if (upper_bound > RE_DUP_MAX || lower_bound > upper_bound)
+ {
+ BAD_INTERVAL ();
}
if (!(syntax & RE_NO_BK_BRACES))
@@ -3524,15 +3528,12 @@
if (c != '}')
{
- if (syntax & RE_NO_BK_BRACES)
- goto unfetch_interval;
- else
- FREE_STACK_RETURN (REG_BADBR);
+ BAD_INTERVAL ();
}
/* We just parsed a valid interval. */
- /* If it's invalid to have no preceding re. */
+ /* It's invalid to have no preceding RE. */
if (!laststart)
{
if (syntax & RE_CONTEXT_INVALID_OPS)
@@ -3622,6 +3623,7 @@
beg_interval = NULL;
}
break;
+#undef BAD_INTERVAL
unfetch_interval:
/* If an invalid interval, match the characters as literals. */
@@ -3731,10 +3733,7 @@
goto normal_char;
PATUNFETCH;
- GET_UNSIGNED_NUMBER_1 (reg); /* We want P pointing at the next
- non-digit character, don't use
- GET_UNSIGNED_NUMBER, which
- consumes that. */
+ GET_UNSIGNED_NUMBER (reg);
/* Progressively divide down the backreference until we find
one that corresponds to an existing register. */
diff -r 7b5e48369467 -r 1b46b9bed2b9 tests/ChangeLog
--- a/tests/ChangeLog Tue Feb 14 23:41:49 2017 +0000
+++ b/tests/ChangeLog Sat Feb 18 22:40:20 2017 +0000
@@ -1,3 +1,9 @@
+2017-02-18 Aidan Kehoe <kehoea(a)parhasard.net>
+
+ * automated/regexp-tests.el:
+ Test several bugs (/ GNU incompatibilities) with the interval code
+ fixed today.
+
2017-02-14 Aidan Kehoe <kehoea(a)parhasard.net>
* automated/lisp-tests.el (probe-bounds-of-aref):
diff -r 7b5e48369467 -r 1b46b9bed2b9 tests/automated/regexp-tests.el
--- a/tests/automated/regexp-tests.el Tue Feb 14 23:41:49 2017 +0000
+++ b/tests/automated/regexp-tests.el Sat Feb 18 22:40:20 2017 +0000
@@ -1214,6 +1214,39 @@
;; GET_UNSIGNED_NUMBER in regex.c used to eat the next character, check it
;; doesn't anymore.
(Assert (eql (string-match "\\(x\\)\\(\\1\\)" "xx") 0))
+(Assert (eql (string-match "[\t\f\n\r]\\{1,\\}" "hello\tthere")
+ (length "hello")))
+
+(Assert (eql (string-match "[\t\f\n\r]\\{1\\}" "hello\tthere")
+ (length "hello")))
+
+(Check-Error invalid-regexp
+ (string-match "[\t\f\n\r]\\{+1,\\}" "hello\tthere"))
+
+;; LOWER_BOUND defaulting to 0 is a GNU change introduced 99633e97e95 in March
+;; 2000 by Stefan Monnier.
+(Assert (eql (string-match "[\t\f\n\r]\\{,1\\}" "hello\tthere")
+ (string-match "[\t\f\n\r]\\{0,1\\}" "hello\tthere")))
+(Assert (eql (string-match "[\t\f\n\r]\\{,\\}" "hello\tthere") 0))
+
+;; Repeat counts are only sixteen bits.
+(Check-Error invalid-regexp
+ (string-match "[\t\f\n\r]\\{1073741821,1073741823\\}"
+ "hello\tthere"))
+;; And they're not negative.
+(Check-Error invalid-regexp
+ (string-match "[\t\f\n\r]\\{-3,-2\\}"
+ "hello\tthere"))
+
+;; Backreferences are even more limited.
+(Check-Error invalid-regexp
+ (string-match "\\([\t\f\n\r]\\{,1\\}\\)\\1073741823"
+ "hello\tthere"))
+
+;; The - and the 0 are treated as a literals in the following, there is no
+;; attempt to parse as a backreference and no error.
+(Assert (null (string-match "\\([\t\f\n\r]\\{,1\\}\\)\\-1" "hello\tthere")))
+(Assert (null (string-match "\\([\t\f\n\r]\\{,1\\}\\)\\0" "hello\tthere"))
(with-temp-buffer
(insert "hi there")
--
‘As I sat looking up at the Guinness ad, I could never figure out /
How your man stayed up on the surfboard after forty pints of stout’
(C. Moore)