APPROVE COMMIT
NOTE: This patch has been committed.
# HG changeset patch
# User Aidan Kehoe <kehoea(a)parhasard.net>
# Date 1487457620 0
#      Sat Feb 18 22:40:20 2017 +0000
# Node ID 1b46b9bed2b920005232158a29a3082ce41046b0
# Parent  7b5e4836946734e0c1c38b2a4a98638f44d76ff2
Further bug fixing, regexp interval handling code.
src/ChangeLog addition:
2017-02-18  Aidan Kehoe  <kehoea(a)parhasard.net>
	Further bug fixing in the regexp interval handling code, thank you
	for the bug report, Stephen Turnbull.
	* regex.c:
	* regex.c (itext_ichar_eql): Provide this for the external
	programs that use regex.c
	* regex.c (GET_UNSIGNED_NUMBER):
	Revise this not to eat the next character after a number, even
	with a zero-length number. Avoids a specific error seen with VM
	reported by Stephen Turnbull, which I finally manage to provoke
	today.
	* regex.c (BAD_INTERVAL): New, macro to give up on encountering an
	invalid macro or to fall back to parsing as a literal.
	* regex.c (regex_compile):
	1. Default LOWER_BOUND to 0 when parsing intervals, as does GNU
	since 2000.
	2. Error explicitly if an interval starts with +.
	3. Use BAD_INTERVAL() as appropriate.
	4. Adapt uses of GET_UNSIGNED_NUMBER() to its new calling
	convention.
tests/ChangeLog addition:
2017-02-18  Aidan Kehoe  <kehoea(a)parhasard.net>
	* automated/regexp-tests.el:
	Test several bugs (/ GNU incompatibilities) with the interval code
	fixed today.
diff -r 7b5e48369467 -r 1b46b9bed2b9 src/ChangeLog
--- a/src/ChangeLog	Tue Feb 14 23:41:49 2017 +0000
+++ b/src/ChangeLog	Sat Feb 18 22:40:20 2017 +0000
@@ -1,3 +1,26 @@
+2017-02-18  Aidan Kehoe  <kehoea(a)parhasard.net>
+
+	Further bug fixing in the regexp interval handling code, thank you
+	for the bug report, Stephen Turnbull.
+
+	* regex.c:
+	* regex.c (itext_ichar_eql): Provide this for the external
+	programs that use regex.c
+	* regex.c (GET_UNSIGNED_NUMBER):
+	Revise this not to eat the next character after a number, even
+	with a zero-length number. Avoids a specific error seen with VM
+	reported by Stephen Turnbull, which I finally manage to provoke
+	today.
+	* regex.c (BAD_INTERVAL): New, macro to give up on encountering an
+	invalid macro or to fall back to parsing as a literal.
+	* regex.c (regex_compile):
+	1. Default LOWER_BOUND to 0 when parsing intervals, as does GNU
+	since 2000.
+	2. Error explicitly if an interval starts with +.
+	3. Use BAD_INTERVAL() as appropriate.
+	4. Adapt uses of GET_UNSIGNED_NUMBER() to its new calling
+	convention.
+
 2017-02-14  Aidan Kehoe  <kehoea(a)parhasard.net>
 
 	* data.c (Faref):
diff -r 7b5e48369467 -r 1b46b9bed2b9 src/regex.c
--- a/src/regex.c	Tue Feb 14 23:41:49 2017 +0000
+++ b/src/regex.c	Sat Feb 18 22:40:20 2017 +0000
@@ -98,6 +98,7 @@
 #define itext_ichar(str)				((Ichar) (str)[0])
 #define itext_ichar_fmt(str, fmt, object)		((Ichar) (str)[0])
 #define itext_ichar_ascii_fmt(str, fmt, object)	((Ichar) (str)[0])
+#define itext_ichar_eql(str, ch)                (((Ichar) (str)[0]) == (ch))
 
 #if (LONGBITS > INTBITS)
 # define EMACS_INT long
@@ -2143,7 +2144,7 @@
    code. Also avoid the silent overflow issues of the non-emacs code below.
    If the string at P is not exhausted, leave P pointing at the next
    (probable-)non-digit byte encountered. */
-#define GET_UNSIGNED_NUMBER_1(num) do \
+#define GET_UNSIGNED_NUMBER(num) do \
     {                                                                   \
       Ibyte *_gus_numend = NULL;                                        \
       Lisp_Object _gus_numno;                                           \
@@ -2159,8 +2160,7 @@
          code that parses regexps is not aware of this. */              \
       _gus_numno = parse_integer (p, &_gus_numend, limit, 10, 1,        \
                                   Vdigit_fixnum_ascii);                 \
-      PATFETCH (c);                                                     \
-      if (c != '-' && FIXNUMP (_gus_numno))                            
\
+      if (FIXNUMP (_gus_numno) && XREALFIXNUM (_gus_numno) >= 0)        \
         {                                                               \
           num = XREALFIXNUM (_gus_numno);                               \
           p = _gus_numend;                                              \
@@ -2168,7 +2168,7 @@
     } while (0)
 #else
 /* Get the next unsigned number in the uncompiled pattern.  */
-#define GET_UNSIGNED_NUMBER_1(num) 					\
+#define GET_UNSIGNED_NUMBER(num) 					\
   { if (p != pend)							\
      {									\
        int _gun_do_unfetch = 1;                                         \
@@ -2194,16 +2194,6 @@
   }
 #endif
 
-#define GET_UNSIGNED_NUMBER(num) do                                     \
-    {                                                                   \
-      GET_UNSIGNED_NUMBER_1 (num);                                      \
-      if (p != pend)                                                    \
-        {                                                               \
-          PATFETCH (c);                                                 \
-        }                                                               \
-    } while (0)
-
-
 /* Map a string to the char class it names (if any). BEG points to the string
    to be parsed and LIMIT is the length, in bytes, of that string.
 
@@ -3476,41 +3466,55 @@
                   || (p - 2 == pattern  &&  p == pend))
                 goto normal_backslash;
 
+#define BAD_INTERVAL()                                  \
+                do                                      \
+                  {                                     \
+                    if (syntax & RE_NO_BK_BRACES)       \
+                      {                                 \
+                        goto unfetch_interval;          \
+                      }                                 \
+                    else                                \
+                      {                                 \
+                        FREE_STACK_RETURN (REG_EBRACE); \
+                      }                                 \
+                  } while (0)
+
             handle_interval:
               {
                 /* If got here, then the syntax allows intervals.  */
 
                 /* At least (most) this many matches must be made.  */
-                int lower_bound = -1, upper_bound = -1;
+                int lower_bound = 0, upper_bound = -1;
 
                 beg_interval = p - 1;
 
-                if (p == pend)
-                  {
-                    if (syntax & RE_NO_BK_BRACES)
-                      goto unfetch_interval;
-                    else
-                      FREE_STACK_RETURN (REG_EBRACE);
-                  }
+                if (p == pend || itext_ichar_eql (p, '+')) BAD_INTERVAL ();
 
                 GET_UNSIGNED_NUMBER (lower_bound);
 
+                if (p == pend) BAD_INTERVAL ();
+                PATFETCH (c);
+
                 if (c == ',')
                   {
+                    if (p == pend || itext_ichar_eql (p, '+'))
+                      BAD_INTERVAL ();
+
                     GET_UNSIGNED_NUMBER (upper_bound);
                     if (upper_bound < 0) upper_bound = RE_DUP_MAX;
+
+                    if (p == pend) BAD_INTERVAL ();
+                    PATFETCH (c);
                   }
                 else
-                  /* Interval such as `{1}' => match exactly once. */
-                  upper_bound = lower_bound;
-
-                if (lower_bound < 0 || upper_bound > RE_DUP_MAX
-                    || lower_bound > upper_bound)
                   {
-                    if (syntax & RE_NO_BK_BRACES)
-                      goto unfetch_interval;
-                    else
-                      FREE_STACK_RETURN (REG_BADBR);
+                    /* Interval such as `{1}' => match exactly once. */
+                    upper_bound = lower_bound;
+                  }
+
+                if (upper_bound > RE_DUP_MAX || lower_bound > upper_bound)
+                  {
+                    BAD_INTERVAL ();
                   }
 
                 if (!(syntax & RE_NO_BK_BRACES))
@@ -3524,15 +3528,12 @@
 
                 if (c != '}')
                   {
-                    if (syntax & RE_NO_BK_BRACES)
-                      goto unfetch_interval;
-                    else
-                      FREE_STACK_RETURN (REG_BADBR);
+                    BAD_INTERVAL ();
                   }
 
                 /* We just parsed a valid interval.  */
 
-                /* If it's invalid to have no preceding re.  */
+                /* It's invalid to have no preceding RE.  */
                 if (!laststart)
                   {
                     if (syntax & RE_CONTEXT_INVALID_OPS)
@@ -3622,6 +3623,7 @@
                 beg_interval = NULL;
               }
               break;
+#undef BAD_INTERVAL
 
             unfetch_interval:
               /* If an invalid interval, match the characters as literals.  */
@@ -3731,10 +3733,7 @@
 		  goto normal_char;
 
                 PATUNFETCH;
-                GET_UNSIGNED_NUMBER_1 (reg); /* We want P pointing at the next
-                                                non-digit character, don't use
-                                                GET_UNSIGNED_NUMBER, which
-                                                consumes that. */
+                GET_UNSIGNED_NUMBER (reg);
 		  
                 /* Progressively divide down the backreference until we find
                    one that corresponds to an existing register. */
diff -r 7b5e48369467 -r 1b46b9bed2b9 tests/ChangeLog
--- a/tests/ChangeLog	Tue Feb 14 23:41:49 2017 +0000
+++ b/tests/ChangeLog	Sat Feb 18 22:40:20 2017 +0000
@@ -1,3 +1,9 @@
+2017-02-18  Aidan Kehoe  <kehoea(a)parhasard.net>
+
+	* automated/regexp-tests.el:
+	Test several bugs (/ GNU incompatibilities) with the interval code
+	fixed today.
+
 2017-02-14  Aidan Kehoe  <kehoea(a)parhasard.net>
 
 	* automated/lisp-tests.el (probe-bounds-of-aref):
diff -r 7b5e48369467 -r 1b46b9bed2b9 tests/automated/regexp-tests.el
--- a/tests/automated/regexp-tests.el	Tue Feb 14 23:41:49 2017 +0000
+++ b/tests/automated/regexp-tests.el	Sat Feb 18 22:40:20 2017 +0000
@@ -1214,6 +1214,39 @@
 ;; GET_UNSIGNED_NUMBER in regex.c used to eat the next character, check it
 ;; doesn't anymore.
 (Assert (eql (string-match "\\(x\\)\\(\\1\\)" "xx") 0))
+(Assert (eql (string-match "[\t\f\n\r]\\{1,\\}" "hello\tthere")
+             (length "hello")))
+
+(Assert (eql (string-match "[\t\f\n\r]\\{1\\}" "hello\tthere")
+             (length "hello")))
+
+(Check-Error invalid-regexp
+             (string-match "[\t\f\n\r]\\{+1,\\}" "hello\tthere"))
+
+;; LOWER_BOUND defaulting to 0 is a GNU change introduced 99633e97e95 in March
+;; 2000 by Stefan Monnier.
+(Assert (eql (string-match "[\t\f\n\r]\\{,1\\}" "hello\tthere")
+             (string-match "[\t\f\n\r]\\{0,1\\}" "hello\tthere")))
+(Assert (eql (string-match "[\t\f\n\r]\\{,\\}" "hello\tthere") 0))
+
+;; Repeat counts are only sixteen bits.
+(Check-Error invalid-regexp
+             (string-match "[\t\f\n\r]\\{1073741821,1073741823\\}"
+                           "hello\tthere"))
+;; And they're not negative.
+(Check-Error invalid-regexp
+             (string-match "[\t\f\n\r]\\{-3,-2\\}"
+                           "hello\tthere"))
+
+;; Backreferences are even more limited.
+(Check-Error invalid-regexp
+             (string-match "\\([\t\f\n\r]\\{,1\\}\\)\\1073741823"
+                           "hello\tthere"))
+
+;; The - and the 0 are treated as a literals in the following, there is no
+;; attempt to parse as a backreference and no error.
+(Assert (null (string-match "\\([\t\f\n\r]\\{,1\\}\\)\\-1"
"hello\tthere")))
+(Assert (null (string-match "\\([\t\f\n\r]\\{,1\\}\\)\\0"
"hello\tthere"))
 
 (with-temp-buffer
   (insert "hi there")
-- 
‘As I sat looking up at the Guinness ad, I could never figure out /
How your man stayed up on the surfboard after forty pints of stout’
(C. Moore)