APPROVE COMMIT
NOTE: This patch has been committed.
This differs from the patch in 17594.33203.166072.603993(a)parhasard.net in
that the new file automated/lisp-reader-tests.el is included.
lisp/ChangeLog addition:
2006-08-05 Aidan Kehoe <kehoea(a)parhasard.net>
* lisp.el (forward-sexp):
Handle raw strings specially just as we do structures. Fixes
problems evaluating them in *scratch*.
man/ChangeLog addition:
2006-08-05 Aidan Kehoe <kehoea(a)parhasard.net>
* lispref/objects.texi (String Type):
Give details of the raw string syntax, taken from SXEmacs and
Python.
src/ChangeLog addition:
2006-08-05 Aidan Kehoe <kehoea(a)parhasard.net>
* lread.c (read_unicode_escape):
Refactor this code out from read_escape, since it's now called
from read_string as well.
* lread.c (read_escape):
Call read_unicode_escape instead of using inline code,
* lread.c (read_string):
Refactor out from read1, provide raw and honor_unicode options.
* lread.c (read_raw_string):
Added, a function that calls read_string with the correct
arguments for a raw string.
* lread.c (read1):
Pass raw strings to read_raw_string; pass strings to read_string.
tests/ChangeLog addition:
2006-08-05 Aidan Kehoe <kehoea(a)parhasard.net>
* automated/lisp-reader-tests.el:
New file, imported from Martin Kuehl's SXEmacs commit; test the
new raw string syntax, including the Unicode escapes, which
SXEmacs doesn't have.
XEmacs Trunk source patch:
Diff command: cvs -q diff -Nu
Files affected: tests/automated/lisp-reader-tests.el src/lread.c man/lispref/objects.texi
lisp/lisp.el
Index: lisp/lisp.el
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/lisp/lisp.el,v
retrieving revision 1.5
diff -u -u -r1.5 lisp.el
--- lisp/lisp.el 2001/04/12 18:21:29 1.5
+++ lisp/lisp.el 2006/08/04 22:47:01
@@ -60,19 +60,20 @@
(interactive "_p")
(or arg (setq arg 1))
;; XEmacs: evil hack! The other half of the evil hack below.
- (if (and (> arg 0) (looking-at "#s("))
- (goto-char (+ (point) 2)))
+ (if (and (> arg 0) (looking-at "#s(\\|#r[uU]\\{0,1\\}\""))
+ (goto-char (1+ (- (point) (- (match-end 0) (match-beginning 0))))))
(goto-char (or (scan-sexps (point) arg) (buffer-end arg)))
- (if (< arg 0) (backward-prefix-chars))
- ;; XEmacs: evil hack! Skip back over #s so that structures are read
- ;; properly. the current cheesified syntax tables just aren't up to
- ;; this.
- (if (and (< arg 0)
- (eq (char-after (point)) ?\()
- (>= (- (point) (point-min)) 2)
- (eq (char-after (- (point) 1)) ?s)
- (eq (char-after (- (point) 2)) ?#))
- (goto-char (- (point) 2))))
+ (when (< arg 0)
+ (backward-prefix-chars)
+ ;; XEmacs: evil hack! Skip back over #[sr] so that structures and raw
+ ;; strings are read properly. the current cheesified syntax tables just
+ ;; aren't up to this.
+ (let* ((diff (- (point) (point-min)))
+ (subject (buffer-substring (- (point) (min diff 3))
+ (1+ (point))))
+ (matched (string-match "#s(\\|#r[uU]\\{0,1\\}\"" subject)))
+ (if matched
+ (goto-char (1+ (- (point) (- (length subject) matched))))))))
(defun backward-sexp (&optional arg)
"Move backward across one balanced expression (sexp).
Index: man/lispref/objects.texi
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/man/lispref/objects.texi,v
retrieving revision 1.8
diff -u -u -r1.8 objects.texi
--- man/lispref/objects.texi 2006/04/29 14:36:54 1.8
+++ man/lispref/objects.texi 2006/08/04 22:47:01
@@ -1079,6 +1079,16 @@
escape any backslash or double-quote characters in the string with a
backslash, like this: @code{"this \" is an embedded quote"}.
+ An alternative syntax allows insertion of raw backslashes into a
+string, like this: @code{#r"this \ is an embedded backslash"}. In such
+a string, each character following a backslash is included literally in
+the string, and all backslashes are left in the string. This means that
+@code{#r"\""} is a valid string literal with two characters, a backslash
and a
+double-quote. It also means that a string with this syntax @emph{cannot end
+in a single backslash}. As with Python, from where this syntax was
+taken, you can specify @code{u} or @code{U} after the @code{#r} to
+specify that interpretation of Unicode escapes should be done.
+
The newline character is not special in the read syntax for strings;
if you write a new line between the double-quotes, it becomes a
character in the string. But an escaped newline---one that is preceded
Index: src/lread.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/lread.c,v
retrieving revision 1.78
diff -u -u -r1.78 lread.c
--- src/lread.c 2006/06/03 17:50:54 1.78
+++ src/lread.c 2006/08/04 22:47:03
@@ -1670,15 +1670,56 @@
return val;
}
+
+/* A Unicode escape, as in C# (though we only permit them in strings
+ and characters, not arbitrarily in the source code.) */
+static Ichar
+read_unicode_escape (Lisp_Object readcharfun, int unicode_hex_count)
+{
+ REGISTER Ichar i = 0, c;
+ REGISTER int count = 0;
+ Lisp_Object lisp_char;
+ while (++count <= unicode_hex_count)
+ {
+ c = readchar (readcharfun);
+ /* Remember, can't use isdigit(), isalpha() etc. on Ichars */
+ if (c >= '0' && c <= '9') i = (i << 4) +
(c - '0');
+ else if (c >= 'a' && c <= 'f') i = (i << 4) +
(c - 'a') + 10;
+ else if (c >= 'A' && c <= 'F') i = (i << 4) +
(c - 'A') + 10;
+ else
+ {
+ syntax_error ("Non-hex digit used for Unicode escape",
+ make_char (c));
+ break;
+ }
+ }
+
+ lisp_char = Funicode_to_char(make_int(i), Qnil);
+
+ if (EQ(Qnil, lisp_char))
+ {
+ /* This is ugly and horrible and trashes the user's data, but
+ it's what unicode.c does. In the future, unicode-to-char
+ should not return nil. */
+#ifdef MULE
+ i = make_ichar (Vcharset_japanese_jisx0208, 34 + 128, 46 + 128);
+#else
+ i = '~';
+#endif
+ return i;
+ }
+ else
+ {
+ return XCHAR(lisp_char);
+ }
+}
+
static Ichar
read_escape (Lisp_Object readcharfun)
{
/* This function can GC */
Ichar c = readchar (readcharfun);
- /* \u allows up to four hex digits, \U up to eight. Default to the
- behaviour for \u, and change this value in the case that \U is seen. */
- int unicode_hex_count = 4;
if (c < 0)
signal_error (Qend_of_file, 0, READCHARFUN_MAYBE (readcharfun));
@@ -1797,50 +1838,11 @@
}
case 'U':
/* Post-Unicode-2.0: Up to eight hex chars */
- unicode_hex_count = 8;
+ return read_unicode_escape(readcharfun, 8);
case 'u':
+ /* Unicode-2.0 and before; four hex chars. */
+ return read_unicode_escape(readcharfun, 4);
- /* A Unicode escape, as in C# (though we only permit them in strings
- and characters, not arbitrarily in the source code.) */
- {
- REGISTER Ichar i = 0;
- REGISTER int count = 0;
- Lisp_Object lisp_char;
- while (++count <= unicode_hex_count)
- {
- c = readchar (readcharfun);
- /* Remember, can't use isdigit(), isalpha() etc. on Ichars */
- if (c >= '0' && c <= '9') i = (i << 4) +
(c - '0');
- else if (c >= 'a' && c <= 'f') i = (i << 4) +
(c - 'a') + 10;
- else if (c >= 'A' && c <= 'F') i = (i <<
4) + (c - 'A') + 10;
- else
- {
- syntax_error ("Non-hex digit used for Unicode escape",
- make_char (c));
- break;
- }
- }
-
- lisp_char = Funicode_to_char(make_int(i), Qnil);
-
- if (EQ(Qnil, lisp_char))
- {
- /* This is ugly and horrible and trashes the user's data, but
- it's what unicode.c does. In the future, unicode-to-char
- should not return nil. */
-#ifdef MULE
- i = make_ichar (Vcharset_japanese_jisx0208, 34 + 128, 46 + 128);
-#else
- i = '~';
-#endif
- return i;
- }
- else
- {
- return XCHAR(lisp_char);
- }
- }
-
default:
return c;
}
@@ -2270,6 +2272,113 @@
}
#endif
+static Lisp_Object
+read_string (Lisp_Object readcharfun, Ichar delim, int raw,
+ int honor_unicode)
+{
+#ifdef I18N3
+ /* #### If the input stream is translating, then the string
+ should be marked as translatable by setting its
+ `string-translatable' property to t. .el and .elc files
+ normally are translating input streams. See Fgettext()
+ and print_internal(). */
+#endif
+ Ichar c;
+ int cancel = 0;
+
+ Lstream_rewind(XLSTREAM(Vread_buffer_stream));
+ while ((c = readchar(readcharfun)) >= 0 && c != delim)
+ {
+ if (c == '\\')
+ {
+ if (raw)
+ {
+ c = readchar(readcharfun);
+ if (honor_unicode && ('u' == c || 'U' == c))
+ {
+ c = read_unicode_escape(readcharfun,
+ 'U' == c ? 8 : 4);
+ }
+ else
+ {
+ /* For raw strings, insert the
+ backslash and the next char, */
+ Lstream_put_ichar(XLSTREAM
+ (Vread_buffer_stream),
+ '\\');
+ }
+ }
+ else
+ /* otherwise, backslash escapes the next char. */
+ c = read_escape(readcharfun);
+ }
+ /* c is -1 if \ newline has just been seen */
+ if (c == -1)
+ {
+ if (Lstream_byte_count
+ (XLSTREAM(Vread_buffer_stream)) ==
+ 0)
+ cancel = 1;
+ }
+ else
+ Lstream_put_ichar(XLSTREAM
+ (Vread_buffer_stream),
+ c);
+ QUIT;
+ }
+ if (c < 0)
+ return Fsignal(Qend_of_file,
+ list1(READCHARFUN_MAYBE(readcharfun)));
+
+ /* If purifying, and string starts with \ newline,
+ return zero instead. This is for doc strings
+ that we are really going to find in lib-src/DOC.nn.nn */
+ if (purify_flag && NILP(Vinternal_doc_file_name)
+ && cancel)
+ return Qzero;
+
+ Lstream_flush(XLSTREAM(Vread_buffer_stream));
+ return make_string(resizing_buffer_stream_ptr
+ (XLSTREAM(Vread_buffer_stream)),
+ Lstream_byte_count(XLSTREAM(Vread_buffer_stream)));
+}
+
+static Lisp_Object
+read_raw_string (Lisp_Object readcharfun)
+{
+ Ichar c;
+ Ichar permit_unicode = 0;
+
+ do {
+ c = reader_nextchar(readcharfun);
+ switch (c) {
+ /* #r:engine"my sexy raw string" -- raw string w/ flags*/
+ /* case ':': */
+ /* #ru"Hi there\u20AC \U000020AC" -- raw string, honouring Unicode. */
+ case 'u':
+ case 'U':
+ permit_unicode = c;
+ continue;
+
+ /* #r"my raw string" -- raw string */
+ case '\"':
+ return read_string(readcharfun, '\"', 1, permit_unicode);
+ /* invalid syntax */
+ default:
+ {
+ if (permit_unicode)
+ {
+ unreadchar(readcharfun, permit_unicode);
+ }
+ unreadchar(readcharfun, c);
+ return Fsignal(Qinvalid_read_syntax,
+ list1(build_string
+ ("unrecognized raw string syntax")));
+ }
+ }
+ } while (1);
+}
+
/* Read the next Lisp object from the stream READCHARFUN and return it.
If the return value is a cons whose car is Qunbound, then read1()
encountered a misplaced token (e.g. a right bracket, right paren,
@@ -2509,6 +2618,8 @@
case 'x': return read_integer (readcharfun, 16);
/* #b010 => 2 -- binary constant syntax */
case 'b': return read_integer (readcharfun, 2);
+ /* #r"raw\stringt" -- raw string syntax */
+ case 'r': return read_raw_string(readcharfun);
/* #s(foobar key1 val1 key2 val2) -- structure syntax */
case 's': return read_structure (readcharfun);
case '<':
@@ -2654,48 +2765,8 @@
}
case '\"':
- {
- /* String */
-#ifdef I18N3
- /* #### If the input stream is translating, then the string
- should be marked as translatable by setting its
- `string-translatable' property to t. .el and .elc files
- normally are translating input streams. See Fgettext()
- and print_internal(). */
-#endif
- int cancel = 0;
-
- Lstream_rewind (XLSTREAM (Vread_buffer_stream));
- while ((c = readchar (readcharfun)) >= 0
- && c != '\"')
- {
- if (c == '\\')
- c = read_escape (readcharfun);
- /* c is -1 if \ newline has just been seen */
- if (c == -1)
- {
- if (Lstream_byte_count (XLSTREAM (Vread_buffer_stream)) == 0)
- cancel = 1;
- }
- else
- Lstream_put_ichar (XLSTREAM (Vread_buffer_stream), c);
- QUIT;
- }
- if (c < 0)
- return Fsignal (Qend_of_file, list1 (READCHARFUN_MAYBE (readcharfun)));
-
- /* If purifying, and string starts with \ newline,
- return zero instead. This is for doc strings
- that we are really going to find in lib-src/DOC.nn.nn */
- if (purify_flag && NILP (Vinternal_doc_file_name) && cancel)
- return Qzero;
-
- Lstream_flush (XLSTREAM (Vread_buffer_stream));
- return
- make_string
- (resizing_buffer_stream_ptr (XLSTREAM (Vread_buffer_stream)),
- Lstream_byte_count (XLSTREAM (Vread_buffer_stream)));
- }
+ /* String */
+ return read_string(readcharfun, '\"', 0, 1);
default:
{
Index: tests/automated/lisp-reader-tests.el
===================================================================
RCS file: lisp-reader-tests.el
diff -N lisp-reader-tests.el
--- /dev/null Sat Aug 5 00:46:54 2006
+++ lisp-reader-tests.el Sat Aug 5 00:47:03 2006
@@ -0,0 +1,80 @@
+;; Copyright (C) 2005 Martin Kuehl.
+
+;; Author: Martin Kuehl <martin.kuehl(a)gmail.com>
+;; Maintainer: Martin Kuehl <martin.kuehl(a)gmail.com>
+;; Created: 2005
+;; Keywords: tests
+
+;; This file is NOT part of SXEmacs.
+
+;; SXEmacs is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+
+;; SXEmacs is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;; General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with SXEmacs; see the file COPYING. If not, write to the Free
+;; Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+;; 02111-1307, USA.
+
+;;; Synched up with: Not in FSF.
+
+;;; Commentary:
+
+;; Test the lisp reader.
+;; See test-harness.el for instructions on how to run these tests.
+
+;;; Raw Strings
+;;; ===========
+
+;; Equality to "traditional" strings
+;; ---------------------------------
+(dolist (strings '((#r"xyz" "xyz") ; no backslashes
+ (#r"\xyz" "\\xyz") ; backslash at start
+ (#r"\\xyz" "\\\\xyz") ; backslashes at start
+ (#r"\nxyz" "\\nxyz") ; escape seq. at start
+ (#r"\"xyz" "\\\"xyz") ; quote at start
+ (#r"xy\z" "xy\\z") ; backslash in middle
+ (#r"xy\\z" "xy\\\\z") ; backslashes in middle
+ (#r"xy\nz" "xy\\nz") ; escape seq. in middle
+ (#r"xy\"z" "xy\\\"z") ; quote in middle
+ ;;(#r"xyz\" "xyz\\") ; backslash at end: error
+ (#r"xyz\\" "xyz\\\\") ; backslashes at end
+ (#r"xyz\n" "xyz\\n") ; escape seq. at end
+ (#r"xyz\"" "xyz\\\"") ; quote at end
+ (#ru"\u00ABxyz" "\u00ABxyz") ; one Unicode escape
+ (#rU"\U000000ABxyz" "\U000000ABxyz") ; another Unicode escape
+ (#rU"xyz\u00AB" "xyz\u00AB") ; one Unicode escape
+ ))
+ (Assert (apply #'string= strings)))
+
+;; Odd number of backslashes at the end
+;; ------------------------------------
+(dolist (string '("#r\"xyz\\\"" ;
`#r"abc\"': escaped delimiter
+ "#r\"xyz\\\\\\\"" ; `#r"abc\\\"':
escaped delimiter
+ ))
+ (with-temp-buffer
+ (insert string)
+ (Check-Error end-of-file (eval-buffer))))
+
+;; Alternate string/regex delimiters
+;; ---------------------------------
+(dolist (string '("#r/xyz/" ; Perl syntax
+ "#r:ix/xyz/" ; Extended Perl syntax
+ "#r|xyz|" ; TeX syntax
+ "#r[xyz]" ; (uncommon) Perl syntax
+ "#r<xyz>" ; Perl6 syntax?
+ "#r(xyz)" ; arbitrary santax
+ "#r{xyz}" ; arbitrary santax
+ "#r,xyz," ; arbitrary santax
+ "#r!xyz!" ; arbitrary santax
+ ))
+ (with-temp-buffer
+ (insert string)
+ (Check-Error-Message invalid-read-syntax "unrecognized raw string"
+ (eval-buffer))))
--
Santa Maradona, priez pour moi!