This patch integrates Martin Kuehl’s raw strings into XEmacs 21.5, extending
the syntax to optionally handle Unicode escapes as Python does, and fixing a
problem that arose in lisp-interaction-mode, where raw strings were parsed as
normal strings if you did a C-j after them, because forward-sexp couldn’t
handle them.
lisp/ChangeLog addition:
2006-07-16 Aidan Kehoe <kehoea(a)parhasard.net>
* lisp.el (forward-sexp):
Handle raw strings specially just as we do structures. Fixes
problems evaluating them in *scratch*.
man/ChangeLog addition:
2006-07-16 Aidan Kehoe <kehoea(a)parhasard.net>
* lispref/objects.texi (String Type):
Give details of the raw string syntax, taken from SXEmacs and
Python.
src/ChangeLog addition:
2006-07-16 Aidan Kehoe <kehoea(a)parhasard.net>
Martin Kuehl's raw string syntax, from Python via SXEmacs.
* lread.c (read_unicode_escape):
Refactor this code out from read_escape, since it's now called
from read_string as well.
* lread.c (read_escape):
Call read_unicode_escape instead of using inline code.
* lread.c (read_string):
Refactor out from read1, provide raw and honor_unicode options.
* lread.c (read_raw_string):
Added, a function that calls read_string with the correct
arguments for a raw string.
* lread.c (read1):
Pass raw strings to read_raw_string; pass strings to read_string.
XEmacs Trunk source patch:
Diff command: cvs -q diff -u
Files affected: src/lread.c man/lispref/objects.texi lisp/lisp.el
Index: lisp/lisp.el
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/lisp/lisp.el,v
retrieving revision 1.5
diff -u -u -r1.5 lisp.el
--- lisp/lisp.el 2001/04/12 18:21:29 1.5
+++ lisp/lisp.el 2006/07/16 17:56:06
@@ -60,19 +60,20 @@
(interactive "_p")
(or arg (setq arg 1))
;; XEmacs: evil hack! The other half of the evil hack below.
- (if (and (> arg 0) (looking-at "#s("))
- (goto-char (+ (point) 2)))
+ (if (and (> arg 0) (looking-at "#s(\\|#r[uU]\\{0,1\\}\""))
+ (goto-char (1+ (- (point) (- (match-end 0) (match-beginning 0))))))
(goto-char (or (scan-sexps (point) arg) (buffer-end arg)))
- (if (< arg 0) (backward-prefix-chars))
- ;; XEmacs: evil hack! Skip back over #s so that structures are read
- ;; properly. the current cheesified syntax tables just aren't up to
- ;; this.
- (if (and (< arg 0)
- (eq (char-after (point)) ?\()
- (>= (- (point) (point-min)) 2)
- (eq (char-after (- (point) 1)) ?s)
- (eq (char-after (- (point) 2)) ?#))
- (goto-char (- (point) 2))))
+ (when (< arg 0)
+ (backward-prefix-chars)
+ ;; XEmacs: evil hack! Skip back over #[sr] so that structures and raw
+ ;; strings are read properly. the current cheesified syntax tables just
+ ;; aren't up to this.
+ (let* ((diff (- (point) (point-min)))
+ (subject (buffer-substring (- (point) (min diff 3))
+ (1+ (point))))
+ (matched (string-match "#s(\\|#r[uU]\\{0,1\\}\"" subject)))
+ (if matched
+ (goto-char (1+ (- (point) (- (length subject) matched))))))))
(defun backward-sexp (&optional arg)
"Move backward across one balanced expression (sexp).
Index: man/lispref/objects.texi
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/man/lispref/objects.texi,v
retrieving revision 1.8
diff -u -u -r1.8 objects.texi
--- man/lispref/objects.texi 2006/04/29 14:36:54 1.8
+++ man/lispref/objects.texi 2006/07/16 17:56:11
@@ -1079,6 +1079,16 @@
escape any backslash or double-quote characters in the string with a
backslash, like this: @code{"this \" is an embedded quote"}.
+ An alternative syntax allows insertion of raw backslashes into a
+string, like this: @code{#r"this \ is an embedded backslash"}. In such
+a string, each character following a backslash is included literally in
+the string, and all backslashes are left in the string. This means that
+@code{#r"\""} is a valid string literal with two characters, a backslash and a
+double-quote. It also means that a string with this syntax @emph{cannot end
+in a single backslash}. As in Python, from which this syntax was
+taken, you can write @code{u} or @code{U} after the @code{#r} to
+request that Unicode escapes in the string be interpreted.
+
The newline character is not special in the read syntax for strings;
if you write a new line between the double-quotes, it becomes a
character in the string. But an escaped newline---one that is preceded
Index: src/lread.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/lread.c,v
retrieving revision 1.78
diff -u -u -r1.78 lread.c
--- src/lread.c 2006/06/03 17:50:54 1.78
+++ src/lread.c 2006/07/16 17:56:12
@@ -1670,15 +1670,56 @@
return val;
}
+
+/* Read UNICODE_HEX_COUNT hexadecimal digits from READCHARFUN and return
+   the character they denote.  This implements Unicode escapes as in C#
+   (though we only permit them in strings and characters, not arbitrarily
+   in the source code.)
+
+   Signals end-of-file if the input runs out before all the digits have
+   been read, and a syntax error if a character that is not a hex digit
+   is seen.  If unicode-to-char returns nil for the code point, a
+   substitute character is returned instead. */
+static Ichar
+read_unicode_escape (Lisp_Object readcharfun, int unicode_hex_count)
+{
+  REGISTER Ichar i = 0, c;
+  REGISTER int count = 0;
+  Lisp_Object lisp_char;
+
+  while (++count <= unicode_hex_count)
+    {
+      c = readchar (readcharfun);
+      /* readchar yields a negative value when the input is exhausted.
+         Report that as end-of-file, as read_escape does, instead of
+         passing a negative value to make_char in the error path below. */
+      if (c < 0)
+        signal_error (Qend_of_file, 0, READCHARFUN_MAYBE (readcharfun));
+      /* Remember, can't use isdigit(), isalpha() etc. on Ichars */
+      if (c >= '0' && c <= '9') i = (i << 4) + (c - '0');
+      else if (c >= 'a' && c <= 'f') i = (i << 4) + (c - 'a') + 10;
+      else if (c >= 'A' && c <= 'F') i = (i << 4) + (c - 'A') + 10;
+      else
+        {
+          syntax_error ("Non-hex digit used for Unicode escape",
+                        make_char (c));
+          break;
+        }
+    }
+
+  lisp_char = Funicode_to_char (make_int (i), Qnil);
+
+  if (EQ (Qnil, lisp_char))
+    {
+      /* This is ugly and horrible and trashes the user's data, but
+         it's what unicode.c does. In the future, unicode-to-char
+         should not return nil. */
+#ifdef MULE
+      i = make_ichar (Vcharset_japanese_jisx0208, 34 + 128, 46 + 128);
+#else
+      i = '~';
+#endif
+      return i;
+    }
+
+  return XCHAR (lisp_char);
+}
+
static Ichar
read_escape (Lisp_Object readcharfun)
{
/* This function can GC */
Ichar c = readchar (readcharfun);
- /* \u allows up to four hex digits, \U up to eight. Default to the
- behaviour for \u, and change this value in the case that \U is seen. */
- int unicode_hex_count = 4;
if (c < 0)
signal_error (Qend_of_file, 0, READCHARFUN_MAYBE (readcharfun));
@@ -1797,50 +1838,11 @@
}
case 'U':
/* Post-Unicode-2.0: Up to eight hex chars */
- unicode_hex_count = 8;
+ return read_unicode_escape(readcharfun, 8);
case 'u':
+ /* Unicode-2.0 and before; four hex chars. */
+ return read_unicode_escape(readcharfun, 4);
- /* A Unicode escape, as in C# (though we only permit them in strings
- and characters, not arbitrarily in the source code.) */
- {
- REGISTER Ichar i = 0;
- REGISTER int count = 0;
- Lisp_Object lisp_char;
- while (++count <= unicode_hex_count)
- {
- c = readchar (readcharfun);
- /* Remember, can't use isdigit(), isalpha() etc. on Ichars */
- if (c >= '0' && c <= '9') i = (i << 4) + (c - '0');
- else if (c >= 'a' && c <= 'f') i = (i << 4) + (c - 'a') + 10;
- else if (c >= 'A' && c <= 'F') i = (i << 4) + (c - 'A') + 10;
- else
- {
- syntax_error ("Non-hex digit used for Unicode escape",
- make_char (c));
- break;
- }
- }
-
- lisp_char = Funicode_to_char(make_int(i), Qnil);
-
- if (EQ(Qnil, lisp_char))
- {
- /* This is ugly and horrible and trashes the user's data, but
- it's what unicode.c does. In the future, unicode-to-char
- should not return nil. */
-#ifdef MULE
- i = make_ichar (Vcharset_japanese_jisx0208, 34 + 128, 46 + 128);
-#else
- i = '~';
-#endif
- return i;
- }
- else
- {
- return XCHAR(lisp_char);
- }
- }
-
default:
return c;
}
@@ -2270,6 +2272,113 @@
}
#endif
+/* Read the body of a string from READCHARFUN, up to but not including
+   the closing DELIM, and return it as a Lisp string.
+
+   RAW non-zero means backslashes are kept in the result together with
+   the character that follows them; otherwise a backslash introduces an
+   escape handled by read_escape.  In a raw string, HONOR_UNICODE
+   non-zero means \u and \U are still decoded by read_unicode_escape.
+
+   Signals end-of-file if the input ends before DELIM is seen. */
+static Lisp_Object
+read_string (Lisp_Object readcharfun, Ichar delim, int raw,
+             int honor_unicode)
+{
+#ifdef I18N3
+  /* #### If the input stream is translating, then the string
+     should be marked as translatable by setting its
+     `string-translatable' property to t. .el and .elc files
+     normally are translating input streams. See Fgettext()
+     and print_internal(). */
+#endif
+  Ichar ch;
+  int leading_escaped_newline = 0;
+
+  Lstream_rewind (XLSTREAM (Vread_buffer_stream));
+
+  for (;;)
+    {
+      ch = readchar (readcharfun);
+      if (ch < 0 || ch == delim)
+        break;
+
+      if (ch == '\\' && !raw)
+        {
+          /* Ordinary string: the backslash escapes the next char. */
+          ch = read_escape (readcharfun);
+        }
+      else if (ch == '\\')
+        {
+          /* Raw string: look at what follows the backslash. */
+          ch = readchar (readcharfun);
+          if (honor_unicode && (ch == 'u' || ch == 'U'))
+            {
+              int hex_digits = (ch == 'U') ? 8 : 4;
+              ch = read_unicode_escape (readcharfun, hex_digits);
+            }
+          else
+            {
+              /* Keep the backslash; the character after it is stored
+                 by the common code below. */
+              Lstream_put_ichar (XLSTREAM (Vread_buffer_stream), '\\');
+            }
+        }
+
+      /* ch is -1 if \ newline has just been seen; such a pair adds
+         nothing to the string, but we note when nothing has been
+         accumulated before it. */
+      if (ch != -1)
+        Lstream_put_ichar (XLSTREAM (Vread_buffer_stream), ch);
+      else if (Lstream_byte_count (XLSTREAM (Vread_buffer_stream)) == 0)
+        leading_escaped_newline = 1;
+
+      QUIT;
+    }
+
+  /* The loop stops either on DELIM or on a negative (end of input)
+     character; only the latter is an error. */
+  if (ch < 0)
+    return Fsignal (Qend_of_file, list1 (READCHARFUN_MAYBE (readcharfun)));
+
+  /* If purifying, and string starts with \ newline,
+     return zero instead. This is for doc strings
+     that we are really going to find in lib-src/DOC.nn.nn */
+  if (leading_escaped_newline
+      && purify_flag
+      && NILP (Vinternal_doc_file_name))
+    return Qzero;
+
+  Lstream_flush (XLSTREAM (Vread_buffer_stream));
+  return make_string
+    (resizing_buffer_stream_ptr (XLSTREAM (Vread_buffer_stream)),
+     Lstream_byte_count (XLSTREAM (Vread_buffer_stream)));
+}
+
+/* Read a raw string.  read1 dispatches here on the `r' of `#r', so the
+   next input is either the opening double quote or a single `u' / `U'
+   flag followed by it.  The flag asks read_string to interpret \u and
+   \U Unicode escapes inside the otherwise raw string.
+
+   Anything else, including a second flag, signals invalid-read-syntax
+   after the characters read here have been pushed back. */
+static Lisp_Object
+read_raw_string (Lisp_Object readcharfun)
+{
+  Ichar c;
+  Ichar permit_unicode = 0;  /* the flag character seen, or 0 if none */
+
+  do {
+    c = reader_nextchar (readcharfun);
+    switch (c) {
+      /* #r:engine"my sexy raw string" -- raw string w/ flags*/
+      /* case ':': */
+
+      /* #r"my raw string" -- raw string */
+    case '\"':
+      return read_string (readcharfun, '\"', 1, permit_unicode != 0);
+
+      /* #ru"Hi there\u20AC \U000020AC" -- raw string, honouring Unicode. */
+    case 'u':
+    case 'U':
+      if (!permit_unicode)
+        {
+          permit_unicode = c;
+          continue;
+        }
+      /* The flag may be given only once; #ruU"..." is invalid. */
+      /* fall through */
+
+      /* invalid syntax */
+    default:
+      {
+        /* Push back the most recently read character first, so that a
+           last-in-first-out pushback returns the characters in their
+           original order.  NOTE(review): assumes unreadchar is either
+           LIFO or position-based -- confirm against its definition. */
+        unreadchar (readcharfun, c);
+        if (permit_unicode)
+          {
+            unreadchar (readcharfun, permit_unicode);
+          }
+        return Fsignal (Qinvalid_read_syntax,
+                        list1 (build_string
+                               ("unrecognized raw string syntax")));
+      }
+    }
+  } while (1);
+}
+
/* Read the next Lisp object from the stream READCHARFUN and return it.
If the return value is a cons whose car is Qunbound, then read1()
encountered a misplaced token (e.g. a right bracket, right paren,
@@ -2509,6 +2618,8 @@
case 'x': return read_integer (readcharfun, 16);
/* #b010 => 2 -- binary constant syntax */
case 'b': return read_integer (readcharfun, 2);
+ /* #r"raw\stringt" -- raw string syntax */
+ case 'r': return read_raw_string(readcharfun);
/* #s(foobar key1 val1 key2 val2) -- structure syntax */
case 's': return read_structure (readcharfun);
case '<':
@@ -2654,48 +2765,8 @@
}
case '\"':
- {
- /* String */
-#ifdef I18N3
- /* #### If the input stream is translating, then the string
- should be marked as translatable by setting its
- `string-translatable' property to t. .el and .elc files
- normally are translating input streams. See Fgettext()
- and print_internal(). */
-#endif
- int cancel = 0;
-
- Lstream_rewind (XLSTREAM (Vread_buffer_stream));
- while ((c = readchar (readcharfun)) >= 0
- && c != '\"')
- {
- if (c == '\\')
- c = read_escape (readcharfun);
- /* c is -1 if \ newline has just been seen */
- if (c == -1)
- {
- if (Lstream_byte_count (XLSTREAM (Vread_buffer_stream)) == 0)
- cancel = 1;
- }
- else
- Lstream_put_ichar (XLSTREAM (Vread_buffer_stream), c);
- QUIT;
- }
- if (c < 0)
- return Fsignal (Qend_of_file, list1 (READCHARFUN_MAYBE (readcharfun)));
-
- /* If purifying, and string starts with \ newline,
- return zero instead. This is for doc strings
- that we are really going to find in lib-src/DOC.nn.nn */
- if (purify_flag && NILP (Vinternal_doc_file_name) && cancel)
- return Qzero;
-
- Lstream_flush (XLSTREAM (Vread_buffer_stream));
- return
- make_string
- (resizing_buffer_stream_ptr (XLSTREAM (Vread_buffer_stream)),
- Lstream_byte_count (XLSTREAM (Vread_buffer_stream)));
- }
+ /* String */
+ return read_string(readcharfun, '\"', 0, 1);
default:
{
--
Santa Maradona, priez pour moi!