[PATCH] Raw strings, from Python via SXEmacs

Sunday, 16 July 2006

This patch integrates Martin Kuehl’s raw strings into XEmacs 21.5, extending
it to optionally handle Unicode escapes as does Python, and fixing a problem
that arose in lisp-interaction-mode, where raw strings were parsed as normal
strings if you did a C-j after them, because forward-sexp couldn’t handle
them. 

lisp/ChangeLog addition:

2006-07-16  Aidan Kehoe  <kehoea(a)parhasard.net>

	* lisp.el (forward-sexp):
	Handle raw strings specially just as we do structures. Fixes
	problems evaluating them in *scratch*. 

man/ChangeLog addition:

2006-07-16  Aidan Kehoe  <kehoea(a)parhasard.net>

	* lispref/objects.texi (String Type):
	Give details of the raw string syntax, taken from SXEmacs and
	Python. 

src/ChangeLog addition:

2006-07-16  Aidan Kehoe  <kehoea(a)parhasard.net>

	Martin Kuehl's raw string syntax, from Python via SXEmacs. 

	* lread.c (read_unicode_escape):
	Refactor this code out from read_escape, since it's now called
	from read_string as well. 
	* lread.c (read_escape):
	Call read_unicode_escape instead of using inline code.
	* lread.c (read_string):
	Refactor out from read1, provide raw and honor_unicode options. 
	* lread.c (read_raw_string):
	Added, a function that calls read_string with the correct
	arguments for a raw string. 
	* lread.c (read1):
	Pass raw strings to read_raw_string; pass strings to read_string. 

XEmacs Trunk source patch:
Diff command:   cvs -q diff -u
Files affected: src/lread.c man/lispref/objects.texi lisp/lisp.el

Index: lisp/lisp.el
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/lisp/lisp.el,v
retrieving revision 1.5
diff -u -u -r1.5 lisp.el
--- lisp/lisp.el	2001/04/12 18:21:29	1.5
+++ lisp/lisp.el	2006/07/16 17:56:06
@@ -60,19 +60,20 @@
   (interactive "_p")
   (or arg (setq arg 1))
   ;; XEmacs: evil hack! The other half of the evil hack below.
-  (if (and (> arg 0) (looking-at "#s("))
-      (goto-char (+ (point) 2)))
+  (if (and (> arg 0) (looking-at "#s(\\|#r[uU]\\{0,1\\}\""))
+    (goto-char (1+ (- (point) (- (match-end 0) (match-beginning 0))))))
   (goto-char (or (scan-sexps (point) arg) (buffer-end arg)))
-  (if (< arg 0) (backward-prefix-chars))
-  ;; XEmacs: evil hack! Skip back over #s so that structures are read
-  ;; properly.  the current cheesified syntax tables just aren't up to
-  ;; this.
-  (if (and (< arg 0)
-	   (eq (char-after (point)) ?\()
-	   (>= (- (point) (point-min)) 2)
-	   (eq (char-after (- (point) 1)) ?s)
-	   (eq (char-after (- (point) 2)) ?#))
-      (goto-char (- (point) 2))))
+  (when (< arg 0) 
+    (backward-prefix-chars)
+    ;; XEmacs: evil hack! Skip back over #[sr] so that structures and raw
+    ;; strings are read properly.  the current cheesified syntax tables just
+    ;; aren't up to this.
+    (let* ((diff (- (point) (point-min)))
+	   (subject (buffer-substring (- (point) (min diff 3))
+				      (1+ (point))))
+	   (matched (string-match "#s(\\|#r[uU]\\{0,1\\}\"" subject)))
+      (if matched
+	(goto-char (1+ (- (point) (- (length subject) matched))))))))

 (defun backward-sexp (&optional arg)
   "Move backward across one balanced expression (sexp).
Index: man/lispref/objects.texi
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/man/lispref/objects.texi,v
retrieving revision 1.8
diff -u -u -r1.8 objects.texi
--- man/lispref/objects.texi	2006/04/29 14:36:54	1.8
+++ man/lispref/objects.texi	2006/07/16 17:56:11
@@ -1079,6 +1079,16 @@
 escape any backslash or double-quote characters in the string with a
 backslash, like this: @code{"this \" is an embedded quote"}.

+ An alternative syntax allows insertion of raw backslashes into a
+string, like this: @code{#r"this \ is an embedded backslash"}.  In  such
+a string, each character following a backslash is included literally in
+the string, and all backslashes are left in the string.  This means that
+@code{#r"\""} is a valid string literal with two characters, a backslash and a
+double-quote.  It also means that a string  with this syntax @emph{cannot end
+in a single backslash}.  As with Python, from where this syntax was
+taken, you can specify @code{u} or @code{U} after the @code{#r} to
+specify that interpretation of Unicode escapes should be done. 
+
   The newline character is not special in the read syntax for strings;
 if you write a new line between the double-quotes, it becomes a
 character in the string.  But an escaped newline---one that is preceded
Index: src/lread.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/lread.c,v
retrieving revision 1.78
diff -u -u -r1.78 lread.c
--- src/lread.c	2006/06/03 17:50:54	1.78
+++ src/lread.c	2006/07/16 17:56:12
@@ -1670,15 +1670,56 @@

   return val;
 }
+
+/* A Unicode escape, as in C# (though we only permit them in strings
+   and characters, not arbitrarily in the source code.) */
+static Ichar
+read_unicode_escape (Lisp_Object readcharfun, int unicode_hex_count)
+{
+  REGISTER Ichar i = 0, c;
+  REGISTER int count = 0;
+  Lisp_Object lisp_char;
+  while (++count <= unicode_hex_count)
+    {
+      c = readchar (readcharfun);
+      /* Remember, can't use isdigit(), isalpha() etc. on Ichars */
+      if      (c >= '0' && c <= '9')  i = (i << 4) + (c - '0');
+      else if (c >= 'a' && c <= 'f')  i = (i << 4) + (c - 'a') + 10;
+      else if (c >= 'A' && c <= 'F')  i = (i << 4) + (c - 'A') + 10;
+      else
+	{
+	  syntax_error ("Non-hex digit used for Unicode escape",
+			make_char (c));
+	  break;
+	}
+    }
+
+  lisp_char = Funicode_to_char(make_int(i), Qnil);
+
+  if (EQ(Qnil, lisp_char))
+    {
+      /* This is ugly and horrible and trashes the user's data, but
+	 it's what unicode.c does. In the future, unicode-to-char
+	 should not return nil.  */
+#ifdef MULE
+      i = make_ichar (Vcharset_japanese_jisx0208, 34 + 128, 46 + 128);
+#else
+      i = '~';
+#endif
+      return i;
+    }
+  else
+    {
+      return XCHAR(lisp_char);
+    }
+}
+

 static Ichar
 read_escape (Lisp_Object readcharfun)
 {
   /* This function can GC */
   Ichar c = readchar (readcharfun);
-  /* \u allows up to four hex digits, \U up to eight. Default to the
-     behaviour for \u, and change this value in the case that \U is seen. */
-  int unicode_hex_count = 4;

   if (c < 0)
     signal_error (Qend_of_file, 0, READCHARFUN_MAYBE (readcharfun));
@@ -1797,50 +1838,11 @@
       }
     case 'U':
       /* Post-Unicode-2.0: Up to eight hex chars */
-      unicode_hex_count = 8;
+      return read_unicode_escape(readcharfun, 8);
     case 'u':
+      /* Unicode-2.0 and before; four hex chars. */
+      return read_unicode_escape(readcharfun, 4);

-      /* A Unicode escape, as in C# (though we only permit them in strings
-	 and characters, not arbitrarily in the source code.) */
-      {
-	REGISTER Ichar i = 0;
-	REGISTER int count = 0;
-	Lisp_Object lisp_char;
-	while (++count <= unicode_hex_count)
-	  {
-	    c = readchar (readcharfun);
-	    /* Remember, can't use isdigit(), isalpha() etc. on Ichars */
-	    if      (c >= '0' && c <= '9')  i = (i << 4) + (c - '0');
-	    else if (c >= 'a' && c <= 'f')  i = (i << 4) + (c - 'a') + 10;
-            else if (c >= 'A' && c <= 'F')  i = (i << 4) + (c - 'A') + 10;
-	    else
-	      {
-		syntax_error ("Non-hex digit used for Unicode escape",
-			      make_char (c));
-		break;
-	      }
-	  }
-
-	lisp_char = Funicode_to_char(make_int(i), Qnil);
-
-	if (EQ(Qnil, lisp_char))
-	  {
-	    /* This is ugly and horrible and trashes the user's data, but
-	       it's what unicode.c does. In the future, unicode-to-char
-	       should not return nil.  */
-#ifdef MULE
-	    i = make_ichar (Vcharset_japanese_jisx0208, 34 + 128, 46 + 128);
-#else
-	    i = '~';
-#endif
-            return i;
-	  }
-	else
-	  {
-	    return XCHAR(lisp_char);
-	  }
-      }
-
     default:
 	return c;
     }
@@ -2270,6 +2272,113 @@
 }
 #endif

+static Lisp_Object
+read_string (Lisp_Object readcharfun, Ichar delim, int raw, 
+	     int honor_unicode)
+{
+#ifdef I18N3
+  /* #### If the input stream is translating, then the string
+     should be marked as translatable by setting its
+     `string-translatable' property to t.  .el and .elc files
+     normally are translating input streams.  See Fgettext()
+     and print_internal(). */
+#endif
+  Ichar c;
+  int cancel = 0;
+
+  Lstream_rewind(XLSTREAM(Vread_buffer_stream));
+  while ((c = readchar(readcharfun)) >= 0 && c != delim)
+    {
+    if (c == '\\') 
+      {
+	if (raw) 
+	  {
+	    c = readchar(readcharfun);
+	    if (honor_unicode && ('u' == c || 'U' == c))
+	      {
+		c = read_unicode_escape(readcharfun,
+					'U' == c ? 8 : 4);
+	      }
+	    else
+	      {
+		/* For raw strings, insert the
+		   backslash and the next char, */
+		Lstream_put_ichar(XLSTREAM
+				  (Vread_buffer_stream),
+				  '\\');
+	      }
+	  } 
+	else
+	  /* otherwise, backslash escapes the next char. */
+	  c = read_escape(readcharfun);
+      }
+    /* c is -1 if \ newline has just been seen */
+    if (c == -1) 
+      {
+	if (Lstream_byte_count
+	  (XLSTREAM(Vread_buffer_stream)) ==
+	  0)
+	  cancel = 1;
+      } 
+    else
+      Lstream_put_ichar(XLSTREAM
+			 (Vread_buffer_stream),
+			 c);
+    QUIT;
+    }
+  if (c < 0)
+    return Fsignal(Qend_of_file,
+		   list1(READCHARFUN_MAYBE(readcharfun)));
+
+  /* If purifying, and string starts with \ newline,
+     return zero instead.  This is for doc strings
+     that we are really going to find in lib-src/DOC.nn.nn  */
+  if (purify_flag && NILP(Vinternal_doc_file_name)
+      && cancel)
+    return Qzero;
+
+  Lstream_flush(XLSTREAM(Vread_buffer_stream));
+  return make_string(resizing_buffer_stream_ptr
+		     (XLSTREAM(Vread_buffer_stream)),
+		     Lstream_byte_count(XLSTREAM(Vread_buffer_stream)));
+}
+
+static Lisp_Object
+read_raw_string (Lisp_Object readcharfun)
+{
+  Ichar c;
+  Ichar permit_unicode = 0; 
+
+  do {
+    c = reader_nextchar(readcharfun);
+    switch (c) {
+      /* #r:engine"my sexy raw string" -- raw string w/ flags*/
+      /* case ':': */
+      /* #ru"Hi there\u20AC \U000020AC" -- raw string, honouring Unicode. */
+    case 'u':
+    case 'U':
+      permit_unicode = c; 
+      continue;
+
+      /* #r"my raw string" -- raw string */
+    case '\"':
+      return read_string(readcharfun, '\"', 1, permit_unicode);
+      /* invalid syntax */
+    default:
+      {
+	if (permit_unicode)
+	  {
+	    unreadchar(readcharfun, permit_unicode);
+	  }
+	unreadchar(readcharfun, c);
+	return Fsignal(Qinvalid_read_syntax,
+		       list1(build_string
+			     ("unrecognized raw string syntax")));
+      }
+    }
+  } while (1);
+}
+
 /* Read the next Lisp object from the stream READCHARFUN and return it.
    If the return value is a cons whose car is Qunbound, then read1()
    encountered a misplaced token (e.g. a right bracket, right paren,
@@ -2509,6 +2618,8 @@
 	  case 'x': return read_integer (readcharfun, 16);
             /* #b010 => 2 -- binary constant syntax */
 	  case 'b': return read_integer (readcharfun, 2);
+	    /* #r"raw\stringt" -- raw string syntax */
+	  case 'r': return read_raw_string(readcharfun);
             /* #s(foobar key1 val1 key2 val2) -- structure syntax */
 	  case 's': return read_structure (readcharfun);
 	  case '<':
@@ -2654,48 +2765,8 @@
       }

     case '\"':
-      {
-	/* String */
-#ifdef I18N3
-	/* #### If the input stream is translating, then the string
-	   should be marked as translatable by setting its
-	   `string-translatable' property to t.  .el and .elc files
-	   normally are translating input streams.  See Fgettext()
-	   and print_internal(). */
-#endif
-	int cancel = 0;
-
-	Lstream_rewind (XLSTREAM (Vread_buffer_stream));
-	while ((c = readchar (readcharfun)) >= 0
-	       && c != '\"')
-	  {
-	    if (c == '\\')
-	      c = read_escape (readcharfun);
-	    /* c is -1 if \ newline has just been seen */
-	    if (c == -1)
-	      {
-		if (Lstream_byte_count (XLSTREAM (Vread_buffer_stream)) == 0)
-		  cancel = 1;
-	      }
-	    else
-	      Lstream_put_ichar (XLSTREAM (Vread_buffer_stream), c);
-	    QUIT;
-	  }
-	if (c < 0)
-	  return Fsignal (Qend_of_file, list1 (READCHARFUN_MAYBE (readcharfun)));
-
-	/* If purifying, and string starts with \ newline,
-	   return zero instead.  This is for doc strings
-	   that we are really going to find in lib-src/DOC.nn.nn  */
-	if (purify_flag && NILP (Vinternal_doc_file_name) && cancel)
-	  return Qzero;
-
-	Lstream_flush (XLSTREAM (Vread_buffer_stream));
-	return
-	  make_string
-	  (resizing_buffer_stream_ptr (XLSTREAM (Vread_buffer_stream)),
-	   Lstream_byte_count (XLSTREAM (Vread_buffer_stream)));
-      }
+      /* String */
+      return read_string(readcharfun, '\"', 0, 1);

     default:
       {

-- 
Santa Maradona, priez pour moi!

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

2008

2007

2006

2005

2004

2003

[PATCH] Raw strings, from Python via SXEmacs