[COMMIT] Correct the dumped information for the Unicode JIT infrastructure

Wednesday, 14 November 2007

APPROVE COMMIT

The chief issue that this fixes is that the first time you go beyond the first
Mule JIT charset in allocating Unicode code points, you get an error. The next
time it succeeds. This change makes it work the first time.

NOTE: This patch has been committed.

src/ChangeLog addition:

2007-11-14  Aidan Kehoe  <kehoea(a)parhasard.net>

	* lread.c (read_unicode_escape):
	Correct the range check for Unicode characters specified with
	source-level escapes. 
	* unicode.c:
	* unicode.c (unicode_to_ichar):
	* unicode.c (coding_system_type_create_unicode):
	Correct the dump behaviour for just-in-time Unicode code
	points. Update the docstring for #'unicode-to-char to indicate
	that code points will run out above around 400,000 in a session. 

lisp/ChangeLog addition:

2007-11-14  Aidan Kehoe  <kehoea(a)parhasard.net>

	* unicode.el (unicode-error-default-translation-table): 
	* unicode.el (unicode-error-sequence-regexp-range):
	* unicode.el (frob-unicode-errors-region):
	Make these variables and the single function available to
	make-docfile, by moving them to the start of the line. This
	conflicts with normal indentation of Lisp, unfortunately. 

XEmacs Trunk source patch:
Diff command:   cvs -q diff -Nu
Files affected: lisp/unicode.el
===================================================================
RCS src/unicode.c
===================================================================
RCS src/lread.c
===================================================================
RCS

Index: src/lread.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/lread.c,v
retrieving revision 1.82
diff -u -u -r1.82 lread.c
--- src/lread.c	2007/08/04 20:00:24	1.82
+++ src/lread.c	2007/11/14 19:33:48
＠＠ -1694,7 +1694,7 ＠＠
 	}
     }

-  if (i > 0x110000 || i < 0)
+  if (i >= 0x110000 || i < 0)
     {
       syntax_error ("Not a Unicode code point", make_int(i));
     }
Index: src/unicode.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/unicode.c,v
retrieving revision 1.38
diff -u -u -r1.38 unicode.c
--- src/unicode.c	2007/08/04 20:00:24	1.38
+++ src/unicode.c	2007/11/14 19:33:48
＠＠ -336,6 +336,11 ＠＠
 Lisp_Object Qlast_allocated_character;
 Lisp_Object Qccl_encode_to_ucs_2;

+Lisp_Object Vnumber_of_jit_charsets;
+Lisp_Object Vlast_jit_charset_final;
+Lisp_Object Vcharset_descr;
+
+

 /************************************************************************/
 /*                        Unicode implementation                        */
＠＠ -1080,8 +1085,6 ＠＠
   int code_levels;
   int i;
   int n = Dynarr_length (charsets);
-  static int number_of_jit_charsets;
-  static Ascbyte last_jit_charset_final;

   type_checking_assert (code >= 0);
   /* This shortcut depends on the representation of an Ichar, see text.c.
＠＠ -1124,33 +1127,21 ＠＠
 	  (-1 == (i = get_free_codepoint(Vcurrent_jit_charset))))
 	{
 	  Ibyte setname[32]; 
-	  Lisp_Object charset_descr = build_string
-	    ("Mule charset for otherwise unknown Unicode code points.");
-
-	  struct gcpro gcpro1;
+	  int number_of_jit_charsets = XINT (Vnumber_of_jit_charsets);
+	  Ascbyte last_jit_charset_final = XCHAR (Vlast_jit_charset_final);

-	  if ('\0' == last_jit_charset_final)
-	    {
-	      /* This final byte shit is, umm, not that cool. */
-	      last_jit_charset_final = 0x30;
-	    }
+	  /* This final byte shit is, umm, not that cool. */
+	  assert (last_jit_charset_final >= 0x30);

 	  /* Assertion added partly because our Win32 layer doesn't
 	     support snprintf; with this, we're sure it won't overflow
 	     the buffer.  */
 	  assert(100 > number_of_jit_charsets);
-
-	  qxesprintf(setname, "jit-ucs-charset-%d", number_of_jit_charsets++);

-	  /* Aside: GCPROing here would be overkill according to the FSF's
-	     philosophy. make-charset cannot currently GC, but is intended
-	     to be called from Lisp, with its arguments protected by the
-	     Lisp reader. We GCPRO in case it GCs in the future and no-one
-	     checks all the C callers.  */
+	  qxesprintf(setname, "jit-ucs-charset-%d", number_of_jit_charsets);

-	  GCPRO1 (charset_descr);
 	  Vcurrent_jit_charset = Fmake_charset 
-	    (intern((const CIbyte *)setname), charset_descr, 
+	    (intern((const CIbyte *)setname), Vcharset_descr, 
 	     /* Set encode-as-utf-8 to t, to have this character set written
 		using UTF-8 escapes in escape-quoted and ctext. This
 		sidesteps the fact that our internal character -> Unicode
＠＠ -1159,12 +1150,16 ＠＠
 		     nconc2 (list6(Qcolumns, make_int(1), Qchars, make_int(96),
 				   Qdimension, make_int(2)),
 			     list6(Qregistries, Qunicode_registries,
-				   Qfinal, make_char(last_jit_charset_final++),
+				   Qfinal, make_char(last_jit_charset_final),
 				   /* This CCL program is initialised in
 				      unicode.el. */
 				   Qccl_program, Qccl_encode_to_ucs_2))));
-	  UNGCPRO;

+	  /* Record for the Unicode infrastructure that we've created
+	     this character set.  */
+	  Vnumber_of_jit_charsets = make_int (number_of_jit_charsets + 1);
+	  Vlast_jit_charset_final = make_char (last_jit_charset_final + 1);
+
 	  i = get_free_codepoint(Vcurrent_jit_charset);
 	} 

＠＠ -1421,10 +1416,15 ＠＠
 If the CODE would not otherwise be converted to an XEmacs character, and the
 list of character sets to be consulted is nil or the default, a new XEmacs
 character will be created for it in one of the `jit-ucs-charset' Mule
-character sets, and that character will be returned.  There is scope for
-tens of thousands of separate Unicode code points in every session using
-this technique, so despite XEmacs' internal encoding not being based on
-Unicode, your data won't be trashed.
+character sets, and that character will be returned.  
+
+This is limited to around 400,000 characters per XEmacs session, though, so
+while normal usage will not be problematic, things like:
+
+\(dotimes (i #x110000) (decode-char 'ucs i))
+
+will eventually error.  The long-term solution to this is Unicode as an
+internal encoding. 
 */
        (code, USED_IF_MULE (charsets)))
 {
＠＠ -2862,6 +2862,14 ＠＠
 void
 coding_system_type_create_unicode (void)
 {
+  staticpro (&Vnumber_of_jit_charsets);
+  Vnumber_of_jit_charsets = make_int (0);
+  staticpro (&Vlast_jit_charset_final);
+  Vlast_jit_charset_final = make_char (0x30);
+  staticpro (&Vcharset_descr);
+  Vcharset_descr
+    = build_string ("Mule charset for otherwise unknown Unicode code points.");
+
   INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (unicode, "unicode-coding-system-p");
   CODING_SYSTEM_HAS_METHOD (unicode, print);
   CODING_SYSTEM_HAS_METHOD (unicode, convert);
Index: lisp/unicode.el
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/lisp/unicode.el,v
retrieving revision 1.26
diff -u -u -r1.26 unicode.el
--- lisp/unicode.el	2007/10/13 14:08:30	1.26
+++ lisp/unicode.el	2007/11/14 19:33:48
＠＠ -494,36 +494,40 ＠＠
                           (char-syntax ascii-or-latin-1))
                          syntax-table))

-  ;; Create all the Unicode error sequences, normally as jit-ucs-charset-0
-  ;; characters starting at U+200000 (which isn't a valid Unicode code
-  ;; point). Make them available to user code. 
-  (defvar unicode-error-default-translation-table
-    (loop 
-      with char-table = (make-char-table 'char)
-      for i from ?\x00 to ?\xFF
-      do
-      (put-char-table (aref
-		       ;; #xd800 is the first leading surrogate;
-		       ;; trailing surrogates must be in the range
-		       ;; #xdc00-#xdfff. These examples are not, so we
-		       ;; intentionally provoke an error sequence.
-		       (decode-coding-string (format "\xd8\x00\x00%c" i)
-					     'utf-16-be)
-		       3)
-		      i
-                      char-table)
-      finally return char-table)
-    "Translation table mapping Unicode error sequences to Latin-1 chars.
+;; *Sigh*, declarations needs to be at the start of the line to be picked up
+;; by make-docfile. Not so much an issue with ccl-encode-to-ucs-2, which we
+;; don't necessarily want to advertise, but the following are important.
+
+;; Create all the Unicode error sequences, normally as jit-ucs-charset-0
+;; characters starting at U+200000 (which isn't a valid Unicode code
+;; point). Make them available to user code. 
+(defvar unicode-error-default-translation-table
+  (loop 
+    with char-table = (make-char-table 'char)
+    for i from ?\x00 to ?\xFF
+    do
+    (put-char-table (aref
+                     ;; #xd800 is the first leading surrogate;
+                     ;; trailing surrogates must be in the range
+                     ;; #xdc00-#xdfff. These examples are not, so we
+                     ;; intentionally provoke an error sequence.
+                     (decode-coding-string (format "\xd8\x00\x00%c" i)
+                                           'utf-16-be)
+                     3)
+                    i
+                    char-table)
+    finally return char-table)
+  "Translation table mapping Unicode error sequences to Latin-1 chars.

 To transform XEmacs Unicode error sequences to the Latin-1 characters that
 correspond to the octets on disk, you can use this variable.  ")

-  (defvar unicode-error-sequence-regexp-range
-    (format "%c%c-%c"
-            (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 0)
-            (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 3)
-            (aref (decode-coding-string "\xd8\x00\x00\xFF" 'utf-16-be) 3))
-    "Regular expression range to match Unicode error sequences in XEmacs.
+(defvar unicode-error-sequence-regexp-range
+  (format "%c%c-%c"
+          (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 0)
+          (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 3)
+          (aref (decode-coding-string "\xd8\x00\x00\xFF" 'utf-16-be) 3))
+  "Regular expression range to match Unicode error sequences in XEmacs.

 Invalid Unicode sequences on input are represented as XEmacs
 characters with values stored as the keys in
＠＠ -559,14 +563,14 ＠＠
 	      nil
 	      (format "Could not find char ?\\x%x in buffer" i))))

-  (defun frob-unicode-errors-region (frob-function begin end &optional buffer)
-    "Call FROB-FUNCTION on the Unicode error sequences between BEGIN and END.
+(defun frob-unicode-errors-region (frob-function begin end &optional buffer)
+  "Call FROB-FUNCTION on the Unicode error sequences between BEGIN and END.

 Optional argument BUFFER specifies the buffer that should be examined for
 such sequences.  "
-    (check-argument-type #'functionp frob-function)
-    (check-argument-range begin (point-min buffer) (point-max buffer))
-    (check-argument-range end (point-min buffer) (point-max buffer))
+  (check-argument-type #'functionp frob-function)
+  (check-argument-range begin (point-min buffer) (point-max buffer))
+  (check-argument-range end (point-min buffer) (point-max buffer))
     (save-excursion
       (save-restriction
 	(if buffer (set-buffer buffer))

-- 
On the quay of the little Black Sea port, where the rescued pair came once
more into contact with civilization, Dobrinton was bitten by a dog which was
assumed to be mad, though it may only have been indiscriminating. (Saki)

_______________________________________________
XEmacs-Patches mailing list
XEmacs-Patches(a)xemacs.org
http://calypso.tux.org/cgi-bin/mailman/listinfo/xemacs-patches

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

2008

2007

2006

2005

2004

2003