unicode-internal-commit: some fixes/cleanups of Unicode code

Ben Wing ben at xemacs.org
Tue Mar 23 08:18:51 EDT 2010


changeset:   5184:d04006e313df
branch:      ben-unicode-internal
user:        Ben Wing <ben at xemacs.org>
date:        Sun Feb 28 04:47:31 2010 -0600
files:       src/ChangeLog src/text.h src/unicode.c src/unicode.h
description:
some fixes/cleanups of Unicode code

-------------------- ChangeLog entries follow: --------------------

src/ChangeLog addition:

2010-02-28  Ben Wing  <ben at xemacs.org>

	* text.h:
	* text.h (ibyte_first_byte_p):
	* unicode.c:
	* unicode.c (encode_unicode_char):
	* unicode.c (unicode_convert):
	* unicode.h:
	Add non-Mule definition of ibyte_first_byte_p(), now needed.

	Move some basic Unicode definitions from text.h and unicode.c
	to unicode.h.  Add unicode_error_octet_code_p() to check if
	a particular Unicode codepoint is an error-octet codepoint.

	Use unicode_error_octet_code_p() and UNICODE_OFFICIAL_MAX in
	place of hard-coded values.  Simplify logic that handles
	error-octets.


diff -r 350284c64e7e -r d04006e313df src/ChangeLog
--- a/src/ChangeLog	Sun Feb 28 04:34:19 2010 -0600
+++ b/src/ChangeLog	Sun Feb 28 04:47:31 2010 -0600
@@ -1,3 +1,21 @@
+2010-02-28  Ben Wing  <ben at xemacs.org>
+
+	* text.h:
+	* text.h (ibyte_first_byte_p):
+	* unicode.c:
+	* unicode.c (encode_unicode_char):
+	* unicode.c (unicode_convert):
+	* unicode.h:
+	Add non-Mule defn of ibyte_first_byte_p(), now needed.
+
+	Move some basic Unicode defns from text.h and unicode.c
+	to unicode.h.	Add unicode_error_octet_code_p() to check if
+	a particular Unicode codepoint is an error-octet codepoint.
+
+	Use unicode_error_octet_code_p() and UNICODE_OFFICIAL_MAX in
+	place of hard-coded values.  Simplify logic that handles
+	error-octets.
+
 2010-02-21  Vin Shelton  <acs at xemacs.org>
 
 	* events.c (event_pixel_translation): Simplify assertion for
diff -r 350284c64e7e -r d04006e313df src/text.h
--- a/src/text.h	Sun Feb 28 04:34:19 2010 -0600
+++ b/src/text.h	Sun Feb 28 04:47:31 2010 -0600
@@ -215,6 +215,7 @@
 #ifndef MULE
 
 #define rep_bytes_by_first_byte(fb) 1
+#define ibyte_first_byte_p(ptr) 1
 #define byte_ascii_p(byte) 1
 #define MAX_ICHAR_LEN 1
 
@@ -613,19 +614,7 @@
 /*                         Unicode conversion                           */
 /************************************************************************/
 
-/* Where to place the 256 private Unicode codepoints used for encoding
-   erroneous octets in a UTF-8 or UTF-16 file.  Note: This MUST be below
-   the space used for encoding unknown charset codepoints, which currently
-   starts at 0x800000.  See charset_codepoint_to_private_unicode(). */
-#define UNICODE_ERROR_OCTET_RANGE_START 0x200000
-#define UNICODE_ERROR_OCTET_RANGE_END (UNICODE_ERROR_OCTET_RANGE_START + 0xFF)
-
-#define valid_utf_16_first_surrogate(ch) (((ch) & 0xFC00) == 0xD800)
-#define valid_utf_16_last_surrogate(ch) (((ch) & 0xFC00) == 0xDC00)
-#define valid_utf_16_surrogate(ch) (((ch) & 0xF800) == 0xD800)
-
 typedef int (*charset_pred) (Lisp_Object);
-
 
 int old_mule_ichar_to_unicode (Ichar chr, enum converr fail);
 Ichar old_mule_unicode_to_ichar (int code, Lisp_Object precedence_array,
diff -r 350284c64e7e -r d04006e313df src/unicode.c
--- a/src/unicode.c	Sun Feb 28 04:34:19 2010 -0600
+++ b/src/unicode.c	Sun Feb 28 04:47:31 2010 -0600
@@ -237,24 +237,6 @@
 Lisp_Object Vcharset_descr;
 #endif
 #endif /* MULE */
-
-/* See the Unicode FAQ, http://www.unicode.org/faq/utf_bom.html#35 for this
-   algorithm. 
- 
-   (They also give another, really verbose one, as part of their explanation
-   of the various planes of the encoding, but we won't use that.) */
- 
-#define UTF_16_LEAD_OFFSET (0xD800 - (0x10000 >> 10))
-#define UTF_16_SURROGATE_OFFSET (0x10000 - (0xD800 << 10) - 0xDC00)
-
-#define utf_16_surrogates_to_code(lead, trail) \
-  (((lead) << 10) + (trail) + UTF_16_SURROGATE_OFFSET)
-
-#define CODE_TO_UTF_16_SURROGATES(codepoint, lead, trail) do {	\
-    int __ctu16s_code = (codepoint);				\
-    lead = UTF_16_LEAD_OFFSET + (__ctu16s_code >> 10);		\
-    trail = 0xDC00 + (__ctu16s_code & 0x3FF);			\
-} while (0)
 
 #ifdef MULE 
 
@@ -2462,19 +2444,21 @@
                      int write_error_characters_as_such)
 {
   ASSERT_VALID_UNICODE_CODEPOINT (code);
+
+  if (write_error_characters_as_such && 
+      unicode_error_octet_code_p (code))
+    {
+      Dynarr_add (dst, unicode_error_octet_code_to_octet (code));
+      return;
+    }
+  
   switch (type)
     {
     case UNICODE_UTF_16:
       /* Handle surrogates */
       if (code < 0x10000)
 	add_16_bit_char (code, dst, little_endian);
-      else if (write_error_characters_as_such && 
-	       code >= UNICODE_ERROR_OCTET_RANGE_START &&
-	       code <= UNICODE_ERROR_OCTET_RANGE_END)
-	{
-	  Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
-	}
-      else if (code < 0x110000)
+      else if (code <= UNICODE_OFFICIAL_MAX)
 	{
 	  int first, second;
 	  
@@ -2492,42 +2476,21 @@
 
     case UNICODE_UCS_4:
     case UNICODE_UTF_32:
+      /* We generate and accept incorrect sequences here, which is okay,
+	 in the interest of preservation of the user's data.  */
       if (little_endian)
 	{
-          if (write_error_characters_as_such && 
-              code >= UNICODE_ERROR_OCTET_RANGE_START &&
-              code <= UNICODE_ERROR_OCTET_RANGE_END)
-            {
-              Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
-            }
-          else
-            {
-              /* We generate and accept incorrect sequences here, which is
-                 okay, in the interest of preservation of the user's
-                 data.  */
-              Dynarr_add (dst, (unsigned char) (code & 255));
-              Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
-              Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
-              Dynarr_add (dst, (unsigned char) (code >> 24));
-            }
+	  Dynarr_add (dst, (unsigned char) (code & 255));
+	  Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+	  Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
+	  Dynarr_add (dst, (unsigned char) (code >> 24));
 	}
       else
 	{
-          if (write_error_characters_as_such && 
-              code >= UNICODE_ERROR_OCTET_RANGE_START &&
-              code <= UNICODE_ERROR_OCTET_RANGE_END)
-            {
-              Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
-            }
-          else
-            {
-              /* We generate and accept incorrect sequences here, which is okay,
-                 in the interest of preservation of the user's data.  */
-              Dynarr_add (dst, (unsigned char) (code >> 24));
-              Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
-              Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
-              Dynarr_add (dst, (unsigned char) (code & 255));
-            }
+	  Dynarr_add (dst, (unsigned char) (code >> 24));
+	  Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
+	  Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+	  Dynarr_add (dst, (unsigned char) (code & 255));
 	}
       break;
 
@@ -2545,22 +2508,7 @@
 	    if (code <= 0x7ff) bytes = 2;
 	    else if (code <= 0xffff) bytes = 3;
 	    else if (code <= 0x1fffff) bytes = 4;
-	    else if (code <= 0x3ffffff)
-             {
-#if !(UNICODE_ERROR_OCTET_RANGE_START > 0x1fffff \
-          && UNICODE_ERROR_OCTET_RANGE_START < 0x3ffffff)
-#error "This code needs to be rewritten. " 
-#endif
-	       if (write_error_characters_as_such && 
-		   code >= UNICODE_ERROR_OCTET_RANGE_START &&
-		   code <= UNICODE_ERROR_OCTET_RANGE_END)
-		 {
-		   Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
-		   break;
-		 }
-	       else
-		 bytes = 5;
-	     }
+	    else if (code <= 0x3ffffff) bytes = 5;
 	    else bytes = 6;
 
 	    Dynarr_add_many (dst, 0, bytes);
@@ -2657,9 +2605,9 @@
                   /* counter != 0 */
                   if ((0 == (c & 0x80)) || (0 != (c & 0x40)))
                     {
-                      indicate_invalid_utf_8(indicated_length, 
-                                             counter, 
-                                             ch, dst, data, ignore_bom);
+                      indicate_invalid_utf_8 (indicated_length, 
+					      counter, 
+					      ch, dst, data, ignore_bom);
                       if (c & 0x80)
                         {
                           DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
@@ -2685,12 +2633,13 @@
                           if ((ch < 0x80) ||
                               ((ch < 0x800) && indicated_length > 2) || 
                               ((ch < 0x10000) && indicated_length > 3) || 
-                              valid_utf_16_surrogate(ch) || (ch > 0x110000))
+                              valid_utf_16_surrogate (ch) ||
+			      (ch > UNICODE_OFFICIAL_MAX))
                             {
-                              indicate_invalid_utf_8(indicated_length, 
-                                                     counter, 
-                                                     ch, dst, data,
-                                                     ignore_bom);
+                              indicate_invalid_utf_8 (indicated_length, 
+						      counter, 
+						      ch, dst, data,
+						      ignore_bom);
                             }
                           else
                             {
@@ -2784,7 +2733,7 @@
 	      counter += 8;
 	      if (counter == 32)
 		{
-		  if (ch > 0x10ffff)
+		  if (ch > UNICODE_OFFICIAL_MAX)
 		    {
                       /* ch is not a legal Unicode character. We're fine
                          with that in UCS-4, though not in UTF-32. */
@@ -2796,24 +2745,24 @@
                       else if (little_endian)
                         {
                           DECODE_ERROR_OCTET (ch & 0xFF, dst, data, 
-                                            ignore_bom);
+					      ignore_bom);
                           DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
-                                            ignore_bom);
+					      ignore_bom);
                           DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
-                                            ignore_bom);
+					      ignore_bom);
                           DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
-                                            ignore_bom);
+					      ignore_bom);
                         }
                       else
                         {
                           DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
-                                            ignore_bom);
+					      ignore_bom);
                           DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
-                                            ignore_bom);
+					      ignore_bom);
                           DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
-                                            ignore_bom);
+					      ignore_bom);
                           DECODE_ERROR_OCTET (ch & 0xFF, dst, data, 
-                                            ignore_bom);
+					      ignore_bom);
                         }
 		    }
                   else
@@ -2886,7 +2835,7 @@
                                           ignore_bom); 
                     }
                 }
-              else assert(0);
+              else assert (0);
               break;
             }
           ch = 0;
diff -r 350284c64e7e -r d04006e313df src/unicode.h
--- a/src/unicode.h	Sun Feb 28 04:34:19 2010 -0600
+++ b/src/unicode.h	Sun Feb 28 04:47:31 2010 -0600
@@ -23,6 +23,10 @@
 #define INCLUDED_unicode_h_
 
 #ifdef MULE
+
+/************************************************************************/
+/*                          Precedence arrays                           */
+/************************************************************************/
 
 struct precedence_array
 {
@@ -51,4 +55,53 @@
 
 #endif /* MULE */
 
+/************************************************************************/
+/*                    Unicode error octet characters                    */
+/************************************************************************/
+
+/* Where to place the 256 private Unicode codepoints used for encoding
+   erroneous octets in a UTF-8 or UTF-16 file.  Note: This MUST be below
+   the space used for encoding unknown charset codepoints, which currently
+   starts at 0x800000.  See charset_codepoint_to_private_unicode(). */
+#define UNICODE_ERROR_OCTET_RANGE_START 0x200000
+#define UNICODE_ERROR_OCTET_RANGE_END (UNICODE_ERROR_OCTET_RANGE_START + 0xFF)
+
+DECLARE_INLINE_HEADER (
+int
+unicode_error_octet_code_p (int code)
+)
+{
+  return (code >= UNICODE_ERROR_OCTET_RANGE_START &&
+	  code <= UNICODE_ERROR_OCTET_RANGE_END);
+}
+
+#define unicode_error_octet_code_to_octet(code) \
+  ((unsigned char) ((code) & 0xFF))
+
+/************************************************************************/
+/*                          UTF-16 properties                           */
+/************************************************************************/
+
+#define valid_utf_16_first_surrogate(ch) (((ch) & 0xFC00) == 0xD800)
+#define valid_utf_16_last_surrogate(ch) (((ch) & 0xFC00) == 0xDC00)
+#define valid_utf_16_surrogate(ch) (((ch) & 0xF800) == 0xD800)
+
+/* See the Unicode FAQ, http://www.unicode.org/faq/utf_bom.html#35 for this
+   algorithm. 
+ 
+   (They also give another, really verbose one, as part of their explanation
+   of the various planes of the encoding, but we won't use that.) */
+ 
+#define UTF_16_LEAD_OFFSET (0xD800 - (0x10000 >> 10))
+#define UTF_16_SURROGATE_OFFSET (0x10000 - (0xD800 << 10) - 0xDC00)
+
+#define utf_16_surrogates_to_code(lead, trail) \
+  (((lead) << 10) + (trail) + UTF_16_SURROGATE_OFFSET)
+
+#define CODE_TO_UTF_16_SURROGATES(codepoint, lead, trail) do {	\
+    int __ctu16s_code = (codepoint);				\
+    lead = UTF_16_LEAD_OFFSET + (__ctu16s_code >> 10);		\
+    trail = 0xDC00 + (__ctu16s_code & 0x3FF);			\
+} while (0)
+
 #endif /* INCLUDED_unicode_h_ */



More information about the XEmacs-Patches mailing list