unicode-internal-commit: some fixes/cleanups of Unicode code
Ben Wing
ben at xemacs.org
Tue Mar 23 08:18:51 EDT 2010
changeset: 5184:d04006e313df
branch: ben-unicode-internal
user: Ben Wing <ben at xemacs.org>
date: Sun Feb 28 04:47:31 2010 -0600
files: src/ChangeLog src/text.h src/unicode.c src/unicode.h
description:
some fixes/cleanups of Unicode code
-------------------- ChangeLog entries follow: --------------------
src/ChangeLog addition:
2010-02-28 Ben Wing <ben at xemacs.org>
* text.h:
* text.h (ibyte_first_byte_p):
* unicode.c:
* unicode.c (encode_unicode_char):
* unicode.c (unicode_convert):
* unicode.h:
Add non-Mule defn of ibyte_first_byte_p(), now needed.
Move some basic Unicode defns from text.h and unicode.c
to unicode.h. Add unicode_error_octet_code_p() to check if
a particular Unicode codepoint is an error-octet codepoint.
Use unicode_error_octet_code_p() and UNICODE_OFFICIAL_MAX in
place of hard-coded values. Simplify logic that handles
error-octets.
diff -r 350284c64e7e -r d04006e313df src/ChangeLog
--- a/src/ChangeLog Sun Feb 28 04:34:19 2010 -0600
+++ b/src/ChangeLog Sun Feb 28 04:47:31 2010 -0600
@@ -1,3 +1,21 @@
+2010-02-28 Ben Wing <ben at xemacs.org>
+
+ * text.h:
+ * text.h (ibyte_first_byte_p):
+ * unicode.c:
+ * unicode.c (encode_unicode_char):
+ * unicode.c (unicode_convert):
+ * unicode.h:
+ Add non-Mule defn of ibyte_first_byte_p(), now needed.
+
+ Move some basic Unicode defns from text.h and unicode.c
+ to unicode.h. Add unicode_error_octet_code_p() to check if
+ a particular Unicode codepoint is an error-octet codepoint.
+
+ Use unicode_error_octet_code_p() and UNICODE_OFFICIAL_MAX in
+ place of hard-coded values. Simplify logic that handles
+ error-octets.
+
2010-02-21 Vin Shelton <acs at xemacs.org>
* events.c (event_pixel_translation): Simplify assertion for
diff -r 350284c64e7e -r d04006e313df src/text.h
--- a/src/text.h Sun Feb 28 04:34:19 2010 -0600
+++ b/src/text.h Sun Feb 28 04:47:31 2010 -0600
@@ -215,6 +215,7 @@
#ifndef MULE
#define rep_bytes_by_first_byte(fb) 1
+#define ibyte_first_byte_p(ptr) 1
#define byte_ascii_p(byte) 1
#define MAX_ICHAR_LEN 1
@@ -613,19 +614,7 @@
/* Unicode conversion */
/************************************************************************/
-/* Where to place the 256 private Unicode codepoints used for encoding
- erroneous octets in a UTF-8 or UTF-16 file. Note: This MUST be below
- the space used for encoding unknown charset codepoints, which currently
- starts at 0x800000. See charset_codepoint_to_private_unicode(). */
-#define UNICODE_ERROR_OCTET_RANGE_START 0x200000
-#define UNICODE_ERROR_OCTET_RANGE_END (UNICODE_ERROR_OCTET_RANGE_START + 0xFF)
-
-#define valid_utf_16_first_surrogate(ch) (((ch) & 0xFC00) == 0xD800)
-#define valid_utf_16_last_surrogate(ch) (((ch) & 0xFC00) == 0xDC00)
-#define valid_utf_16_surrogate(ch) (((ch) & 0xF800) == 0xD800)
-
typedef int (*charset_pred) (Lisp_Object);
-
int old_mule_ichar_to_unicode (Ichar chr, enum converr fail);
Ichar old_mule_unicode_to_ichar (int code, Lisp_Object precedence_array,
diff -r 350284c64e7e -r d04006e313df src/unicode.c
--- a/src/unicode.c Sun Feb 28 04:34:19 2010 -0600
+++ b/src/unicode.c Sun Feb 28 04:47:31 2010 -0600
@@ -237,24 +237,6 @@
Lisp_Object Vcharset_descr;
#endif
#endif /* MULE */
-
-/* See the Unicode FAQ, http://www.unicode.org/faq/utf_bom.html#35 for this
- algorithm.
-
- (They also give another, really verbose one, as part of their explanation
- of the various planes of the encoding, but we won't use that.) */
-
-#define UTF_16_LEAD_OFFSET (0xD800 - (0x10000 >> 10))
-#define UTF_16_SURROGATE_OFFSET (0x10000 - (0xD800 << 10) - 0xDC00)
-
-#define utf_16_surrogates_to_code(lead, trail) \
- (((lead) << 10) + (trail) + UTF_16_SURROGATE_OFFSET)
-
-#define CODE_TO_UTF_16_SURROGATES(codepoint, lead, trail) do { \
- int __ctu16s_code = (codepoint); \
- lead = UTF_16_LEAD_OFFSET + (__ctu16s_code >> 10); \
- trail = 0xDC00 + (__ctu16s_code & 0x3FF); \
-} while (0)
#ifdef MULE
@@ -2462,19 +2444,21 @@
int write_error_characters_as_such)
{
ASSERT_VALID_UNICODE_CODEPOINT (code);
+
+ if (write_error_characters_as_such &&
+ unicode_error_octet_code_p (code))
+ {
+ Dynarr_add (dst, unicode_error_octet_code_to_octet (code));
+ return;
+ }
+
switch (type)
{
case UNICODE_UTF_16:
/* Handle surrogates */
if (code < 0x10000)
add_16_bit_char (code, dst, little_endian);
- else if (write_error_characters_as_such &&
- code >= UNICODE_ERROR_OCTET_RANGE_START &&
- code <= UNICODE_ERROR_OCTET_RANGE_END)
- {
- Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
- }
- else if (code < 0x110000)
+ else if (code <= UNICODE_OFFICIAL_MAX)
{
int first, second;
@@ -2492,42 +2476,21 @@
case UNICODE_UCS_4:
case UNICODE_UTF_32:
+ /* We generate and accept incorrect sequences here, which is okay,
+ in the interest of preservation of the user's data. */
if (little_endian)
{
- if (write_error_characters_as_such &&
- code >= UNICODE_ERROR_OCTET_RANGE_START &&
- code <= UNICODE_ERROR_OCTET_RANGE_END)
- {
- Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
- }
- else
- {
- /* We generate and accept incorrect sequences here, which is
- okay, in the interest of preservation of the user's
- data. */
- Dynarr_add (dst, (unsigned char) (code & 255));
- Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
- Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
- Dynarr_add (dst, (unsigned char) (code >> 24));
- }
+ Dynarr_add (dst, (unsigned char) (code & 255));
+ Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+ Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
+ Dynarr_add (dst, (unsigned char) (code >> 24));
}
else
{
- if (write_error_characters_as_such &&
- code >= UNICODE_ERROR_OCTET_RANGE_START &&
- code <= UNICODE_ERROR_OCTET_RANGE_END)
- {
- Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
- }
- else
- {
- /* We generate and accept incorrect sequences here, which is okay,
- in the interest of preservation of the user's data. */
- Dynarr_add (dst, (unsigned char) (code >> 24));
- Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
- Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
- Dynarr_add (dst, (unsigned char) (code & 255));
- }
+ Dynarr_add (dst, (unsigned char) (code >> 24));
+ Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
+ Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+ Dynarr_add (dst, (unsigned char) (code & 255));
}
break;
@@ -2545,22 +2508,7 @@
if (code <= 0x7ff) bytes = 2;
else if (code <= 0xffff) bytes = 3;
else if (code <= 0x1fffff) bytes = 4;
- else if (code <= 0x3ffffff)
- {
-#if !(UNICODE_ERROR_OCTET_RANGE_START > 0x1fffff \
- && UNICODE_ERROR_OCTET_RANGE_START < 0x3ffffff)
-#error "This code needs to be rewritten. "
-#endif
- if (write_error_characters_as_such &&
- code >= UNICODE_ERROR_OCTET_RANGE_START &&
- code <= UNICODE_ERROR_OCTET_RANGE_END)
- {
- Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
- break;
- }
- else
- bytes = 5;
- }
+ else if (code <= 0x3ffffff) bytes = 5;
else bytes = 6;
Dynarr_add_many (dst, 0, bytes);
@@ -2657,9 +2605,9 @@
/* counter != 0 */
if ((0 == (c & 0x80)) || (0 != (c & 0x40)))
{
- indicate_invalid_utf_8(indicated_length,
- counter,
- ch, dst, data, ignore_bom);
+ indicate_invalid_utf_8 (indicated_length,
+ counter,
+ ch, dst, data, ignore_bom);
if (c & 0x80)
{
DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
@@ -2685,12 +2633,13 @@
if ((ch < 0x80) ||
((ch < 0x800) && indicated_length > 2) ||
((ch < 0x10000) && indicated_length > 3) ||
- valid_utf_16_surrogate(ch) || (ch > 0x110000))
+ valid_utf_16_surrogate (ch) ||
+ (ch > UNICODE_OFFICIAL_MAX))
{
- indicate_invalid_utf_8(indicated_length,
- counter,
- ch, dst, data,
- ignore_bom);
+ indicate_invalid_utf_8 (indicated_length,
+ counter,
+ ch, dst, data,
+ ignore_bom);
}
else
{
@@ -2784,7 +2733,7 @@
counter += 8;
if (counter == 32)
{
- if (ch > 0x10ffff)
+ if (ch > UNICODE_OFFICIAL_MAX)
{
/* ch is not a legal Unicode character. We're fine
with that in UCS-4, though not in UTF-32. */
@@ -2796,24 +2745,24 @@
else if (little_endian)
{
DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
- ignore_bom);
+ ignore_bom);
DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
- ignore_bom);
+ ignore_bom);
DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
- ignore_bom);
+ ignore_bom);
DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
- ignore_bom);
+ ignore_bom);
}
else
{
DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
- ignore_bom);
+ ignore_bom);
DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
- ignore_bom);
+ ignore_bom);
DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
- ignore_bom);
+ ignore_bom);
DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
- ignore_bom);
+ ignore_bom);
}
}
else
@@ -2886,7 +2835,7 @@
ignore_bom);
}
}
- else assert(0);
+ else assert (0);
break;
}
ch = 0;
diff -r 350284c64e7e -r d04006e313df src/unicode.h
--- a/src/unicode.h Sun Feb 28 04:34:19 2010 -0600
+++ b/src/unicode.h Sun Feb 28 04:47:31 2010 -0600
@@ -23,6 +23,10 @@
#define INCLUDED_unicode_h_
#ifdef MULE
+
+/************************************************************************/
+/* Precedence arrays */
+/************************************************************************/
struct precedence_array
{
@@ -51,4 +55,53 @@
#endif /* MULE */
+/************************************************************************/
+/* Unicode error octet characters */
+/************************************************************************/
+
+/* Where to place the 256 private Unicode codepoints used for encoding
+ erroneous octets in a UTF-8 or UTF-16 file. Note: This MUST be below
+ the space used for encoding unknown charset codepoints, which currently
+ starts at 0x800000. See charset_codepoint_to_private_unicode(). */
+#define UNICODE_ERROR_OCTET_RANGE_START 0x200000
+#define UNICODE_ERROR_OCTET_RANGE_END (UNICODE_ERROR_OCTET_RANGE_START + 0xFF)
+
+DECLARE_INLINE_HEADER (
+int
+unicode_error_octet_code_p (int code)
+)
+{
+ return (code >= UNICODE_ERROR_OCTET_RANGE_START &&
+ code <= UNICODE_ERROR_OCTET_RANGE_END);
+}
+
+#define unicode_error_octet_code_to_octet(code) \
+ ((unsigned char) ((code) & 0xFF))
+
+/************************************************************************/
+/* UTF-16 properties */
+/************************************************************************/
+
+#define valid_utf_16_first_surrogate(ch) (((ch) & 0xFC00) == 0xD800)
+#define valid_utf_16_last_surrogate(ch) (((ch) & 0xFC00) == 0xDC00)
+#define valid_utf_16_surrogate(ch) (((ch) & 0xF800) == 0xD800)
+
+/* See the Unicode FAQ, http://www.unicode.org/faq/utf_bom.html#35 for this
+ algorithm.
+
+ (They also give another, really verbose one, as part of their explanation
+ of the various planes of the encoding, but we won't use that.) */
+
+#define UTF_16_LEAD_OFFSET (0xD800 - (0x10000 >> 10))
+#define UTF_16_SURROGATE_OFFSET (0x10000 - (0xD800 << 10) - 0xDC00)
+
+#define utf_16_surrogates_to_code(lead, trail) \
+ (((lead) << 10) + (trail) + UTF_16_SURROGATE_OFFSET)
+
+#define CODE_TO_UTF_16_SURROGATES(codepoint, lead, trail) do { \
+ int __ctu16s_code = (codepoint); \
+ lead = UTF_16_LEAD_OFFSET + (__ctu16s_code >> 10); \
+ trail = 0xDC00 + (__ctu16s_code & 0x3FF); \
+} while (0)
+
#endif /* INCLUDED_unicode_h_ */
More information about the XEmacs-Patches mailing list