CVS update by aidan xemacs/tests/automated ...
xemacs-cvs at xemacs.org
xemacs-cvs at xemacs.org
Sun May 13 07:11:39 EDT 2007
User: aidan
Date: 07/05/13 13:11:39
Modified: xemacs/tests/automated mule-tests.el
Log:
Support non-BMP UTF-16.
Revision Changes Path
1.1060 +7 -0 XEmacs/xemacs/src/ChangeLog
Index: ChangeLog
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/ChangeLog,v
retrieving revision 1.1059
retrieving revision 1.1060
diff -u -p -r1.1059 -r1.1060
--- ChangeLog 2007/05/12 10:59:15 1.1059
+++ ChangeLog 2007/05/13 11:11:28 1.1060
@@ -1,3 +1,10 @@
+2007-04-30 Aidan Kehoe <kehoea at parhasard.net>
+
+ * unicode.c:
+ * unicode.c (encode_unicode_char_1):
+ * unicode.c (unicode_convert):
+ Support non-BMP characters in UTF-16.
+
2007-05-12 Aidan Kehoe <kehoea at parhasard.net>
* event-Xt.c (x_reset_modifier_mapping):
1.37 +75 -4 XEmacs/xemacs/src/unicode.c
Index: unicode.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/unicode.c,v
retrieving revision 1.36
retrieving revision 1.37
diff -u -p -r1.36 -r1.37
--- unicode.c 2006/12/29 18:09:51 1.36
+++ unicode.c 2007/05/13 11:11:30 1.37
@@ -200,6 +200,28 @@ Lisp_Object Qutf_16_little_endian_bom;
Lisp_Object Qutf_8_bom;
+/* See the Unicode FAQ, http://www.unicode.org/faq/utf_bom.html#35 for this
+ algorithm.
+
+ (They also give another, really verbose one, as part of their explanation
+ of the various planes of the encoding, but we won't use that.) */
+
+#define UTF_16_LEAD_OFFSET (0xD800 - (0x10000 >> 10))
+#define UTF_16_SURROGATE_OFFSET (0x10000 - (0xD800 << 10) - 0xDC00)
+
+#define utf_16_surrogates_to_code(lead, trail) \
+ (((lead) << 10) + (trail) + UTF_16_SURROGATE_OFFSET)
+
+#define CODE_TO_UTF_16_SURROGATES(codepoint, lead, trail) do { \
+ int __ctu16s_code = (codepoint); \
+ lead = UTF_16_LEAD_OFFSET + (__ctu16s_code >> 10); \
+ trail = 0xDC00 + (__ctu16s_code & 0x3FF); \
+} while (0)
+
+#define valid_utf_16_first_surrogate(ch) (((ch) & 0xFC00) == 0xD800)
+#define valid_utf_16_last_surrogate(ch) (((ch) & 0xFC00) == 0xDC00)
+#define valid_utf_16_surrogate(ch) (((ch) & 0xF800) == 0xD800)
+
#ifdef MULE
/* Using ints for to_unicode is OK (as long as they are >= 32 bits).
@@ -1742,13 +1764,39 @@ encode_unicode_char_1 (int code, unsigne
case UNICODE_UTF_16:
if (little_endian)
{
- Dynarr_add (dst, (unsigned char) (code & 255));
- Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+ if (code < 0x10000) {
+ Dynarr_add (dst, (unsigned char) (code & 255));
+ Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+ } else {
+ /* Little endian; least significant byte first. */
+ int first, second;
+
+ CODE_TO_UTF_16_SURROGATES(code, first, second);
+
+ Dynarr_add (dst, (unsigned char) (first & 255));
+ Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
+
+ Dynarr_add (dst, (unsigned char) (second & 255));
+ Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
+ }
}
else
{
- Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
- Dynarr_add (dst, (unsigned char) (code & 255));
+ if (code < 0x10000) {
+ Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+ Dynarr_add (dst, (unsigned char) (code & 255));
+ } else {
+ /* Big endian; most significant byte first. */
+ int first, second;
+
+ CODE_TO_UTF_16_SURROGATES(code, first, second);
+
+ Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
+ Dynarr_add (dst, (unsigned char) (first & 255));
+
+ Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
+ Dynarr_add (dst, (unsigned char) (second & 255));
+ }
}
break;
@@ -1919,17 +1967,40 @@ unicode_convert (struct coding_stream *s
break;
case UNICODE_UTF_16:
+
if (little_endian)
ch = (c << counter) | ch;
else
ch = (ch << 8) | c;
counter += 8;
+
+ if (counter == 16 && valid_utf_16_first_surrogate(ch))
+ break;
+
if (counter == 16)
{
int tempch = ch;
ch = 0;
counter = 0;
decode_unicode_char (tempch, dst, data, ignore_bom);
+ }
+ if (counter == 32)
+ {
+ int tempch;
+ /* #### Signalling an error may be a bit extreme. Should
+ we try and read it in anyway? */
+ if (!valid_utf_16_first_surrogate(ch >> 16)
+ || !valid_utf_16_last_surrogate(ch & 0xFFFF))
+ {
+ signal_error(Qtext_conversion_error,
+ "Invalid UTF-16 surrogate sequence",
+ Qunbound);
+ }
+ tempch = utf_16_surrogates_to_code((ch >> 16),
+ (ch & 0xffff));
+ ch = 0;
+ counter = 0;
+ decode_unicode_char(tempch, dst, data, ignore_bom);
}
break;
1.90 +5 -0 XEmacs/xemacs/tests/ChangeLog
Index: ChangeLog
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/tests/ChangeLog,v
retrieving revision 1.89
retrieving revision 1.90
diff -u -p -r1.89 -r1.90
--- ChangeLog 2007/05/12 10:17:07 1.89
+++ ChangeLog 2007/05/13 11:11:37 1.90
@@ -1,3 +1,8 @@
+2007-04-30 Aidan Kehoe <kehoea at parhasard.net>
+
+ * automated/mule-tests.el (featurep):
+ Minimal tests of the non-BMP UTF-16 support.
+
2007-05-12 Aidan Kehoe <kehoea at parhasard.net>
* automated/mule-tests.el:
1.16 +9 -1 XEmacs/xemacs/tests/automated/mule-tests.el
Index: mule-tests.el
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/tests/automated/mule-tests.el,v
retrieving revision 1.15
retrieving revision 1.16
diff -u -p -r1.15 -r1.16
--- mule-tests.el 2007/05/12 10:17:09 1.15
+++ mule-tests.el 2007/05/13 11:11:38 1.16
@@ -341,9 +341,9 @@ This is a naive implementation in Lisp.
'utf-8
'iso-8859-2))
)
- ;; This is how you suppress output from `message', called by `write-region'
(Assert (not (equal name1 name2)))
(Assert (not (file-exists-p name1)))
+ ;; This is how you suppress output from `message', called by `write-region'
(Silence-Message
(write-region (point-min) (point-max) name1))
(Assert (file-exists-p name1))
@@ -401,6 +401,14 @@ This is a naive implementation in Lisp.
(Assert (equal (concat "\033%G" utf-8-char)
(encode-coding-string xemacs-character 'ctext))))))
+ (loop
+ for (code-point encoded)
+ in '((#x10000 "\xd8\x00\xdc\x00")
+ (#x10FFFD "\xdb\xff\xdf\xfd"))
+ do (Assert (equal (encode-coding-string
+ (decode-char 'ucs code-point) 'utf-16)
+ encoded)))
+
;;---------------------------------------------------------------
;; Regression test for a couple of CCL-related bugs.
;;---------------------------------------------------------------
More information about the XEmacs-CVS
mailing list