User: aidan
Date: 06/06/03 19:51:06
Modified: xemacs/tests/automated mule-tests.el
Log:
Just-in-time Unicode code point support.
Revision Changes Path
1.748 +18 -0 XEmacs/xemacs/lisp/ChangeLog
Index: ChangeLog
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/lisp/ChangeLog,v
retrieving revision 1.747
retrieving revision 1.748
diff -u -p -r1.747 -r1.748
--- ChangeLog 2006/06/02 22:18:05 1.747
+++ ChangeLog 2006/06/03 17:50:39 1.748
@@ -1,3 +1,21 @@
+2006-06-03 Aidan Kehoe <kehoea(a)parhasard.net>
+
+ * mule/mule-ccl.el:
+ "X Emacs" -> "XEmacs"
+ * mule/mule-ccl.el (ccl-compile-mule-to-unicode): New.
+ * mule/mule-ccl.el (ccl-compile-unicode-to-mule): New.
+ * mule/mule-ccl.el (ccl-dump-mule-to-unicode): New.
+ * mule/mule-ccl.el (ccl-dump-unicode-to-mule): New.
+ * mule/mule-ccl.el (define-ccl-program):
+ Add two new CCL commands, and commands to describe them; document
+ them.
+
+2006-06-03 Aidan Kehoe <kehoea(a)parhasard.net>
+
+ * unicode.el:
+ * unicode.el (featurep):
+ * x-init.el (x-initialize-keyboard):
+
2006-06-03 Adrian Aichner <adrian(a)xemacs.org>
* package-get.el (package-get-download-sites): Welcome
1.13 +32 -0 XEmacs/xemacs/lisp/unicode.el
Index: unicode.el
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/lisp/unicode.el,v
retrieving revision 1.12
retrieving revision 1.13
diff -u -p -r1.12 -r1.13
--- unicode.el 2005/03/04 21:59:43 1.12
+++ unicode.el 2006/06/03 17:50:40 1.13
@@ -318,6 +318,38 @@ The second argument must be 'ucs, the th
"Sorry, encode-char doesn't yet support anything but the UCS. ")
(char-to-unicode char))
+(when (featurep 'mule)
+ ;; This CCL program is used for displaying the fallback UCS character set,
+ ;; and can be repurposed for the lao and ipa charsets, all going well.
+ ;;
+ ;; define-ccl-program is available after mule-ccl is loaded, much later
+ ;; than this file in the build process. The below is the result of
+ ;;
+ ;; (macroexpand
+ ;; '(define-ccl-program ccl-encode-to-ucs-2
+ ;; `(1
+ ;; ((r1 = (r1 << 8))
+ ;; (r1 = (r1 | r2))
+ ;; (mule-to-unicode r0 r1)
+ ;; (r1 = (r0 >> 8))
+ ;; (r2 = (r0 & 255))))
+ ;; "CCL program to transform Mule characters to UCS-2."))
+ ;;
+ ;; and it should occasionally be confirmed that the correspondence still
+ ;; holds.
+
+ (let ((prog [1 10 131127 8 98872 65823 147513 8 82009 255 22]))
+ (defconst ccl-encode-to-ucs-2 prog
+ "CCL program to transform Mule characters to UCS-2.")
+ (put (quote ccl-encode-to-ucs-2) (quote ccl-program-idx)
+ (register-ccl-program (quote ccl-encode-to-ucs-2) prog)) nil))
+
+;; Won't do this just yet, though.
+;; (set-charset-registry 'lao "iso10646-1")
+;; (set-charset-ccl-program 'lao 'ccl-encode-to-ucs-2)
+;; (set-charset-registry 'ipa "iso10646-1")
+;; (set-charset-ccl-program 'ipa 'ccl-encode-to-ucs-2)
+
;; #### UTF-7 is not yet implemented, and it's tricky to do. There's
;; an implementation in appendix A.1 of the Unicode Standard, Version
;; 2.0, but I don't know its licensing characteristics.
1.17 +1 -1 XEmacs/xemacs/lisp/x-init.el
Index: x-init.el
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/lisp/x-init.el,v
retrieving revision 1.16
retrieving revision 1.17
diff -u -p -r1.16 -r1.17
--- x-init.el 2006/04/25 14:01:54 1.16
+++ x-init.el 2006/06/03 17:50:40 1.17
@@ -247,7 +247,7 @@
(when (and (not (get (intern sym-string) 'character-of-keysym))
(string-match "^U[0-9A-F]+$" sym-string))
(pushnew (concat sym-string " ") unknown-code-points :test 'equal)))
- (when unknown-code-points
+ (when (and (featurep 'mule) unknown-code-points)
(lwarn 'key-mapping 'info
"Undefined Unicode key mappings.
Your keyboard has, among many others, the following keysyms defined:
1.10 +46 -11 XEmacs/xemacs/lisp/mule/mule-ccl.el
Index: mule-ccl.el
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/lisp/mule/mule-ccl.el,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -p -r1.9 -r1.10
--- mule-ccl.el 2005/05/05 17:10:38 1.9
+++ mule-ccl.el 2006/06/03 17:50:45 1.10
@@ -5,20 +5,20 @@
;; Keywords: CCL, mule, multilingual, character set, coding-system
-;; This file is part of X Emacs.
+;; This file is part of XEmacs.
-;; GNU Emacs is free software; you can redistribute it and/or modify
+;; XEmacs is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.
-;; GNU Emacs is distributed in the hope that it will be useful,
+;; XEmacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
-;; along with GNU Emacs; see the file COPYING. If not, write to the
+;; along with XEmacs; see the file COPYING. If not, write to the
;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
;; Boston, MA 02111-1307, USA.
@@ -48,7 +48,7 @@
[if branch loop break repeat write-repeat write-read-repeat
read read-if read-branch write call end
read-multibyte-character write-multibyte-character
- translate-character
+ translate-character mule-to-unicode unicode-to-mule
iterate-multiple-map map-multiple map-single]
"Vector of CCL commands (symbols).")
@@ -100,7 +100,9 @@
write-multibyte-character
translate-character
translate-character-const-tbl
- nil nil nil nil nil nil nil nil nil nil nil nil ; 0x04-0x0f
+ mule-to-unicode
+ unicode-to-mule
+ nil nil nil nil nil nil nil nil nil nil ; 0x06-0x0f
iterate-multiple-map
map-multiple
map-single
@@ -830,6 +832,29 @@
(ccl-embed-extended-command 'translate-character rrr RRR Rrr))))
nil)
+;; Compile mule-to-unicode
+(defun ccl-compile-mule-to-unicode (cmd)
+ (if (/= (length cmd) 3)
+ (error "CCL: Invalid number of arguments: %s" cmd))
+ (let ((RRR (nth 1 cmd))
+ (rrr (nth 2 cmd)))
+ (ccl-check-register RRR cmd)
+ (ccl-check-register rrr cmd)
+ (ccl-embed-extended-command 'mule-to-unicode RRR rrr 0))
+ nil)
+
+;; Given a Unicode code point in register rrr, write the charset ID of the
+;; corresponding character in RRR, and the Mule-CCL form of its code in rrr.
+(defun ccl-compile-unicode-to-mule (cmd)
+ (if (/= (length cmd) 3)
+ (error "CCL: Invalid number of arguments: %s" cmd))
+ (let ((rrr (nth 1 cmd))
+ (RRR (nth 2 cmd)))
+ (ccl-check-register rrr cmd)
+ (ccl-check-register RRR cmd)
+ (ccl-embed-extended-command 'unicode-to-mule rrr RRR 0))
+ nil)
+
(defun ccl-compile-iterate-multiple-map (cmd)
(ccl-compile-multiple-map-function 'iterate-multiple-map cmd)
nil)
@@ -1188,6 +1213,12 @@
(let ((tbl (ccl-get-next-code)))
(insert (format "translation table(%S) r%d r%d\n" tbl RRR rrr))))
+(defun ccl-dump-mule-to-unicode (rrr RRR Rrr)
+ (insert (format "change chars in r%d and r%d to unicode\n" RRR rrr)))
+
+(defun ccl-dump-unicode-to-mule (rrr RRR Rrr)
+ (insert (format "converter UCS code %d to a Mule char\n" rrr)))
+
(defun ccl-dump-iterate-multiple-map (rrr RRR Rrr)
(let ((notbl (ccl-get-next-code))
(i 0) id)
@@ -1358,10 +1389,14 @@ WRITE :=
;; Call CCL program whose name is ccl-program-name.
CALL := (call ccl-program-name)
+
+TRANSLATE:= ;; Not implemented under XEmacs, except mule-to-unicode and
+ ;; unicode-to-mule.
+ (translate-character REG(table) REG(charset) REG(codepoint))
+ | (translate-character SYMBOL REG(charset) REG(codepoint))
+ | (mule-to-unicode REG(charset) REG(codepoint))
+ | (unicode-to-mule REG(unicode,code) REG(CHARSET))
-TRANSLATE:= ;; Not implemented under XEmacs.
- (translate-character REG(table) REG(charset) REG(codepoint))
- | (translate-character SYMBOL REG(charset) REG(codepoint))
MAP :=
(iterate-multiple-map REG REG MAP-IDs)
| (map-multiple REG REG (MAP-SET))
@@ -1373,8 +1408,8 @@ MAP-ID := INT-OR-CHAR
;; Terminate the CCL program.
END := (end)
-;; CCL registers. These can contain any integer value. As r7 is used by CCL
-;; interpreter itself, its value change unexpectedly.
+;; CCL registers. These can contain any integer value. As r7 is used by the
+;; CCL interpreter itself, its value can change unexpectedly.
REG := r0 | r1 | r2 | r3 | r4 | r5 | r6 | r7
ARG := REG | INT-OR-CHAR
1.343 +7 -0 XEmacs/xemacs/man/ChangeLog
Index: ChangeLog
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/man/ChangeLog,v
retrieving revision 1.342
retrieving revision 1.343
diff -u -p -r1.342 -r1.343
--- ChangeLog 2006/05/16 15:36:15 1.342
+++ ChangeLog 2006/06/03 17:50:46 1.343
@@ -1,3 +1,10 @@
+2006-06-03 Aidan Kehoe <kehoea(a)parhasard.net>
+
+ * lispref/mule.texi (CCL Syntax):
+ * lispref/mule.texi (CCL Statements):
+ Describe the mule-to-unicode and unicode-to-mule statements;
+ rename the section they are described in.
+
2006-05-17 Stephen J. Turnbull <stephen(a)xemacs.org>
* xemacs-faq.texi (Q2.2.3): New node.
1.15 +32 -2 XEmacs/xemacs/man/lispref/mule.texi
Index: mule.texi
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/man/lispref/mule.texi,v
retrieving revision 1.14
retrieving revision 1.15
diff -u -p -r1.14 -r1.15
--- mule.texi 2005/06/19 20:49:47 1.14
+++ mule.texi 2006/06/03 17:50:48 1.15
@@ -1825,6 +1825,15 @@ WRITE :=
| (write INT-OR-CHAR) | (write string) | (write REG ARRAY)
| string
CALL := (call ccl-program-name)
+
+
+TRANSLATE:= ;; Not implemented under XEmacs, except mule-to-unicode and
+ ;; unicode-to-mule.
+ (translate-character REG(table) REG(charset) REG(codepoint))
+ | (translate-character SYMBOL REG(charset) REG(codepoint))
+ | (mule-to-unicode REG(charset) REG(codepoint))
+ | (unicode-to-mule REG(unicode,code) REG(CHARSET))
+
END := (end)
REG := r0 | r1 | r2 | r3 | r4 | r5 | r6 | r7
@@ -1845,7 +1854,8 @@ INT-OR-CHAR := integer | character
The Emacs Code Conversion Language provides the following statement
types: @dfn{set}, @dfn{if}, @dfn{branch}, @dfn{loop}, @dfn{repeat},
-@dfn{break}, @dfn{read}, @dfn{write}, @dfn{call}, and @dfn{end}.
+@dfn{break}, @dfn{read}, @dfn{write}, @dfn{call}, @dfn{translate} and
+@dfn{end}.
@heading Set statement:
@@ -1933,11 +1943,31 @@ reads from the input into the register,
@code{write} and @code{read} statements for the semantics of the I/O
operations for each type of argument.
-@heading Other control statements:
+@heading Other statements:
The @dfn{call} statement, written @samp{(call @var{ccl-program-name})},
executes a CCL program as a subroutine. It does not return a value to
the caller, but can modify the register status.
+
+ The @dfn{mule-to-unicode} statement translates an XEmacs character into a
+UCS code point, using U+FFFD REPLACEMENT CHARACTER if the given XEmacs
+character has no known corresponding code point. It takes two
+arguments; the first is a register in which is stored the character set
+ID of the character to be translated, and into which the UCS code is
+stored. The second is a register which stores the XEmacs code of the
+character in question; if it is from a multidimensional character set,
+like most of the East Asian national sets, it's stored as @samp{((c1 <<
+8) | c2)}, where @samp{c1} is the first code, and @samp{c2} the second.
+(That is, as a single integer, the high-order eight bits of which encode
+the first position code, and the low order bits of which encode the
+second.)
+
+ The @dfn{unicode-to-mule} statement translates a Unicode code point
+(an integer) into an XEmacs character. Its first argument is a register
+containing the UCS code point; the code for the corresponding character
+will be written into this register, in the same format as for
+@samp{mule-to-unicode}. The second argument is a register into which will
+be written the character set ID of the converted character.
The @dfn{end} statement, written @samp{(end)}, terminates the CCL
program successfully, and returns to caller (which may be a CCL
1.971 +87 -0 XEmacs/xemacs/src/ChangeLog
Index: ChangeLog
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/ChangeLog,v
retrieving revision 1.970
retrieving revision 1.971
diff -u -p -r1.970 -r1.971
--- ChangeLog 2006/05/30 06:21:31 1.970
+++ ChangeLog 2006/06/03 17:50:51 1.971
@@ -1,3 +1,90 @@
+2006-06-03 Aidan Kehoe <kehoea(a)parhasard.net>
+
+ * charset.h:
+ * charset.h (struct Lisp_Charset):
+ * charset.h (CHARSET_ENCODE_AS_UTF_8):
+ * charset.h (XCHARSET_ENCODE_AS_UTF_8):
+ Add a flag `encode-as-utf-8' to the Mule charset structure; if
+ set, it's an indication to ISO 2022-oriented coding systems that
+ the characters of that charset should be encoded using the ISO-IR
+ 196 UTF-8 escape syntax, since they're not members of any other
+ well-known character set we're aware of.
+
+ Make enum unicode_type, encode_unicode_char and Funicode_to_char
+ available outside of unicode.c
+
+ * lread.c:
+ * event-xlike-inc.c:
+ Use the charset.h declaration of Funicode_to_char, don't declare
+ it ourselves.
+
+ My XFree86 installation has taken to passing me ASCII characters
+ using the Unicode keysyms; accept them too.
+
+ * general-slots.h:
+ Make `ccl-program' and `encode-as-utf-8' available as symbols
+ generally.
+
+ * mule-ccl.c:
+ Add CCL_MuleToUnicode, CCL_UnicodeToMule, implement them, enable
+ and debug CCL_MAKE_CHAR, have CCL_WriteMultibyteChar2 segfault
+ less, fix some grammar.
+
+ * mule-charset.c (make_charset):
+ * mule-charset.c (Fmake_charset):
+ * mule-charset.c (Fcharset_property):
+ * mule-charset.c (complex_vars_of_mule_charset):
+ Require the encode_as_utf_8 property when calling make_charset ();
+ accept it when creating a charset from Lisp in Fmake_charset.
+
+ * mule-coding.c:
+ * mule-coding.c (dynarr_add_2022_one_dimension):
+ * mule-coding.c (dynarr_add_2022_two_dimensions):
+ Add two convenience functions for iso2022_decode, to abstract out
+ writing UTF-8 a little.
+
+ * mule-coding.c (enum iso_esc_flag):
+ Add one more state to reflect the existence of the UTF-8 escape.
+
+ * mule-coding.c (struct iso2022_coding_stream):
+ Add a counter variable to the state to permit handling
+ variable-length UTF-8.
+
+ * mule-coding.c (parse_iso2022_esc):
+ Update the function to work with ISO_STATE_UTF_8; only the ESC % @
+ escape is processed in that state, everything else is ignored and
+ passed through by the error handler.
+
+ * mule-coding.c (iso2022_decode):
+ * mule-coding.c (iso2022_designate):
+ * mule-coding.c (iso2022_encode):
+ Handle the UTF-8 escape sequences in reading and in writing ISO
+ 2022.
+
+ * redisplay-x.c (separate_textual_runs):
+ Add a comment to the effect that the dimension stuff breaks when
+ using CCL programs and registries to map to a bigger charset.
+
+ * unicode.c:
+ Add support for creating new characters on the fly as unknown
+ Unicode code points are encountered.
+
+ * unicode.c (get_free_codepoint): New.
+ * unicode.c (unicode_to_ichar): Reworked to create new code points
+ on the fly.
+ * unicode.c (Funicode_to_char): Update the docstring.
+ * unicode.c (struct unicode_coding_system):
+ Move enum unicode_type into charset.h.
+
+ * unicode.c (encode_unicode_char):
+ encode_unicode_char isn't static any longer, mule-coding.c uses
+ it.
+ * unicode.c (syms_of_unicode):
+ Make a couple of symbols available to unicode.c
+ * unicode.c (vars_of_unicode):
+ Tell the garbage collector about current_jit_charset, initialise
+ it.
+
2006-05-25 Stephen J. Turnbull <stephen(a)xemacs.org>
* objects-x.c (x_find_charset_font): Keep local names local; wrap
1.13 +26 -0 XEmacs/xemacs/src/charset.h
Index: charset.h
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/charset.h,v
retrieving revision 1.12
retrieving revision 1.13
diff -u -p -r1.12 -r1.13
--- charset.h 2005/10/24 10:07:34 1.12
+++ charset.h 2006/06/03 17:50:54 1.13
@@ -229,6 +229,11 @@ struct Lisp_Charset
/* Which half of font to be used to display this character set */
int graphic;
+ /* If set, this charset should be written out in ISO-2022-based coding
+ systems using the escape sequence for UTF-8, not using our internal
+ representation and the associated real ISO 2022 designation. */
+ unsigned int encode_as_utf_8 :1;
+
/* If set, this is a "temporary" charset created when we encounter
an unknown final. This is so that we can successfully compile
and load such files. We allow a real charset to be created on top
@@ -261,6 +266,7 @@ DECLARE_LRECORD (charset, Lisp_Charset);
#define CHARSET_REP_BYTES(cs) ((cs)->rep_bytes)
#define CHARSET_COLUMNS(cs) ((cs)->columns)
#define CHARSET_GRAPHIC(cs) ((cs)->graphic)
+#define CHARSET_ENCODE_AS_UTF_8(cs) ((cs)->encode_as_utf_8)
#define CHARSET_TYPE(cs) ((cs)->type)
#define CHARSET_DIRECTION(cs) ((cs)->direction)
#define CHARSET_FINAL(cs) ((cs)->final)
@@ -284,6 +290,7 @@ DECLARE_LRECORD (charset, Lisp_Charset);
#define XCHARSET_REP_BYTES(cs) CHARSET_REP_BYTES (XCHARSET (cs))
#define XCHARSET_COLUMNS(cs) CHARSET_COLUMNS (XCHARSET (cs))
#define XCHARSET_GRAPHIC(cs) CHARSET_GRAPHIC (XCHARSET (cs))
+#define XCHARSET_ENCODE_AS_UTF_8(cs) CHARSET_ENCODE_AS_UTF_8 (XCHARSET (cs))
#define XCHARSET_TYPE(cs) CHARSET_TYPE (XCHARSET (cs))
#define XCHARSET_DIRECTION(cs) CHARSET_DIRECTION (XCHARSET (cs))
#define XCHARSET_FINAL(cs) CHARSET_FINAL (XCHARSET (cs))
@@ -547,6 +554,25 @@ breakup_ichar_1 (Ichar c, Lisp_Object *c
void get_charset_limits (Lisp_Object charset, int *low, int *high);
int ichar_to_unicode (Ichar chr);
+EXFUN (Fcharset_name, 1);
+
#endif /* MULE */
+
+/* ISO 10646 UTF-16, UCS-4, UTF-8, UTF-7, etc. */
+
+enum unicode_type
+{
+ UNICODE_UTF_16,
+ UNICODE_UTF_8,
+ UNICODE_UTF_7,
+ UNICODE_UCS_4
+};
+
+void encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h,
+ int USED_IF_MULE (l), unsigned_char_dynarr *dst,
+ enum unicode_type type, unsigned int little_endian);
+
+EXFUN (Funicode_to_char, 2);
+EXFUN (Fchar_to_unicode, 1);
#endif /* INCLUDED_charset_h_ */
1.3 +2 -4 XEmacs/xemacs/src/event-xlike-inc.c
Index: event-xlike-inc.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/event-xlike-inc.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -p -r1.2 -r1.3
--- event-xlike-inc.c 2005/06/26 18:05:04 1.2
+++ event-xlike-inc.c 2006/06/03 17:50:54 1.3
@@ -27,8 +27,6 @@ Boston, MA 02111-1307, USA. */
included here, not in event-xlike.c. However, event-xlike.c is always
X-specific, whereas the following code isn't, in the GTK case. */
-EXFUN (Funicode_to_char, 2); /* In unicode.c. */
-
static int
#ifdef THIS_IS_GTK
emacs_gtk_event_pending_p (int how_many)
@@ -503,9 +501,9 @@ gtk_keysym_to_character(guint keysym)
and only those should correspond directly to Unicode code points, in
the range #x100-#x10FFFF; actual implementations can have the Latin 1
code points do the same thing with keysyms
- #x010000A0-#x01000100. */
+ #x01000000-#x01000100. */
- if (keysym >= 0x010000A0 && keysym <= 0x0110FFFF)
+ if (keysym >= 0x01000000 && keysym <= 0x0110FFFF)
return Funicode_to_char (make_int(keysym & 0xffffff), Qnil);
if ((keysym & 0xff) < 0xa0)
1.17 +2 -0 XEmacs/xemacs/src/general-slots.h
Index: general-slots.h
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/general-slots.h,v
retrieving revision 1.16
retrieving revision 1.17
diff -u -p -r1.16 -r1.17
--- general-slots.h 2005/07/03 21:48:01 1.16
+++ general-slots.h 2006/06/03 17:50:54 1.17
@@ -73,6 +73,7 @@ SYMBOL_KEYWORD (Q_callback);
SYMBOL_KEYWORD (Q_callback_ex);
SYMBOL (Qcancel);
SYMBOL (Qcategory);
+SYMBOL (Qccl_program);
SYMBOL (Qcenter);
SYMBOL (Qchain);
SYMBOL (Qchange);
@@ -115,6 +116,7 @@ SYMBOL (Qduplex);
SYMBOL (Qdynarr_overhead);
SYMBOL (Qemergency);
SYMBOL (Qempty);
+SYMBOL (Qencode_as_utf_8);
SYMBOL (Qeq);
SYMBOL (Qeql);
SYMBOL (Qequal);
1.78 +1 -2 XEmacs/xemacs/src/lread.c
Index: lread.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/lread.c,v
retrieving revision 1.77
retrieving revision 1.78
diff -u -p -r1.77 -r1.78
--- lread.c 2006/04/29 14:36:57 1.77
+++ lread.c 2006/06/03 17:50:54 1.78
@@ -34,6 +34,7 @@ Boston, MA 02111-1307, USA. */
#include "lstream.h"
#include "opaque.h"
#include "profile.h"
+#include "charset.h" /* For Funicode_to_char. */
#include "sysfile.h"
#include "sysfloat.h"
@@ -207,8 +208,6 @@ static int saved_doc_string_position;
static int locate_file_open_or_access_file (Ibyte *fn, int access_mode);
EXFUN (Fread_from_string, 3);
-
-EXFUN (Funicode_to_char, 2); /* In unicode.c. */
/* When errors are signaled, the actual readcharfun should not be used
as an argument if it is an lstream, so that lstreams don't escape
1.29 +92 -18 XEmacs/xemacs/src/mule-ccl.c
Index: mule-ccl.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/mule-ccl.c,v
retrieving revision 1.28
retrieving revision 1.29
diff -u -p -r1.28 -r1.29
--- mule-ccl.c 2005/06/26 19:05:07 1.28
+++ mule-ccl.c 2006/06/03 17:50:54 1.29
@@ -461,7 +461,17 @@ Lisp_Object Vccl_program_table;
1:ExtendedCOMMNDRrrRRRrrrXXXXX
2:ARGUMENT(Translation Table ID)
*/
+/* Translate a character whose code point is reg[RRR] and charset ID is
+ reg[rrr], into its Unicode code point, which will be written into
+ reg[rrr]. */
+#define CCL_MuleToUnicode 0x04
+
+/* Translate a Unicode code point, in reg[rrr], into a Mule character,
+ writing the charset ID into reg[RRR] and the code point into reg[rrr]. */
+
+#define CCL_UnicodeToMule 0x05
+
/* Iterate looking up MAPs for reg[rrr] starting from the Nth (N =
reg[RRR]) MAP until some value is found.
@@ -577,7 +587,6 @@ Lisp_Object Vccl_program_table;
...
N:SEPARATOR_z (< 0)
*/
-
#define MAX_MAP_SET_LEVEL 30
typedef struct
@@ -837,26 +846,41 @@ static int stack_idx_of_map_multiple;
CODE to that invalid byte. */
/* On XEmacs, TranslateCharacter is not supported. Thus, this
- macro is not used. */
-#if 0
+ macro is only used in the MuleToUnicode transformation. */
#define CCL_MAKE_CHAR(charset, code, c) \
do { \
- if ((charset) == CHARSET_ASCII) \
- (c) = (code) & 0xFF; \
- else if (CHARSET_DEFINED_P (charset) \
- && ((code) & 0x7F) >= 32 \
- && ((code) < 256 || ((code >> 7) & 0x7F) >= 32)) \
+ if ((charset) == LEADING_BYTE_ASCII) \
{ \
- int c1 = (code) & 0x7F, c2 = 0; \
+ c = (code) & 0xFF; \
+ } \
+ else if ((charset) == LEADING_BYTE_CONTROL_1) \
+ { \
+ c = ((code) & 0xFF) - 0xA0; \
+ } \
+ else if (!NILP(charset_by_leading_byte(charset)) \
+ && ((code) >= 32) \
+ && ((code) < 256 || ((code >> 8) & 0x7F) >= 32)) \
+ { \
+ int c1, c2 = 0; \
\
- if ((code) >= 256) \
- c2 = c1, c1 = ((code) >> 7) & 0x7F; \
- (c) = make_ichar (charset, c1, c2); \
+ if ((code) < 256) \
+ { \
+ c1 = (code) & 0x7F; \
+ c2 = 0; \
+ } \
+ else \
+ { \
+ c1 = ((code) >> 8) & 0x7F; \
+ c2 = (code) & 0x7F; \
+ } \
+ c = make_ichar (charset_by_leading_byte(charset), \
+ c1, c2); \
} \
else \
- (c) = (code) & 0xFF; \
- } while (0)
-#endif
+ { \
+ c = (code) & 0xFF; \
+ } \
+ } while (0)
/* Execute CCL code on SRC_BYTES length text at SOURCE. The resulting
@@ -1392,9 +1416,9 @@ ccl_driver (struct ccl_program *ccl,
case CCL_TranslateCharacter:
#if 0
- /* XEmacs does not have translate_char, and its
- equivalent nor. We do nothing on this operation. */
- CCL_MAKE_CHAR (reg[RRR], reg[rrr], i);
+ /* XEmacs does not have translate_char, nor an
+ equivalent. We do nothing on this operation. */
+ CCL_MAKE_CHAR(reg[RRR], reg[rrr], op);
op = translate_char (GET_TRANSLATION_TABLE (reg[Rrr]),
i, -1, 0, 0);
SPLIT_CHAR (op, reg[RRR], i, j);
@@ -1420,6 +1444,56 @@ ccl_driver (struct ccl_program *ccl,
reg[rrr] = i;
#endif
break;
+
+ case CCL_MuleToUnicode:
+ {
+ Lisp_Object ucs;
+
+ CCL_MAKE_CHAR(reg[rrr], reg[RRR], op);
+ ucs = Fchar_to_unicode(make_char(op));
+
+ if (NILP(ucs))
+ {
+ /* Uhh, char-to-unicode doesn't return nil at the
+ moment, only ever -1. */
+ reg[rrr] = 0xFFFD; /* REPLACEMENT CHARACTER */
+ }
+ else
+ {
+ reg[rrr] = XINT(ucs);
+ if (-1 == reg[rrr])
+ {
+ reg[rrr] = 0xFFFD; /* REPLACEMENT CHARACTER */
+ }
+ }
+ break;
+ }
+
+ case CCL_UnicodeToMule:
+ {
+ Lisp_Object scratch;
+
+ scratch = Funicode_to_char(make_int(reg[rrr]), Qnil);
+
+ if (!NILP(scratch))
+ {
+ op = XCHAR(scratch);
+ BREAKUP_ICHAR (op, scratch, i, j);
+ reg[RRR] = XCHARSET_ID(scratch);
+
+ if (j != 0)
+ {
+ i = (i << 8) | j;
+ }
+
+ reg[rrr] = i;
+ }
+ else
+ {
+ reg[rrr] = reg[RRR] = 0;
+ }
+ break;
+ }
case CCL_IterateMultipleMap:
{
1.47 +47 -31 XEmacs/xemacs/src/mule-charset.c
Index: mule-charset.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/mule-charset.c,v
retrieving revision 1.46
retrieving revision 1.47
diff -u -p -r1.46 -r1.47
--- mule-charset.c 2005/10/25 11:16:26 1.46
+++ mule-charset.c 2006/06/03 17:50:54 1.47
@@ -190,7 +190,7 @@ make_charset (int id, Lisp_Object name,
int type, int columns, int graphic,
Ibyte final, int direction, Lisp_Object short_name,
Lisp_Object long_name, Lisp_Object doc,
- Lisp_Object reg, int overwrite)
+ Lisp_Object reg, int overwrite, int encode_as_utf_8)
{
Lisp_Object obj;
Lisp_Charset *cs;
@@ -240,6 +240,7 @@ make_charset (int id, Lisp_Object name,
CHARSET_FINAL (cs) = final;
CHARSET_DOC_STRING (cs) = doc;
CHARSET_REGISTRY (cs) = reg;
+ CHARSET_ENCODE_AS_UTF_8 (cs) = encode_as_utf_8 ? 1 : 0;
CHARSET_CCL_PROGRAM (cs) = Qnil;
CHARSET_REVERSE_DIRECTION_CHARSET (cs) = Qnil;
@@ -454,6 +455,12 @@ character set. Recognized properties ar
is passed the octets of the character, with the high
bit cleared and set depending upon whether the value
of the `graphic' property is 0 or 1.
+`encode-as-utf-8'
+ If non-nil, the charset will be written out using the UTF-8
+ escape syntax in ISO 2022-oriented coding systems. Used for
+ supporting characters we know are part of Unicode but not of
+ any other known character set in escape-quoted and compound
+ text.
*/
(name, doc_string, props))
{
@@ -465,6 +472,7 @@ character set. Recognized properties ar
Lisp_Object charset = Qnil;
Lisp_Object ccl_program = Qnil;
Lisp_Object short_name = Qnil, long_name = Qnil;
+ int encode_as_utf_8 = 0;
Lisp_Object existing_charset;
int temporary = UNBOUNDP (name);
@@ -546,6 +554,11 @@ character set. Recognized properties ar
invalid_constant ("Invalid value for `direction'", value);
}
+ else if (EQ (keyword, Qencode_as_utf_8))
+ {
+ encode_as_utf_8 = NILP (value) ? 0 : 1;
+ }
+
else if (EQ (keyword, Qfinal))
{
CHECK_CHAR_COERCE_INT (value);
@@ -553,7 +566,6 @@ character set. Recognized properties ar
if (final < '0' || final > '~')
invalid_constant ("Invalid value for `final'", value);
}
-
else if (EQ (keyword, Qccl_program))
{
struct ccl_program test_ccl;
@@ -612,7 +624,8 @@ character set. Recognized properties ar
charset = make_charset (id, name, dimension + 2, type, columns, graphic,
final, direction, short_name, long_name,
- doc_string, registry, !NILP (existing_charset));
+ doc_string, registry, !NILP (existing_charset),
+ encode_as_utf_8);
XCHARSET (charset)->temporary = temporary;
if (!NILP (ccl_program))
@@ -641,7 +654,7 @@ NEW-NAME is the name of the new charset.
(charset, new_name))
{
Lisp_Object new_charset = Qnil;
- int id, dimension, columns, graphic;
+ int id, dimension, columns, graphic, encode_as_utf_8;
Ibyte final;
int direction, type;
Lisp_Object registry, doc_string, short_name, long_name;
@@ -672,10 +685,11 @@ NEW-NAME is the name of the new charset.
short_name = CHARSET_SHORT_NAME (cs);
long_name = CHARSET_LONG_NAME (cs);
registry = CHARSET_REGISTRY (cs);
+ encode_as_utf_8 = CHARSET_ENCODE_AS_UTF_8 (cs);
new_charset = make_charset (id, new_name, dimension + 2, type, columns,
graphic, final, direction, short_name, long_name,
- doc_string, registry, 0);
+ doc_string, registry, 0, encode_as_utf_8);
CHARSET_REVERSE_DIRECTION_CHARSET (cs) = new_charset;
XCHARSET_REVERSE_DIRECTION_CHARSET (new_charset) = charset;
@@ -807,6 +821,8 @@ Recognized properties are those listed i
if (EQ (prop, Qfinal)) return make_char (CHARSET_FINAL (cs));
if (EQ (prop, Qchars)) return make_int (CHARSET_CHARS (cs));
if (EQ (prop, Qregistry)) return CHARSET_REGISTRY (cs);
+ if (EQ (prop, Qencode_as_utf_8))
+ return CHARSET_ENCODE_AS_UTF_8 (cs) ? Qt : Qnil;
if (EQ (prop, Qccl_program)) return CHARSET_CCL_PROGRAM (cs);
if (EQ (prop, Qdirection))
return CHARSET_DIRECTION (cs) == CHARSET_LEFT_TO_RIGHT ? Ql2r : Qr2l;
@@ -1040,7 +1056,7 @@ complex_vars_of_mule_charset (void)
build_string ("ASCII"),
build_msg_string ("ASCII"),
build_msg_string ("ASCII (ISO646 IRV)"),
- build_string ("\\(iso8859-[0-9]*\\|-ascii\\)"), 0);
+ build_string ("\\(iso8859-[0-9]*\\|-ascii\\)"), 0, 0);
staticpro (&Vcharset_control_1);
Vcharset_control_1 =
make_charset (LEADING_BYTE_CONTROL_1, Qcontrol_1, 2,
@@ -1049,7 +1065,7 @@ complex_vars_of_mule_charset (void)
build_string ("C1"),
build_msg_string ("Control characters"),
build_msg_string ("Control characters 128-191"),
- build_string (""), 0);
+ build_string (""), 0, 0);
staticpro (&Vcharset_latin_iso8859_1);
Vcharset_latin_iso8859_1 =
make_charset (LEADING_BYTE_LATIN_ISO8859_1, Qlatin_iso8859_1, 2,
@@ -1058,7 +1074,7 @@ complex_vars_of_mule_charset (void)
build_string ("Latin-1"),
build_msg_string ("ISO8859-1 (Latin-1)"),
build_msg_string ("ISO8859-1 (Latin-1)"),
- build_string ("iso8859-1"), 0);
+ build_string ("iso8859-1"), 0, 0);
staticpro (&Vcharset_latin_iso8859_2);
Vcharset_latin_iso8859_2 =
make_charset (LEADING_BYTE_LATIN_ISO8859_2, Qlatin_iso8859_2, 2,
@@ -1067,7 +1083,7 @@ complex_vars_of_mule_charset (void)
build_string ("Latin-2"),
build_msg_string ("ISO8859-2 (Latin-2)"),
build_msg_string ("ISO8859-2 (Latin-2)"),
- build_string ("iso8859-2"), 0);
+ build_string ("iso8859-2"), 0, 0);
staticpro (&Vcharset_latin_iso8859_3);
Vcharset_latin_iso8859_3 =
make_charset (LEADING_BYTE_LATIN_ISO8859_3, Qlatin_iso8859_3, 2,
@@ -1076,7 +1092,7 @@ complex_vars_of_mule_charset (void)
build_string ("Latin-3"),
build_msg_string ("ISO8859-3 (Latin-3)"),
build_msg_string ("ISO8859-3 (Latin-3)"),
- build_string ("iso8859-3"), 0);
+ build_string ("iso8859-3"), 0, 0);
staticpro (&Vcharset_latin_iso8859_4);
Vcharset_latin_iso8859_4 =
make_charset (LEADING_BYTE_LATIN_ISO8859_4, Qlatin_iso8859_4, 2,
@@ -1085,7 +1101,7 @@ complex_vars_of_mule_charset (void)
build_string ("Latin-4"),
build_msg_string ("ISO8859-4 (Latin-4)"),
build_msg_string ("ISO8859-4 (Latin-4)"),
- build_string ("iso8859-4"), 0);
+ build_string ("iso8859-4"), 0, 0);
staticpro (&Vcharset_thai_tis620);
Vcharset_thai_tis620 =
make_charset (LEADING_BYTE_THAI_TIS620, Qthai_tis620, 2,
@@ -1094,7 +1110,7 @@ complex_vars_of_mule_charset (void)
build_string ("TIS620"),
build_msg_string ("TIS620 (Thai)"),
build_msg_string ("TIS620.2529 (Thai)"),
- build_string ("tis620"),0);
+ build_string ("tis620"), 0, 0);
staticpro (&Vcharset_greek_iso8859_7);
Vcharset_greek_iso8859_7 =
make_charset (LEADING_BYTE_GREEK_ISO8859_7, Qgreek_iso8859_7, 2,
@@ -1103,7 +1119,7 @@ complex_vars_of_mule_charset (void)
build_string ("ISO8859-7"),
build_msg_string ("ISO8859-7 (Greek)"),
build_msg_string ("ISO8859-7 (Greek)"),
- build_string ("iso8859-7"), 0);
+ build_string ("iso8859-7"), 0, 0);
staticpro (&Vcharset_arabic_iso8859_6);
Vcharset_arabic_iso8859_6 =
make_charset (LEADING_BYTE_ARABIC_ISO8859_6, Qarabic_iso8859_6, 2,
@@ -1112,7 +1128,7 @@ complex_vars_of_mule_charset (void)
build_string ("ISO8859-6"),
build_msg_string ("ISO8859-6 (Arabic)"),
build_msg_string ("ISO8859-6 (Arabic)"),
- build_string ("iso8859-6"), 0);
+ build_string ("iso8859-6"), 0, 0);
staticpro (&Vcharset_hebrew_iso8859_8);
Vcharset_hebrew_iso8859_8 =
make_charset (LEADING_BYTE_HEBREW_ISO8859_8, Qhebrew_iso8859_8, 2,
@@ -1121,7 +1137,7 @@ complex_vars_of_mule_charset (void)
build_string ("ISO8859-8"),
build_msg_string ("ISO8859-8 (Hebrew)"),
build_msg_string ("ISO8859-8 (Hebrew)"),
- build_string ("iso8859-8"), 0);
+ build_string ("iso8859-8"), 0, 0);
staticpro (&Vcharset_katakana_jisx0201);
Vcharset_katakana_jisx0201 =
make_charset (LEADING_BYTE_KATAKANA_JISX0201, Qkatakana_jisx0201, 2,
@@ -1130,7 +1146,7 @@ complex_vars_of_mule_charset (void)
build_string ("JISX0201 Kana"),
build_msg_string ("JISX0201.1976 (Japanese Kana)"),
build_msg_string ("JISX0201.1976 Japanese Kana"),
- build_string ("jisx0201.1976"), 0);
+ build_string ("jisx0201.1976"), 0, 0);
staticpro (&Vcharset_latin_jisx0201);
Vcharset_latin_jisx0201 =
make_charset (LEADING_BYTE_LATIN_JISX0201, Qlatin_jisx0201, 2,
@@ -1139,7 +1155,7 @@ complex_vars_of_mule_charset (void)
build_string ("JISX0201 Roman"),
build_msg_string ("JISX0201.1976 (Japanese Roman)"),
build_msg_string ("JISX0201.1976 Japanese Roman"),
- build_string ("jisx0201.1976"), 0);
+ build_string ("jisx0201.1976"), 0, 0);
staticpro (&Vcharset_cyrillic_iso8859_5);
Vcharset_cyrillic_iso8859_5 =
make_charset (LEADING_BYTE_CYRILLIC_ISO8859_5, Qcyrillic_iso8859_5, 2,
@@ -1148,7 +1164,7 @@ complex_vars_of_mule_charset (void)
build_string ("ISO8859-5"),
build_msg_string ("ISO8859-5 (Cyrillic)"),
build_msg_string ("ISO8859-5 (Cyrillic)"),
- build_string ("iso8859-5"), 0);
+ build_string ("iso8859-5"), 0, 0);
staticpro (&Vcharset_latin_iso8859_9);
Vcharset_latin_iso8859_9 =
make_charset (LEADING_BYTE_LATIN_ISO8859_9, Qlatin_iso8859_9, 2,
@@ -1157,7 +1173,7 @@ complex_vars_of_mule_charset (void)
build_string ("Latin-5"),
build_msg_string ("ISO8859-9 (Latin-5)"),
build_msg_string ("ISO8859-9 (Latin-5)"),
- build_string ("iso8859-9"), 0);
+ build_string ("iso8859-9"), 0, 0);
staticpro (&Vcharset_latin_iso8859_15);
Vcharset_latin_iso8859_15 =
make_charset (LEADING_BYTE_LATIN_ISO8859_15, Qlatin_iso8859_15, 2,
@@ -1166,7 +1182,7 @@ complex_vars_of_mule_charset (void)
build_string ("Latin-9"),
build_msg_string ("ISO8859-15 (Latin-9)"),
build_msg_string ("ISO8859-15 (Latin-9)"),
- build_string ("iso8859-15"), 0);
+ build_string ("iso8859-15"), 0, 0);
staticpro (&Vcharset_japanese_jisx0208_1978);
Vcharset_japanese_jisx0208_1978 =
make_charset (LEADING_BYTE_JAPANESE_JISX0208_1978, Qjapanese_jisx0208_1978, 3,
@@ -1176,7 +1192,7 @@ complex_vars_of_mule_charset (void)
build_msg_string ("JISX0208.1978 (Japanese)"),
build_msg_string
("JISX0208.1978 Japanese Kanji (so called \"old JIS\")"),
- build_string ("\\(jisx0208\\|jisc6226\\)\\.1978"), 0);
+ build_string ("\\(jisx0208\\|jisc6226\\)\\.1978"), 0, 0);
staticpro (&Vcharset_chinese_gb2312);
Vcharset_chinese_gb2312 =
make_charset (LEADING_BYTE_CHINESE_GB2312, Qchinese_gb2312, 3,
@@ -1185,7 +1201,7 @@ complex_vars_of_mule_charset (void)
build_string ("GB2312"),
build_msg_string ("GB2312)"),
build_msg_string ("GB2312 Chinese simplified"),
- build_string ("gb2312"), 0);
+ build_string ("gb2312"), 0, 0);
staticpro (&Vcharset_japanese_jisx0208);
Vcharset_japanese_jisx0208 =
make_charset (LEADING_BYTE_JAPANESE_JISX0208, Qjapanese_jisx0208, 3,
@@ -1194,7 +1210,7 @@ complex_vars_of_mule_charset (void)
build_string ("JISX0208"),
build_msg_string ("JISX0208.1983/1990 (Japanese)"),
build_msg_string ("JISX0208.1983/1990 Japanese Kanji"),
- build_string ("jisx0208.19\\(83\\|90\\)"), 0);
+ build_string ("jisx0208.19\\(83\\|90\\)"), 0, 0);
staticpro (&Vcharset_korean_ksc5601);
Vcharset_korean_ksc5601 =
make_charset (LEADING_BYTE_KOREAN_KSC5601, Qkorean_ksc5601, 3,
@@ -1203,7 +1219,7 @@ complex_vars_of_mule_charset (void)
build_string ("KSC5601"),
build_msg_string ("KSC5601 (Korean"),
build_msg_string ("KSC5601 Korean Hangul and Hanja"),
- build_string ("ksc5601"), 0);
+ build_string ("ksc5601"), 0, 0);
staticpro (&Vcharset_japanese_jisx0212);
Vcharset_japanese_jisx0212 =
make_charset (LEADING_BYTE_JAPANESE_JISX0212, Qjapanese_jisx0212, 3,
@@ -1212,7 +1228,7 @@ complex_vars_of_mule_charset (void)
build_string ("JISX0212"),
build_msg_string ("JISX0212 (Japanese)"),
build_msg_string ("JISX0212 Japanese Supplement"),
- build_string ("jisx0212"), 0);
+ build_string ("jisx0212"), 0, 0);
#define CHINESE_CNS_PLANE_RE(n) "cns11643[.-]\\(.*[.-]\\)?" n "$"
staticpro (&Vcharset_chinese_cns11643_1);
@@ -1224,7 +1240,7 @@ complex_vars_of_mule_charset (void)
build_msg_string ("CNS11643-1 (Chinese traditional)"),
build_msg_string
("CNS 11643 Plane 1 Chinese traditional"),
- build_string (CHINESE_CNS_PLANE_RE("1")), 0);
+ build_string (CHINESE_CNS_PLANE_RE("1")), 0, 0);
staticpro (&Vcharset_chinese_cns11643_2);
Vcharset_chinese_cns11643_2 =
make_charset (LEADING_BYTE_CHINESE_CNS11643_2, Qchinese_cns11643_2, 3,
@@ -1234,7 +1250,7 @@ complex_vars_of_mule_charset (void)
build_msg_string ("CNS11643-2 (Chinese traditional)"),
build_msg_string
("CNS 11643 Plane 2 Chinese traditional"),
- build_string (CHINESE_CNS_PLANE_RE("2")), 0);
+ build_string (CHINESE_CNS_PLANE_RE("2")), 0, 0);
staticpro (&Vcharset_chinese_big5_1);
Vcharset_chinese_big5_1 =
make_charset (LEADING_BYTE_CHINESE_BIG5_1, Qchinese_big5_1, 3,
@@ -1244,7 +1260,7 @@ complex_vars_of_mule_charset (void)
build_msg_string ("Big5 (Level-1)"),
build_msg_string
("Big5 Level-1 Chinese traditional"),
- build_string ("big5"), 0);
+ build_string ("big5"), 0, 0);
staticpro (&Vcharset_chinese_big5_2);
Vcharset_chinese_big5_2 =
make_charset (LEADING_BYTE_CHINESE_BIG5_2, Qchinese_big5_2, 3,
@@ -1254,7 +1270,7 @@ complex_vars_of_mule_charset (void)
build_msg_string ("Big5 (Level-2)"),
build_msg_string
("Big5 Level-2 Chinese traditional"),
- build_string ("big5"), 0);
+ build_string ("big5"), 0, 0);
#ifdef ENABLE_COMPOSITE_CHARS
@@ -1269,7 +1285,7 @@ complex_vars_of_mule_charset (void)
build_string ("Composite"),
build_msg_string ("Composite characters"),
build_msg_string ("Composite characters"),
- build_string (""), 0);
+ build_string (""), 0, 0);
#else
/* We create a hack so that we have a way of storing ESC 0 and ESC 1
sequences as "characters", so that they will be output correctly. */
@@ -1281,6 +1297,6 @@ complex_vars_of_mule_charset (void)
build_string ("Composite hack"),
build_msg_string ("Composite characters hack"),
build_msg_string ("Composite characters hack"),
- build_string (""), 0);
+ build_string (""), 0, 0);
#endif /* ENABLE_COMPOSITE_CHARS */
}
1.38 +213 -11 XEmacs/xemacs/src/mule-coding.c
Index: mule-coding.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/mule-coding.c,v
retrieving revision 1.37
retrieving revision 1.38
diff -u -p -r1.37 -r1.38
--- mule-coding.c 2006/05/11 08:58:01 1.37
+++ mule-coding.c 2006/06/03 17:50:55 1.38
@@ -96,6 +96,42 @@ byte_shift_jis_katakana_p (int c)
return c >= 0xA1 && c <= 0xDF;
}
+inline static void
+dynarr_add_2022_one_dimension (Lisp_Object charset, Ibyte c,
+ unsigned char charmask,
+ unsigned_char_dynarr *dst)
+{
+ if (XCHARSET_ENCODE_AS_UTF_8 (charset))
+ {
+ encode_unicode_char (charset, c & charmask, 0,
+ dst, UNICODE_UTF_8, 0);
+ }
+ else
+ {
+ Dynarr_add (dst, c & charmask);
+ }
+}
+
+inline static void
+dynarr_add_2022_two_dimensions (Lisp_Object charset, Ibyte c,
+ unsigned int ch,
+ unsigned char charmask,
+ unsigned_char_dynarr *dst)
+{
+ if (XCHARSET_ENCODE_AS_UTF_8 (charset))
+ {
+ encode_unicode_char (charset,
+ ch & charmask,
+ c & charmask, dst,
+ UNICODE_UTF_8, 0);
+ }
+ else
+ {
+ Dynarr_add (dst, ch & charmask);
+ Dynarr_add (dst, c & charmask);
+ }
+}
+
/* Convert Shift-JIS data to internal format. */
static Bytecount
@@ -671,6 +707,10 @@ enum iso_esc_flag
ISO_ESC_2_4, /* We've seen ESC $. This indicates
that we're designating a multi-byte, rather
than a single-byte, character set. */
+ ISO_ESC_2_5, /* We've seen ESC %. This indicates an escape to a
+ Unicode coding system; the only one of these
+ we're prepared to deal with is UTF-8, which has
+ the next character as G. */
ISO_ESC_2_8, /* We've seen ESC 0x28, i.e. ESC (.
This means designate a 94-character
character set into G0. */
@@ -752,11 +792,15 @@ enum iso_error
character constructed by overstriking two or more characters). */
#define ISO_STATE_COMPOSITE (1 << 5)
+/* If set, we're processing UTF-8 encoded data within ISO-2022
+ processing. */
+#define ISO_STATE_UTF_8 (1 << 6)
+
/* ISO_STATE_LOCK is the mask of flags that remain on until explicitly
turned off when in the ISO2022 encoder/decoder. Other flags are turned
off at the end of processing each character or escape sequence. */
# define ISO_STATE_LOCK \
- (ISO_STATE_COMPOSITE | ISO_STATE_R2L)
+ (ISO_STATE_COMPOSITE | ISO_STATE_R2L | ISO_STATE_UTF_8)
typedef struct charset_conversion_spec
{
@@ -922,6 +966,9 @@ struct iso2022_coding_stream
Lisp_Object current_charset;
int current_half;
int current_char_boundary;
+
+ /* Used for handling UTF-8. */
+ unsigned char counter;
};
static const struct memory_description ccs_description_1[] =
@@ -1344,6 +1391,15 @@ parse_iso2022_esc (Lisp_Object codesys,
}
case ISO_ESC:
+
+ /* The only available ISO 2022 sequence in UTF-8 mode is ESC % @, to
+ exit from it. If we see any other escape sequence, pass it through
+ in the error handler. */
+ if (*flags & ISO_STATE_UTF_8 && '%' != c)
+ {
+ return 0;
+ }
+
switch (c)
{
/**** single shift ****/
@@ -1411,6 +1467,10 @@ parse_iso2022_esc (Lisp_Object codesys,
iso->esc = ISO_ESC_2_4;
goto not_done;
+ case '%': /* Prefix to an escape to or from Unicode. */
+ iso->esc = ISO_ESC_2_5;
+ goto not_done;
+
default:
if (0x28 <= c && c <= 0x2F)
{
@@ -1432,9 +1492,31 @@ parse_iso2022_esc (Lisp_Object codesys,
/* bzzzt! */
goto error;
}
-
-
+ /* ISO-IR 196 UTF-8 support. */
+ case ISO_ESC_2_5:
+ if ('G' == c)
+ {
+ /* Activate UTF-8 mode. */
+ *flags &= ISO_STATE_LOCK;
+ *flags |= ISO_STATE_UTF_8;
+ iso->esc = ISO_ESC_NOTHING;
+ return 1;
+ }
+ else if ('@' == c)
+ {
+ /* Deactivate UTF-8 mode. */
+ *flags &= ISO_STATE_LOCK;
+ *flags &= ~(ISO_STATE_UTF_8);
+ iso->esc = ISO_ESC_NOTHING;
+ return 1;
+ }
+ else
+ {
+ /* Oops, we don't support the other UTF-? coding systems within
+ ISO 2022, only in their own context. */
+ goto error;
+ }
/**** directionality ****/
case ISO_ESC_5_11: /* ISO6429 direction control */
@@ -1822,6 +1904,87 @@ iso2022_decode (struct coding_stream *st
}
ch = 0;
}
+ else if (flags & ISO_STATE_UTF_8)
+ {
+ unsigned char counter = data->counter;
+ Ibyte work[MAX_ICHAR_LEN];
+ int len;
+ Lisp_Object chr;
+
+ if (ISO_CODE_ESC == c)
+ {
+ /* Allow the escape sequence parser to end the UTF-8 state. */
+ flags |= ISO_STATE_ESCAPE;
+ data->esc = ISO_ESC;
+ data->esc_bytes_index = 1;
+ continue;
+ }
+
+ switch (counter)
+ {
+ case 0:
+ if (c >= 0xfc)
+ {
+ ch = c & 0x01;
+ counter = 5;
+ }
+ else if (c >= 0xf8)
+ {
+ ch = c & 0x03;
+ counter = 4;
+ }
+ else if (c >= 0xf0)
+ {
+ ch = c & 0x07;
+ counter = 3;
+ }
+ else if (c >= 0xe0)
+ {
+ ch = c & 0x0f;
+ counter = 2;
+ }
+ else if (c >= 0xc0)
+ {
+ ch = c & 0x1f;
+ counter = 1;
+ }
+ else
+ /* ASCII, or the lower control characters. */
+ Dynarr_add (dst, c);
+
+ break;
+ case 1:
+ ch = (ch << 6) | (c & 0x3f);
+ chr = Funicode_to_char(make_int(ch), Qnil);
+
+ if (!NILP (chr))
+ {
+ assert(CHARP(chr));
+ len = set_itext_ichar (work, XCHAR(chr));
+ Dynarr_add_many (dst, work, len);
+ }
+ else
+ {
+ /* Shouldn't happen, this code should only be enabled in
+ XEmacsen with support for all of Unicode. */
+ Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
+ Dynarr_add (dst, 34 + 128);
+ Dynarr_add (dst, 46 + 128);
+ }
+
+ ch = 0;
+ counter = 0;
+ break;
+ default:
+ ch = (ch << 6) | (c & 0x3f);
+ counter--;
+ }
+
+ if (str->eof)
+ DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
+
+ data->counter = counter;
+ }
else if (byte_c0_p (c) || byte_c1_p (c))
{ /* Control characters */
@@ -2010,6 +2173,7 @@ iso2022_designate (Lisp_Object charset,
}
Dynarr_add (dst, ISO_CODE_ESC);
+
switch (type)
{
case CHARSET_TYPE_94:
@@ -2102,6 +2266,14 @@ iso2022_encode (struct coding_stream *st
{ /* Processing ASCII character */
ch = 0;
+ if (flags & ISO_STATE_UTF_8)
+ {
+ Dynarr_add (dst, ISO_CODE_ESC);
+ Dynarr_add (dst, '%');
+ Dynarr_add (dst, '@');
+ flags &= ~(ISO_STATE_UTF_8);
+ }
+
restore_left_to_right_direction (codesys, dst, &flags, 0);
/* Make sure G0 contains ASCII */
@@ -2145,18 +2317,43 @@ iso2022_encode (struct coding_stream *st
Dynarr_add (dst, c);
char_boundary = 1;
}
-
else if (ibyte_leading_byte_p (c) || ibyte_leading_byte_p (ch))
{ /* Processing Leading Byte */
ch = 0;
charset = charset_by_leading_byte (c);
if (leading_byte_prefix_p (c))
- ch = c;
+ {
+ ch = c;
+ }
+ else if (XCHARSET_ENCODE_AS_UTF_8 (charset))
+ {
+ assert (!EQ (charset, Vcharset_control_1)
+ && !EQ (charset, Vcharset_composite));
+
+ /* If the character set is to be encoded as UTF-8, the escape
+ is always the same. */
+ if (!(flags & ISO_STATE_UTF_8))
+ {
+ Dynarr_add (dst, ISO_CODE_ESC);
+ Dynarr_add (dst, '%');
+ Dynarr_add (dst, 'G');
+ flags |= ISO_STATE_UTF_8;
+ }
+ }
else if (!EQ (charset, Vcharset_control_1)
&& !EQ (charset, Vcharset_composite))
{
int reg;
+ /* End the UTF-8 state. */
+ if (flags & ISO_STATE_UTF_8)
+ {
+ Dynarr_add (dst, ISO_CODE_ESC);
+ Dynarr_add (dst, '%');
+ Dynarr_add (dst, '@');
+ flags &= ~(ISO_STATE_UTF_8);
+ }
+
ensure_correct_direction (XCHARSET_DIRECTION (charset),
codesys, dst, &flags, 0);
@@ -2274,12 +2471,14 @@ iso2022_encode (struct coding_stream *st
switch (XCHARSET_REP_BYTES (charset))
{
case 2:
- Dynarr_add (dst, c & charmask);
+ dynarr_add_2022_one_dimension (charset, c,
+ charmask, dst);
break;
case 3:
if (XCHARSET_PRIVATE_P (charset))
{
- Dynarr_add (dst, c & charmask);
+ dynarr_add_2022_one_dimension (charset, c,
+ charmask, dst);
ch = 0;
}
else if (ch)
@@ -2287,6 +2486,9 @@ iso2022_encode (struct coding_stream *st
#ifdef ENABLE_COMPOSITE_CHARS
if (EQ (charset, Vcharset_composite))
{
+ /* #### Hasn't been written to handle composite
+ characters yet. */
+ assert(!XCHARSET_ENCODE_AS_UTF_8 (charset));
if (in_composite)
{
/* #### Bother! We don't know how to
@@ -2310,8 +2512,8 @@ iso2022_encode (struct coding_stream *st
else
#endif /* ENABLE_COMPOSITE_CHARS */
{
- Dynarr_add (dst, ch & charmask);
- Dynarr_add (dst, c & charmask);
+ dynarr_add_2022_two_dimensions (charset, c, ch,
+ charmask, dst);
}
ch = 0;
}
@@ -2324,8 +2526,8 @@ iso2022_encode (struct coding_stream *st
case 4:
if (ch)
{
- Dynarr_add (dst, ch & charmask);
- Dynarr_add (dst, c & charmask);
+ dynarr_add_2022_two_dimensions (charset, c, ch,
+ charmask, dst);
ch = 0;
}
else
1.42 +4 -0 XEmacs/xemacs/src/redisplay-x.c
Index: redisplay-x.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/redisplay-x.c,v
retrieving revision 1.41
retrieving revision 1.42
diff -u -p -r1.41 -r1.42
--- redisplay-x.c 2005/11/26 11:46:10 1.41
+++ redisplay-x.c 2006/06/03 17:50:55 1.42
@@ -230,6 +230,10 @@ separate_textual_runs (unsigned char *te
}
#endif /* MULE */
*text_storage++ = (unsigned char) byte1;
+ /* This dimension stuff is broken if you want to use a two-dimensional
+ X11 font to display a single-dimensional character set, as is
+ appropriate for the IPA (use one of the -iso10646-1 fonts) or some
+ of the other non-standard character sets. */
if (dimension == 2)
*text_storage++ = (unsigned char) byte2;
#else /* USE_XFT */
1.33 +136 -43 XEmacs/xemacs/src/unicode.c
Index: unicode.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/unicode.c,v
retrieving revision 1.32
retrieving revision 1.33
diff -u -p -r1.32 -r1.33
--- unicode.c 2006/04/22 15:18:55 1.32
+++ unicode.c 2006/06/03 17:50:55 1.33
@@ -321,6 +321,10 @@ Lisp_Object Vdefault_unicode_precedence_
Lisp_Object Qignore_first_column;
+Lisp_Object Vcurrent_jit_charset;
+Lisp_Object Qlast_allocated_character;
+Lisp_Object Qccl_encode_to_ucs_2;
+
/************************************************************************/
/* Unicode implementation */
@@ -1001,12 +1005,72 @@ ichar_to_unicode (Ichar chr)
}
static Ichar
+get_free_codepoint(Lisp_Object charset)
+{
+ Lisp_Object name = Fcharset_name(charset);
+ Lisp_Object zeichen = Fget(name, Qlast_allocated_character, Qnil);
+ Ichar res;
+
+ /* Only allow this with the 96x96 character sets we are using for
+ temporary Unicode support. */
+ assert(2 == XCHARSET_DIMENSION(charset) && 96 == XCHARSET_CHARS(charset));
+
+ if (!NILP(zeichen))
+ {
+ int c1, c2;
+
+ BREAKUP_ICHAR(XCHAR(zeichen), charset, c1, c2);
+
+ if (127 == c1 && 127 == c2)
+ {
+ /* We've already used the highest-numbered character in this
+ set--tell our caller to create another. */
+ return -1;
+ }
+
+ if (127 == c2)
+ {
+ ++c1;
+ c2 = 0x20;
+ }
+ else
+ {
+ ++c2;
+ }
+
+ res = make_ichar(charset, c1, c2);
+ Fput(name, Qlast_allocated_character, make_char(res));
+ }
+ else
+ {
+ res = make_ichar(charset, 32, 32);
+ Fput(name, Qlast_allocated_character, make_char(res));
+ }
+ return res;
+}
+
+/* The just-in-time creation of XEmacs characters that correspond to unknown
+ Unicode code points happens when:
+
+ 1. The lookup would otherwise fail.
+
+ 2. The charsets array is nil or the default.
+
+ If there are no free code points in the just-in-time Unicode character
+ set, and the charsets array is the default unicode precedence list,
+ create a new just-in-time Unicode character set, add it at the end of the
+ unicode precedence list, create the XEmacs character in that character
+ set, and return it. */
+
+static Ichar
unicode_to_ichar (int code, Lisp_Object_dynarr *charsets)
{
int u1, u2, u3, u4;
int code_levels;
int i;
int n = Dynarr_length (charsets);
+ static int number_of_jit_charsets;
+ static Ascbyte last_jit_charset_final;
type_checking_assert (code >= 0);
/* This shortcut depends on the representation of an Ichar, see text.c.
@@ -1040,8 +1104,64 @@ unicode_to_ichar (int code, Lisp_Object_
return make_ichar (charset, retval >> 8, retval & 0xFF);
}
}
+
+ /* Only do the magic just-in-time assignment if we're using the default
+ list. */
+ if (unicode_precedence_dynarr == charsets)
+ {
+ if (NILP (Vcurrent_jit_charset) ||
+ (-1 == (i = get_free_codepoint(Vcurrent_jit_charset))))
+ {
+ Ascbyte setname[32];
+ Lisp_Object charset_descr = build_string
+ ("Mule charset for otherwise unknown Unicode code points.");
+ Lisp_Object charset_regr = build_string("iso10646-1");
+
+ struct gcpro gcpro1, gcpro2;
+
+ if ('\0' == last_jit_charset_final)
+ {
+ /* This final byte shit is, umm, not that cool. */
+ last_jit_charset_final = 0x30;
+ }
- return (Ichar) -1;
+ snprintf(setname, sizeof(setname),
+ "jit-ucs-charset-%d", number_of_jit_charsets++);
+
+ /* Aside: GCPROing here would be overkill according to the FSF's
+ philosophy. make-charset cannot currently GC, but is intended
+ to be called from Lisp, with its arguments protected by the
+ Lisp reader. We GCPRO in case it GCs in the future and no-one
+ checks all the C callers. */
+
+ GCPRO2 (charset_descr, charset_regr);
+ Vcurrent_jit_charset = Fmake_charset
+ (intern(setname), charset_descr,
+ /* Set encode-as-utf-8 to t, to have this character set written
+ using UTF-8 escapes in escape-quoted and ctext. This
+ sidesteps the fact that our internal character -> Unicode
+ mapping is not stable from one invocation to the next. */
+ nconc2 (list2(Qencode_as_utf_8, Qt),
+ nconc2 (list6(Qcolumns, make_int(1), Qchars, make_int(96),
+ Qdimension, make_int(2)),
+ list6(Qregistry, charset_regr,
+ Qfinal, make_char(last_jit_charset_final++),
+ /* This CCL program is initialised in
+ unicode.el. */
+ Qccl_program, Qccl_encode_to_ucs_2))));
+ UNGCPRO;
+
+ i = get_free_codepoint(Vcurrent_jit_charset);
+ }
+
+ if (-1 != i)
+ {
+ set_unicode_conversion((Ichar)i, code);
+ /* No need to add the charset to the end of the list; it's done
+ automatically. */
+ }
+ }
+ return (unicode_precedence_dynarr == charsets) ? (Ichar) i : (Ichar) -1; /* i is only a valid JIT result for the default list; otherwise report failure as before. */
}
/* Add charsets to precedence list.
@@ -1284,37 +1404,13 @@ When there is no international support (
present), this function simply does `int-to-char' and ignores the CHARSETS
argument.
-Note that the current XEmacs internal encoding has no mapping for many
-Unicode code points, and if you use characters that are vaguely obscure with
-XEmacs' Unicode coding systems, you will lose data.
-
-To add support for some desired code point in the short term--note that our
-intention is to move to a Unicode-compatible internal encoding soon, for
-some value of soon--if you are a distributor, add something like the
-following to `site-start.el.'
-
-(make-charset 'distro-name-private
- "Private character set for DISTRO"
- '(dimension 1
- chars 96
- columns 1
- final ?5 ;; Change this--see docs for make-charset
- long-name "Private charset for some Unicode char support."
- short-name "Distro-Private"))
-
-(set-unicode-conversion
- (make-char 'distro-name-private #x20) #x263A) ;; WHITE SMILING FACE
-
-(set-unicode-conversion
- (make-char 'distro-name-private #x21) #x3030) ;; WAVY DASH
-
-;; ...
-;;; Repeat as necessary.
-
-Redisplay will work on the sjt-xft branch, but not with server-side X11
-fonts as is the default. However, data read in will be preserved when they
-are written out again.
-
+If the CODE would not otherwise be converted to an XEmacs character, and the
+list of character sets to be consulted is nil or the default, a new XEmacs
+character will be created for it in one of the `jit-ucs-charset' Mule
+character sets, and that character will be returned. There is scope for
+tens of thousands of separate Unicode code points in every session using
+this technique, so despite XEmacs' internal encoding not being based on
+Unicode, your data won't be trashed.
*/
(code, USED_IF_MULE (charsets)))
{
@@ -1558,16 +1654,6 @@ Unicode tables or in the charset:
/* Unicode coding system */
/************************************************************************/
-/* ISO 10646 UTF-16, UCS-4, UTF-8, UTF-7, etc. */
-
-enum unicode_type
-{
- UNICODE_UTF_16,
- UNICODE_UTF_8,
- UNICODE_UTF_7,
- UNICODE_UCS_4
-};
-
struct unicode_coding_system
{
enum unicode_type type;
@@ -1728,7 +1814,9 @@ encode_unicode_char_1 (int code, unsigne
}
}
-static void
+/* Also used in mule-coding.c for UTF-8 handling in ISO 2022-oriented
+ encodings. */
+void
encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h,
int USED_IF_MULE (l), unsigned_char_dynarr *dst,
enum unicode_type type, unsigned int little_endian)
@@ -2444,6 +2532,8 @@ syms_of_unicode (void)
DEFSUBR (Fload_unicode_mapping_table);
+ DEFSYMBOL (Qccl_encode_to_ucs_2);
+ DEFSYMBOL (Qlast_allocated_character);
DEFSYMBOL (Qignore_first_column);
#endif /* MULE */
@@ -2518,6 +2608,9 @@ vars_of_unicode (void)
&lisp_object_dynarr_description);
init_blank_unicode_tables ();
+
+ staticpro (&Vcurrent_jit_charset);
+ Vcurrent_jit_charset = Qnil;
/* Note that the "block" we are describing is a single pointer, and hence
we could potentially use dump_add_root_block_ptr(). However, given
1.80 +10 -0 XEmacs/xemacs/tests/ChangeLog
Index: ChangeLog
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/tests/ChangeLog,v
retrieving revision 1.79
retrieving revision 1.80
diff -u -p -r1.79 -r1.80
--- ChangeLog 2006/05/16 08:25:02 1.79
+++ ChangeLog 2006/06/03 17:51:03 1.80
@@ -1,3 +1,13 @@
+2006-06-03 Aidan Kehoe <kehoea(a)parhasard.net>
+
+ * automated/mule-tests.el:
+ Add checks that several Unicode characters, expressed as
+ UTF-8-encoded strings, are handled correctly by the UTF-8 support
+ of the escape-quoted coding system.
+ * automated/mule-tests.el (unicode-code-point-to-utf-8-string): New.
+ Convert a Unicode code point to the equivalent UTF-8 string.
+ This is a naive implementation in Lisp.
+
2006-05-16 Stephen J. Turnbull <stephen(a)xemacs.org>
* XEmacs 21.5.27 "fiddleheads" is released.
1.9 +63 -0 XEmacs/xemacs/tests/automated/mule-tests.el
Index: mule-tests.el
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/tests/automated/mule-tests.el,v
retrieving revision 1.8
retrieving revision 1.9
diff -u -p -r1.8 -r1.9
--- mule-tests.el 2004/04/19 06:22:34 1.8
+++ mule-tests.el 2006/06/03 17:51:06 1.9
@@ -78,6 +78,38 @@ the Assert macro checks for correctness.
;; once compiled, for no good reason.
(test-chars t)
+(defun unicode-code-point-to-utf-8-string (code-point)
+ "Convert a Unicode code point to the equivalent UTF-8 string.
+This is a naive implementation in Lisp. "
+ (check-argument-type 'natnump code-point)
+ (check-argument-range code-point 0 #x1fffff)
+ (if (< code-point #x80)
+ (format "%c" code-point)
+ (if (< code-point #x800)
+ (format "%c%c"
+ ;; ochars[0] = 0xC0 | (input & ~(0xFFFFF83F)) >> 6;
+ (logior #xc0 (lsh (logand code-point #x7c0) -6))
+ ;; ochars[1] = 0x80 | input & ~(0xFFFFFFC0);
+ (logior #x80 (logand code-point #x3f)))
+ (if (< code-point #x00010000)
+ (format "%c%c%c"
+ ;; ochars[0] = 0xE0 | (input >> 12) & ~(0xFFFFFFF0);
+ (logior #xe0 (logand (lsh code-point -12) #x0f))
+ ;; ochars[1] = 0x80 | (input >> 6) & ~(0xFFFFFFC0);
+ (logior #x80 (logand (lsh code-point -6) #x3f))
+ ;; ochars[2] = 0x80 | input & ~(0xFFFFFFC0);
+ (logior #x80 (logand code-point #x3f)))
+ (if (< code-point #x200000)
+ (format "%c%c%c%c"
+ ;; ochars[0] = 0xF0 | (input >> 18) & ~(0xFFFFFFF8)
+ (logior #xF0 (logand (lsh code-point -18) #x7))
+ ;; ochars[1] = 0x80 | (input >> 12) & ~(0xFFFFFFC0);
+ (logior #x80 (logand (lsh code-point -12) #x3f))
+ ;; ochars[2] = 0x80 | (input >> 6) & ~(0xFFFFFFC0);
+ (logior #x80 (logand (lsh code-point -6) #x3f))
+ ;; ochars[3] = 0x80 | input & ~(0xFFFFFFC0);
+ (logior #x80 (logand code-point #x3f))))))))
+
;;-----------------------------------------------------------------
;; Test string modification functions that modify the length of a char.
;;-----------------------------------------------------------------
@@ -335,6 +367,37 @@ the Assert macro checks for correctness.
(Assert (eq scaron (unicode-to-char code '(latin-iso8859-2))))))
(Check-Error wrong-type-argument (set-unicode-conversion scaron -10000)))
+
+ (dolist (utf-8-char
+ '("\xc6\x92" ;; U+0192 LATIN SMALL LETTER F WITH HOOK
+ "\xe2\x81\x8a" ;; U+204A TIRONIAN SIGN ET
+ "\xe2\x82\xae" ;; U+20AE TUGRIK SIGN
+ "\xf0\x9d\x92\xbd" ;; U+1D4BD MATHEMATICAL SCRIPT SMALL H
+ "\xf0\x9d\x96\x93" ;; U+1D593 MATHEMATICAL BOLD FRAKTUR SMALL N
+ "\xf0\xaf\xa8\x88" ;; U+2FA08 CJK COMPATIBILITY FOR U+4BCE
+ "\xf4\x8f\xbf\xbd")) ;; U+10FFFD <Plane 16 Private Use, Last>
+ (let* ((xemacs-character (car (append
+ (decode-coding-string utf-8-char 'utf-8)
+ nil)))
+ (xemacs-charset (car (split-char xemacs-character))))
+
+ ;; Trivial test of the UTF-8 support of the escape-quoted coding system.
+ (Assert (equal (decode-coding-string utf-8-char 'utf-8)
+ (decode-coding-string (concat "\033%G" utf-8-char)
+ 'escape-quoted)))
+
+ ;; Check that the reverse mapping holds.
+ (Assert (equal (unicode-code-point-to-utf-8-string
+ (encode-char xemacs-character 'ucs))
+ utf-8-char))
+
+ ;; Check that, if this character has been JIT-allocated, it is encoded
+ ;; in escape-quoted using the corresponding UTF-8 escape.
+ (when (charset-property xemacs-charset 'encode-as-utf-8)
+ (Assert (equal (concat "\033%G" utf-8-char)
+ (encode-coding-string xemacs-character 'escape-quoted)))
+ (Assert (equal (concat "\033%G" utf-8-char)
+ (encode-coding-string xemacs-character 'ctext))))))
;;---------------------------------------------------------------
;; Test charset-in-* functions