User: aidan
Date: 06/07/08 01:01:15
Modified: xemacs/src ChangeLog charset.h
Log:
Adjust the Mule charsets to support 500,000 unknown Unicode characters.
Revision Changes Path
1.344 +8 -0 XEmacs/xemacs/man/ChangeLog
Index: ChangeLog
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/man/ChangeLog,v
retrieving revision 1.343
retrieving revision 1.344
diff -u -p -r1.343 -r1.344
--- ChangeLog 2006/06/03 17:50:46 1.343
+++ ChangeLog 2006/07/07 23:01:01 1.344
@@ -1,3 +1,11 @@
+2006-07-08 Aidan Kehoe <kehoea(a)parhasard.net>
+
+ * internals/internals.texi (Internal String Encoding):
+ Mention that UTF-8 would be a reasonable alternative encoding.
+ * internals/internals.texi (Internal Character Encoding):
+ Re-arrange the description of characters to deal with 21-bit
+ characters.
+
2006-06-03 Aidan Kehoe <kehoea(a)parhasard.net>
* lispref/mule.texi (CCL Syntax):
1.76 +13 -11 XEmacs/xemacs/man/internals/internals.texi
Index: internals.texi
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/man/internals/internals.texi,v
retrieving revision 1.75
retrieving revision 1.76
diff -u -p -r1.75 -r1.76
--- internals.texi 2006/04/23 16:11:28 1.75
+++ internals.texi 2006/07/07 23:01:06 1.76
@@ -11335,10 +11335,12 @@ were encoded in a one-byte-per-character
the actual multi-byte encoding.
@end enumerate
- None of the standard non-modal encodings meet all of these
+ None of the pre-Unicode standard non-modal encodings meet all of these
conditions. For example, EUC satisfies only (2) and (3), while
-Shift-JIS and Big5 (not yet described) satisfy only (2). (All
-non-modal encodings must satisfy (2), in order to be unambiguous.)
+Shift-JIS and Big5 (not yet described) satisfy only (2). (All non-modal
+encodings must satisfy (2), in order to be unambiguous.) UTF-8,
+however, meets all three, and we are considering moving to it as an
+internal encoding.
@node Internal Character Encoding, , Internal String Encoding, Internal Mule Encodings
@subsection Internal Character Encoding
@@ -11346,16 +11348,16 @@ non-modal encodings must satisfy (2), in
@cindex character encoding, internal
@cindex encoding, internal character
- One 19-bit word represents a single character. The word is
+ One 21-bit word represents a single character. The word is
separated into three fields:
@example
-Bit number: 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
- <------------> <------------------> <------------------>
-Field: 1 2 3
+Bit number: 20 19 18 17 16 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
+ <------------------> <------------------> <------------------>
+Field: 1 2 3
@end example
- Note that fields 2 and 3 hold 7 bits each, while field 1 holds 5 bits.
+ Note that each field holds 7 bits.
@example
Character set Field 1 Field 2 Field 3
@@ -11370,12 +11372,12 @@ Dimension-1 private 0
range: (20 - 6F) (20 - 7F)
Dimension-2 official LB - 0x8F PC1 PC2
range: (01 - 0A) (20 - 7F) (20 - 7F)
-Dimension-2 private LB - 0xE1 PC1 PC2
+Dimension-2 private LB - 0x80 PC1 PC2
range: (0F - 1E) (20 - 7F) (20 - 7F)
Composite 0x1F ? ?
@end example
-Note that character codes 0 - 255 are the same as the ``binary
+Note also that character codes 0 - 255 are the same as the ``binary
encoding'' described above.
Most of the code in XEmacs knows nothing of the representation of a
@@ -11607,7 +11609,7 @@ the same as the representation of that s
you cannot do the standard C trick of passing a pointer to a character
to a function that expects a string.
-An Ichar takes up 19 bits of representation and (for code compatibility
+An Ichar takes up 21 bits of representation and (for code compatibility
and such) is compatible with an int. This representation is visible on
the Lisp level. The important characteristics of the Ichar
representation are
1.984 +5 -0 XEmacs/xemacs/src/ChangeLog
Index: ChangeLog
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/ChangeLog,v
retrieving revision 1.983
retrieving revision 1.984
diff -u -p -r1.983 -r1.984
--- ChangeLog 2006/07/07 19:51:13 1.983
+++ ChangeLog 2006/07/07 23:01:10 1.984
@@ -1,3 +1,8 @@
+2006-07-08 Aidan Kehoe <kehoea(a)parhasard.net>
+
+ * charset.h:
+ Move to 7 bits instead of 5 for the first field of a character.
+
2006-07-07 Aidan Kehoe <kehoea(a)parhasard.net>
* config.h.in:
1.14 +6 -6 XEmacs/xemacs/src/charset.h
Index: charset.h
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/charset.h,v
retrieving revision 1.13
retrieving revision 1.14
diff -u -p -r1.13 -r1.14
--- charset.h 2006/06/03 17:50:54 1.13
+++ charset.h 2006/07/07 23:01:11 1.14
@@ -140,8 +140,8 @@ enum LEADING_BYTE_OFFICIAL_2
#define PRE_LEADING_BYTE_PRIVATE_2 0x9F /* 2-byte char-set */
#define MIN_LEADING_BYTE_PRIVATE_1 0xA0
-#define MAX_LEADING_BYTE_PRIVATE_1 0xEF
-#define MIN_LEADING_BYTE_PRIVATE_2 0xF0
+#define MAX_LEADING_BYTE_PRIVATE_1 0xC0
+#define MIN_LEADING_BYTE_PRIVATE_2 0xC1
#define MAX_LEADING_BYTE_PRIVATE_2 0xFF
#define NUM_LEADING_BYTES 129
@@ -354,9 +354,9 @@ charset_by_attributes (int type, int fin
/************************************************************************/
/* The bit fields of character are divided into 3 parts:
- FIELD1(5bits):FIELD2(7bits):FIELD3(7bits) */
+ FIELD1(7bits):FIELD2(7bits):FIELD3(7bits) */
-#define ICHAR_FIELD1_MASK (0x1F << 14)
+#define ICHAR_FIELD1_MASK (0x7F << 14)
#define ICHAR_FIELD2_MASK (0x7F << 7)
#define ICHAR_FIELD3_MASK 0x7F
@@ -376,7 +376,7 @@ charset_by_attributes (int type, int fin
#define FIELD2_TO_PRIVATE_LEADING_BYTE 0x80
#define FIELD1_TO_OFFICIAL_LEADING_BYTE (MIN_LEADING_BYTE_OFFICIAL_2 - 1)
-#define FIELD1_TO_PRIVATE_LEADING_BYTE 0xE1
+#define FIELD1_TO_PRIVATE_LEADING_BYTE 0x80
/* Minimum and maximum allowed values for the fields. */
@@ -406,7 +406,7 @@ charset_by_attributes (int type, int fin
#define MIN_CHAR_PRIVATE_TYPE9N (MIN_ICHAR_FIELD2_PRIVATE << 7)
#define MIN_CHAR_OFFICIAL_TYPE9NX9N (MIN_ICHAR_FIELD1_OFFICIAL << 14)
#define MIN_CHAR_PRIVATE_TYPE9NX9N (MIN_ICHAR_FIELD1_PRIVATE << 14)
-#define MIN_CHAR_COMPOSITION (0x1F << 14)
+#define MIN_CHAR_COMPOSITION (0x7F << 14)
/* Leading byte of a character.