I’m running with this code, and--combined with the following Lisp--it makes
IRCing much more comfortable on a channel where most of the traffic is in
UTF-8 but some people occasionally send ISO 8859-1.
I’m not certain about the sanity of the UTF-16 part of this, mind. More work
to be done.
;; For every octet value i in #x00..#xFF, build a string glyph whose data is
;; (string i), give it the `red' face, and install it in the display table
;; under the character decoded from the UCS code point #x200000 + i.
;; The accompanying ChangeLog states that Unicode error sequences are decoded
;; to code points starting at 0x200000, so this presumably makes such error
;; octets display in red -- confirm against the C patch below.
(loop for i from #x00 to #xFF
with glyph = nil
;; The instantiated value of the `current-display-table' specifier.
with char-table = (specifier-instance current-display-table)
do (setq glyph (make-glyph (vector 'string :data (string i))))
(set-glyph-face glyph 'red)
;; Key: the character for code point #x200000 + i; value: the red glyph.
(put-char-table (decode-char 'ucs (+ #x200000 i)) glyph char-table))
src/ChangeLog addition:
2007-07-21 Aidan Kehoe <kehoea(a)parhasard.net>
* charset.h:
* mule-coding.c:
* mule-coding.c (dynarr_add_2022_one_dimension):
* mule-coding.c (dynarr_add_2022_two_dimensions):
* mule-coding.c (struct iso2022_coding_stream):
* mule-coding.c (decode_unicode_char):
* mule-coding.c (indicate_invalid_utf_8):
* mule-coding.c (iso2022_decode):
* unicode.c:
* unicode.c (struct unicode_coding_stream):
* unicode.c (decode_unicode_char):
* unicode.c (DECODE_ERROR_OCTET):
* unicode.c (indicate_invalid_utf_8):
* unicode.c (encode_unicode_char_1):
* unicode.c (encode_unicode_char):
* unicode.c (unicode_convert):
Make UTF-8 handling more robust; indicate error sequences when
decoding, passing the octets as distinct from the corresponding
ISO8859-1 characters, and (by default) writing them to disk on
encoding. Don't accept UTF-8 sequences longer than four octets on
reading in the utf-8 coding system; do accept them in the ISO IR 196
handling, since we decode Unicode error sequences to "Unicode" code
points starting at 0x200000, and will need to save them as such
in escape-quoted.
This change means that when a non-UTF-8 file is opened as UTF-8,
one change is made, and the file is immediately saved, the non-ASCII
characters are not corrupted. In Europe, this is a distinct win.
Don't error on invalid UTF-16 sequences; pass them through, using
the same error octets.
XEmacs Trunk source patch:
Diff command: cvs -q diff -Nu
Files affected: src/unicode.c
===================================================================
RCS src/mule-coding.c
===================================================================
RCS src/charset.h
===================================================================
RCS
Index: src/charset.h
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/charset.h,v
retrieving revision 1.16
diff -u -u -r1.16 charset.h
--- src/charset.h 2006/11/12 13:40:07 1.16
+++ src/charset.h 2007/07/21 15:03:21
@@ -572,7 +572,10 @@
void encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h,
int USED_IF_MULE (l), unsigned_char_dynarr *dst,
- enum unicode_type type, unsigned int little_endian);
+ enum unicode_type type, unsigned int little_endian,
+ int write_error_characters_as_such);
+
+#define UNICODE_ERROR_OCTET_RANGE_START 0x200000
void set_charset_registries(Lisp_Object charset, Lisp_Object registries);
Index: src/mule-coding.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/mule-coding.c,v
retrieving revision 1.39
diff -u -u -r1.39 mule-coding.c
--- src/mule-coding.c 2006/11/23 13:43:19 1.39
+++ src/mule-coding.c 2007/07/21 15:04:59
@@ -104,7 +104,7 @@
if (XCHARSET_ENCODE_AS_UTF_8 (charset))
{
encode_unicode_char (charset, c & charmask, 0,
- dst, UNICODE_UTF_8, 0);
+ dst, UNICODE_UTF_8, 0, 0);
}
else
{
@@ -123,7 +123,7 @@
encode_unicode_char (charset,
ch & charmask,
c & charmask, dst,
- UNICODE_UTF_8, 0);
+ UNICODE_UTF_8, 0, 0);
}
else
{
@@ -969,6 +969,7 @@
/* Used for handling UTF-8. */
unsigned char counter;
+ unsigned char indicated_length;
};
static const struct memory_description ccs_description_1[] =
@@ -1804,6 +1805,39 @@
}
}
+/* Note that this name conflicts with a function in unicode.c. */
+static void
+decode_unicode_char (int ucs, unsigned_char_dynarr *dst)
+{
+ Ibyte work[MAX_ICHAR_LEN];
+ int len;
+ Lisp_Object chr;
+
+ chr = Funicode_to_char(make_int(ucs), Qnil);
+ assert (!NILP(chr));
+ len = set_itext_ichar (work, XCHAR(chr));
+ Dynarr_add_many (dst, work, len);
+}
+
+#define DECODE_ERROR_OCTET(octet, dst) \
+ decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, dst)
+
+static inline void
+indicate_invalid_utf_8 (unsigned char indicated_length,
+ unsigned char counter,
+ int ch, unsigned_char_dynarr *dst)
+{
+ Binbyte stored = indicated_length - counter;
+ Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length];
+
+ while (stored > 0)
+ {
+ DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask,
+ dst);
+ mask = 0x80, stored--;
+ }
+}
+
/* Convert ISO2022-format data to internal format. */
static Bytecount
@@ -1907,9 +1941,7 @@
else if (flags & ISO_STATE_UTF_8)
{
unsigned char counter = data->counter;
- Ibyte work[MAX_ICHAR_LEN];
- int len;
- Lisp_Object chr;
+ unsigned char indicated_length = data->indicated_length;
if (ISO_CODE_ESC == c)
{
@@ -1919,74 +1951,123 @@
data->esc_bytes_index = 1;
continue;
}
-
- switch (counter)
- {
- case 0:
- if (c >= 0xfc)
- {
- ch = c & 0x01;
- counter = 5;
- }
- else if (c >= 0xf8)
- {
- ch = c & 0x03;
- counter = 4;
- }
- else if (c >= 0xf0)
- {
- ch = c & 0x07;
- counter = 3;
- }
- else if (c >= 0xe0)
- {
- ch = c & 0x0f;
- counter = 2;
- }
- else if (c >= 0xc0)
- {
- ch = c & 0x1f;
- counter = 1;
- }
- else
- /* ASCII, or the lower control characters.
-
- Perhaps we should signal an error if the character is in
- the range 0x80-0xc0; this is illegal UTF-8. */
- Dynarr_add (dst, (c & 0x7f));
-
- break;
- case 1:
- ch = (ch << 6) | (c & 0x3f);
- chr = Funicode_to_char(make_int(ch), Qnil);
-
- if (!NILP (chr))
- {
- assert(CHARP(chr));
- len = set_itext_ichar (work, XCHAR(chr));
- Dynarr_add_many (dst, work, len);
- }
- else
- {
- /* Shouldn't happen, this code should only be enabled in
- XEmacsen with support for all of Unicode. */
- Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);
- Dynarr_add (dst, 34 + 128);
- Dynarr_add (dst, 46 + 128);
- }
-
- ch = 0;
- counter = 0;
- break;
- default:
- ch = (ch << 6) | (c & 0x3f);
- counter--;
- }
- if (str->eof)
- DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
+ if (0 == counter)
+ {
+ if (0 == (c & 0x80))
+ {
+ /* ASCII. */
+ decode_unicode_char (c, dst);
+ }
+ else if (0 == (c & 0x40))
+ {
+ /* Highest bit set, second highest not--there's
+ something wrong. */
+ DECODE_ERROR_OCTET (c, dst);
+ }
+ else if (0 == (c & 0x20))
+ {
+ ch = c & 0x1f;
+ counter = 1;
+ indicated_length = 2;
+ }
+ else if (0 == (c & 0x10))
+ {
+ ch = c & 0x0f;
+ counter = 2;
+ indicated_length = 3;
+ }
+ else if (0 == (c & 0x08))
+ {
+ ch = c & 0x0f;
+ counter = 3;
+ indicated_length = 4;
+ }
+ /* We support lengths longer than 4 here, since we want to
+ represent UTF-8 error chars as distinct from the
+ corresponding ISO 8859-1 characters in escape-quoted.
+
+ However, we can't differentiate UTF-8 error chars as
+ written to disk, and UTF-8 errors in escape-quoted. This
+ is not a big problem;
+ non-Unicode-chars-encoded-as-UTF-8-in-ISO-2022 is not
+ deployed, in practice, so if such a sequence of octets
+ occurs, XEmacs generated it. */
+ else if (0 == (c & 0x04))
+ {
+ ch = c & 0x03;
+ counter = 4;
+ indicated_length = 5;
+ }
+ else if (0 == (c & 0x02))
+ {
+ ch = c & 0x01;
+ counter = 5;
+ indicated_length = 6;
+ }
+ else
+ {
+ /* #xFF is not a valid leading byte in any form of
+ UTF-8. */
+ DECODE_ERROR_OCTET (c, dst);
+
+ }
+ }
+ else
+ {
+ /* counter != 0 */
+ if ((0 == (c & 0x80)) || (0 != (c & 0x40)))
+ {
+ indicate_invalid_utf_8(indicated_length,
+ counter,
+ ch, dst);
+ if (c & 0x80)
+ {
+ DECODE_ERROR_OCTET (c, dst);
+ }
+ else
+ {
+ /* The character just read is ASCII. Treat it as
+ such. */
+ decode_unicode_char (c, dst);
+ }
+ ch = 0;
+ counter = 0;
+ }
+ else
+ {
+ ch = (ch << 6) | (c & 0x3f);
+ counter--;
+ /* Just processed the final byte. Emit the character,
+ avoiding over-long sequences. */
+ if (!counter)
+ {
+ if ((ch < 0x80) ||
+ ((ch < 0x800) && indicated_length > 2) ||
+ ((ch < 0x1000) && indicated_length > 3) ||
+ ((ch < 0x10000) && indicated_length > 4))
+ {
+ indicate_invalid_utf_8(indicated_length,
+ counter,
+ ch, dst);
+ }
+ else
+ {
+ decode_unicode_char (ch, dst);
+ }
+ ch = 0;
+ }
+ }
+ }
+
+ if (str->eof && ch)
+ {
+ DECODE_ERROR_OCTET (ch, dst);
+ ch = 0;
+ }
data->counter = counter;
+ data->indicated_length = indicated_length;
}
else if (byte_c0_p (c) || byte_c1_p (c))
{ /* Control characters */
Index: src/unicode.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/unicode.c,v
retrieving revision 1.37
diff -u -u -r1.37 unicode.c
--- src/unicode.c 2007/05/13 11:11:30 1.37
+++ src/unicode.c 2007/07/21 15:05:45
@@ -146,13 +146,6 @@
(1) User-defined charsets: It would be inconvenient to require all
dumped user-defined charsets to be reloaded at init time.
- (2) Starting up in a non-ISO-8859-1 directory. If we load at run-time,
- we don't load the tables until after we've parsed the current
- directories, and we run into a real bootstrapping problem, if the
- directories themselves are non-ISO-8859-1. This is potentially fixable
- once we switch to using Unicode internally, so we don't have to do any
- conversion (other than the automatic kind, e.g. UTF-16 to UTF-8).
-
NB With run-time loading, we load in init-mule-at-startup, in
mule-cmds.el. This is called from startup.el, which is quite late in
the initialization process -- but data-directory isn't set until then.
@@ -1703,6 +1696,7 @@
{
/* decode */
unsigned char counter;
+ unsigned char indicated_length;
int seen_char;
/* encode */
Lisp_Object current_charset;
@@ -1716,11 +1710,6 @@
DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (unicode);
-/* Decode a UCS-2 or UCS-4 character into a buffer. If the lookup fails, use
- <GETA MARK> (U+3013) of JIS X 0208, which means correct character
- is not found, instead.
- #### do something more appropriate (use blob?)
- Danger, Will Robinson! Data loss. Should we signal user? */
static void
decode_unicode_char (int ch, unsigned_char_dynarr *dst,
struct unicode_coding_stream *data,
@@ -1755,9 +1744,32 @@
data->seen_char = 1;
}
+#define DECODE_ERROR_OCTET(octet, dst, data, ignore_bom) \
+ decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, \
+ dst, data, ignore_bom)
+
+static inline void
+indicate_invalid_utf_8 (unsigned char indicated_length,
+ unsigned char counter,
+ int ch, unsigned_char_dynarr *dst,
+ struct unicode_coding_stream *data,
+ unsigned int ignore_bom)
+{
+ Binbyte stored = indicated_length - counter;
+ Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length];
+
+ while (stored > 0)
+ {
+ DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask,
+ dst, data, ignore_bom);
+ mask = 0x80, stored--;
+ }
+}
+
static void
encode_unicode_char_1 (int code, unsigned_char_dynarr *dst,
- enum unicode_type type, unsigned int little_endian)
+ enum unicode_type type, unsigned int little_endian,
+ int write_error_characters_as_such)
{
switch (type)
{
@@ -1768,16 +1780,25 @@
Dynarr_add (dst, (unsigned char) (code & 255));
Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
} else {
- /* Little endian; least significant byte first. */
- int first, second;
-
- CODE_TO_UTF_16_SURROGATES(code, first, second);
-
- Dynarr_add (dst, (unsigned char) (first & 255));
- Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
-
- Dynarr_add (dst, (unsigned char) (second & 255));
- Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
+ if (write_error_characters_as_such &&
+ code >= UNICODE_ERROR_OCTET_RANGE_START &&
+ code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+ {
+ Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+ }
+ else
+ {
+ /* Little endian; least significant byte first. */
+ int first, second;
+
+ CODE_TO_UTF_16_SURROGATES(code, first, second);
+
+ Dynarr_add (dst, (unsigned char) (first & 255));
+ Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
+
+ Dynarr_add (dst, (unsigned char) (second & 255));
+ Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
+ }
}
}
else
@@ -1786,16 +1807,25 @@
Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
Dynarr_add (dst, (unsigned char) (code & 255));
} else {
- /* Big endian; most significant byte first. */
- int first, second;
-
- CODE_TO_UTF_16_SURROGATES(code, first, second);
-
- Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
- Dynarr_add (dst, (unsigned char) (first & 255));
-
- Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
- Dynarr_add (dst, (unsigned char) (second & 255));
+ if (write_error_characters_as_such &&
+ code >= UNICODE_ERROR_OCTET_RANGE_START &&
+ code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+ {
+ Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+ }
+ else
+ {
+ /* Big endian; most significant byte first. */
+ int first, second;
+
+ CODE_TO_UTF_16_SURROGATES(code, first, second);
+
+ Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
+ Dynarr_add (dst, (unsigned char) (first & 255));
+
+ Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
+ Dynarr_add (dst, (unsigned char) (second & 255));
+ }
}
}
break;
@@ -1803,17 +1833,35 @@
case UNICODE_UCS_4:
if (little_endian)
{
- Dynarr_add (dst, (unsigned char) (code & 255));
- Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
- Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
- Dynarr_add (dst, (unsigned char) (code >> 24));
+ if (write_error_characters_as_such &&
+ code >= UNICODE_ERROR_OCTET_RANGE_START &&
+ code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+ {
+ Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+ }
+ else
+ {
+ Dynarr_add (dst, (unsigned char) (code & 255));
+ Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+ Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
+ Dynarr_add (dst, (unsigned char) (code >> 24));
+ }
}
else
{
- Dynarr_add (dst, (unsigned char) (code >> 24));
- Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
- Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
- Dynarr_add (dst, (unsigned char) (code & 255));
+ if (write_error_characters_as_such &&
+ code >= UNICODE_ERROR_OCTET_RANGE_START &&
+ code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+ {
+ Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+ }
+ else
+ {
+ Dynarr_add (dst, (unsigned char) (code >> 24));
+ Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
+ Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+ Dynarr_add (dst, (unsigned char) (code & 255));
+ }
}
break;
@@ -1842,11 +1890,25 @@
}
else if (code <= 0x3ffffff)
{
- Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8));
- Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80));
- Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80));
- Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) | 0x80));
- Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80));
+
+#if !(UNICODE_ERROR_OCTET_RANGE_START > 0x1fffff \
+ && UNICODE_ERROR_OCTET_RANGE_START < 0x3ffffff)
+#error "This code needs to be rewritten. "
+#endif
+ if (write_error_characters_as_such &&
+ code >= UNICODE_ERROR_OCTET_RANGE_START &&
+ code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+ {
+ Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+ }
+ else
+ {
+ Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8));
+ Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) |
0x80));
+ Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) |
0x80));
+ Dynarr_add (dst, (unsigned char) (((code >> 6) & 0x3f) |
0x80));
+ Dynarr_add (dst, (unsigned char) ((code & 0x3f) | 0x80));
+ }
}
else
{
@@ -1870,7 +1932,8 @@
void
encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h,
int USED_IF_MULE (l), unsigned_char_dynarr *dst,
- enum unicode_type type, unsigned int little_endian)
+ enum unicode_type type, unsigned int little_endian,
+ int write_error_characters_as_such)
{
#ifdef MULE
int code = ichar_to_unicode (make_ichar (charset, h & 127, l & 127));
@@ -1896,7 +1959,8 @@
int code = h;
#endif /* MULE */
- encode_unicode_char_1 (code, dst, type, little_endian);
+ encode_unicode_char_1 (code, dst, type, little_endian,
+ write_error_characters_as_such);
}
static Bytecount
@@ -1915,6 +1979,8 @@
if (str->direction == CODING_DECODE)
{
unsigned char counter = data->counter;
+ unsigned char indicated_length
+ = data->indicated_length;
while (n--)
{
@@ -1923,46 +1989,91 @@
switch (type)
{
case UNICODE_UTF_8:
- switch (counter)
- {
- case 0:
- if (c >= 0xfc)
- {
- ch = c & 0x01;
- counter = 5;
- }
- else if (c >= 0xf8)
- {
- ch = c & 0x03;
- counter = 4;
- }
- else if (c >= 0xf0)
- {
- ch = c & 0x07;
- counter = 3;
- }
- else if (c >= 0xe0)
- {
- ch = c & 0x0f;
- counter = 2;
- }
- else if (c >= 0xc0)
- {
- ch = c & 0x1f;
- counter = 1;
- }
- else
- decode_unicode_char (c, dst, data, ignore_bom);
- break;
- case 1:
- ch = (ch << 6) | (c & 0x3f);
- decode_unicode_char (ch, dst, data, ignore_bom);
- ch = 0;
- counter = 0;
- break;
- default:
- ch = (ch << 6) | (c & 0x3f);
- counter--;
+ if (0 == counter)
+ {
+ if (0 == (c & 0x80))
+ {
+ /* ASCII. */
+ decode_unicode_char (c, dst, data, ignore_bom);
+ }
+ else if (0 == (c & 0x40))
+ {
+ /* Highest bit set, second highest not--there's
+ something wrong. */
+ DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
+ }
+ else if (0 == (c & 0x20))
+ {
+ ch = c & 0x1f;
+ counter = 1;
+ indicated_length = 2;
+ }
+ else if (0 == (c & 0x10))
+ {
+ ch = c & 0x0f;
+ counter = 2;
+ indicated_length = 3;
+ }
+ else if (0 == (c & 0x08))
+ {
+ ch = c & 0x0f;
+ counter = 3;
+ indicated_length = 4;
+ }
+ else
+ {
+ /* We don't supports lengths longer than 4 in
+ external-format data. */
+ DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
+
+ }
+ }
+ else
+ {
+ /* counter != 0 */
+ if ((0 == (c & 0x80)) || (0 != (c & 0x40)))
+ {
+ indicate_invalid_utf_8(indicated_length,
+ counter,
+ ch, dst, data, ignore_bom);
+ if (c & 0x80)
+ {
+ DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
+ }
+ else
+ {
+ /* The character just read is ASCII. Treat it as
+ such. */
+ decode_unicode_char (c, dst, data, ignore_bom);
+ }
+ ch = 0;
+ counter = 0;
+ }
+ else
+ {
+ ch = (ch << 6) | (c & 0x3f);
+ counter--;
+ /* Just processed the final byte. Emit the character,
+ avoiding over-long sequences. */
+ if (!counter)
+ {
+ if ((ch < 0x80) ||
+ ((ch < 0x800) && indicated_length > 2) ||
+ ((ch < 0x1000) && indicated_length > 3) ||
+ ((ch < 0x10000) && indicated_length > 4))
+ {
+ indicate_invalid_utf_8(indicated_length,
+ counter,
+ ch, dst, data,
+ ignore_bom);
+ }
+ else
+ {
+ decode_unicode_char (ch, dst, data, ignore_bom);
+ }
+ ch = 0;
+ }
+ }
}
break;
@@ -1987,20 +2098,27 @@
if (counter == 32)
{
int tempch;
- /* #### Signalling an error may be a bit extreme. Should
- we try and read it in anyway? */
+
if (!valid_utf_16_first_surrogate(ch >> 16)
|| !valid_utf_16_last_surrogate(ch & 0xFFFF))
{
- signal_error(Qtext_conversion_error,
- "Invalid UTF-16 surrogate sequence",
- Qunbound);
+ DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
+ ignore_bom);
}
- tempch = utf_16_surrogates_to_code((ch >> 16),
- (ch & 0xffff));
+ else
+ {
+ tempch = utf_16_surrogates_to_code((ch >> 16),
+ (ch & 0xffff));
+ decode_unicode_char(tempch, dst, data, ignore_bom);
+ }
ch = 0;
counter = 0;
- decode_unicode_char(tempch, dst, data, ignore_bom);
}
break;
@@ -2012,15 +2130,37 @@
counter += 8;
if (counter == 32)
{
- int tempch = ch;
- ch = 0;
- counter = 0;
- if (tempch < 0)
+ if (ch < 0)
{
- /* !!#### indicate an error */
- tempch = '~';
+ if (little_endian)
+ {
+ DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
+ ignore_bom);
+ }
+ else
+ {
+ DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+ ignore_bom);
+ DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
+ ignore_bom);
+ }
}
- decode_unicode_char (tempch, dst, data, ignore_bom);
+ else
+ {
+ decode_unicode_char (ch, dst, data, ignore_bom);
+ }
+ ch = 0;
+ counter = 0;
}
break;
@@ -2032,10 +2172,14 @@
}
}
- if (str->eof)
- DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
+ if (str->eof && ch)
+ {
+ DECODE_ERROR_OCTET (ch, dst, data, ignore_bom);
+ ch = 0;
+ }
data->counter = counter;
+ data->indicated_length = indicated_length;
}
else
{
@@ -2054,7 +2198,7 @@
if (XCODING_SYSTEM_UNICODE_NEED_BOM (str->codesys) &&
!data->wrote_bom)
{
- encode_unicode_char_1 (0xFEFF, dst, type, little_endian);
+ encode_unicode_char_1 (0xFEFF, dst, type, little_endian, 1);
data->wrote_bom = 1;
}
@@ -2068,7 +2212,7 @@
{ /* Processing ASCII character */
ch = 0;
encode_unicode_char (Vcharset_ascii, c, 0, dst, type,
- little_endian);
+ little_endian, 1);
char_boundary = 1;
}
@@ -2092,20 +2236,20 @@
for the rationale behind subtracting #xa0 from the
character's code. */
encode_unicode_char (Vcharset_control_1, c - 0xa0, 0, dst,
- type, little_endian);
+ type, little_endian, 1);
else
{
switch (XCHARSET_REP_BYTES (charset))
{
case 2:
encode_unicode_char (charset, c, 0, dst, type,
- little_endian);
+ little_endian, 1);
break;
case 3:
if (XCHARSET_PRIVATE_P (charset))
{
encode_unicode_char (charset, c, 0, dst, type,
- little_endian);
+ little_endian, 1);
ch = 0;
}
else if (ch)
@@ -2119,7 +2263,7 @@
handle this yet. */
encode_unicode_char (Vcharset_ascii, '~', 0,
dst, type,
- little_endian);
+ little_endian, 1);
}
else
{
@@ -2138,7 +2282,7 @@
else
#endif /* ENABLE_COMPOSITE_CHARS */
encode_unicode_char (charset, ch, c, dst, type,
- little_endian);
+ little_endian, 1);
ch = 0;
}
else
@@ -2151,7 +2295,7 @@
if (ch)
{
encode_unicode_char (charset, ch, c, dst, type,
- little_endian);
+ little_endian, 1);
ch = 0;
}
else
--
On the quay of the little Black Sea port, where the rescued pair came once
more into contact with civilization, Dobrinton was bitten by a dog which was
assumed to be mad, though it may only have been indiscriminating. (Saki)
_______________________________________________
XEmacs-Patches mailing list
XEmacs-Patches(a)xemacs.org
http://calypso.tux.org/cgi-bin/mailman/listinfo/xemacs-patches