[PATCH] Handle UTF-8 more robustly; pass through information about incorrect sequences

Saturday, 21 July 2007

I’m running with this code, and it--combined with the following Lisp--makes
IRCing on a channel where most of the traffic is in UTF-8, but some people
occasionally send ISO 8859-1, much more comfortable.

I’m not certain about the sanity of the UTF-16 part of this, mind. More work
to be done. 

;; For every octet value i in #x00..#xFF, build a string glyph from
;; (string i), give it the `red' face, and install it in the current
;; display table for the character at Unicode code point #x200000 + i.
;; #x200000 is the value the patch below defines as
;; UNICODE_ERROR_OCTET_RANGE_START, i.e. where undecodable octets are mapped.
(loop for i from #x00 to #xFF 
  with glyph = nil 
  with char-table = (specifier-instance current-display-table) 
  do (setq glyph (make-glyph (vector 'string :data (string i)))) 
  (set-glyph-face glyph 'red) 
  (put-char-table (decode-char 'ucs (+ #x200000 i)) glyph char-table)) 

src/ChangeLog addition:

2007-07-21  Aidan Kehoe  <kehoea(a)parhasard.net>

	* charset.h:
	* mule-coding.c:
	* mule-coding.c (dynarr_add_2022_one_dimension):
	* mule-coding.c (dynarr_add_2022_two_dimensions):
	* mule-coding.c (struct iso2022_coding_stream):
	* mule-coding.c (decode_unicode_char):
	* mule-coding.c (indicate_invalid_utf_8):
	* mule-coding.c (iso2022_decode):
	* unicode.c:
	* unicode.c (struct unicode_coding_stream):
	* unicode.c (decode_unicode_char):
	* unicode.c (DECODE_ERROR_OCTET):
	* unicode.c (indicate_invalid_utf_8):
	* unicode.c (encode_unicode_char_1):
	* unicode.c (encode_unicode_char):
	* unicode.c (unicode_convert):
	Make UTF-8 handling more robust; indicate error sequences when
	decoding, passing the octets as distinct from the corresponding
	ISO8859-1 characters, and (by default) writing them to disk on
	encoding. Don't accept UTF-8 sequences longer than four octets on
	reading in the utf-8 coding system; do accept them in the ISO IR 196
	handling, since we decode Unicode error sequences to "Unicode" code
	points starting at 0x200000, and will need to save them as such
        in escape-quoted. 

	This change means that when a non-UTF-8 file is opened as UTF-8,
	one change is made, and the file is immediately saved, the non-ASCII
	characters are not corrupted. In Europe, this is a distinct win. 

	Don't error on invalid UTF-16 sequences; pass them through, using
	the same error octets. 

XEmacs Trunk source patch:
Diff command:   cvs -q diff -Nu
Files affected: src/unicode.c
===================================================================
RCS src/mule-coding.c
===================================================================
RCS src/charset.h
===================================================================
RCS

Index: src/charset.h
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/charset.h,v
retrieving revision 1.16
diff -u -u -r1.16 charset.h
--- src/charset.h	2006/11/12 13:40:07	1.16
+++ src/charset.h	2007/07/21 15:03:21
@@ -572,7 +572,10 @@

 void encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h,
 			  int USED_IF_MULE (l), unsigned_char_dynarr *dst,
-			  enum unicode_type type, unsigned int little_endian);
+			  enum unicode_type type, unsigned int little_endian,
+                          int write_error_characters_as_such);
+
+#define UNICODE_ERROR_OCTET_RANGE_START 0x200000

 void set_charset_registries(Lisp_Object charset, Lisp_Object registries);

Index: src/mule-coding.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/mule-coding.c,v
retrieving revision 1.39
diff -u -u -r1.39 mule-coding.c
--- src/mule-coding.c	2006/11/23 13:43:19	1.39
+++ src/mule-coding.c	2007/07/21 15:04:59
@@ -104,7 +104,7 @@
   if (XCHARSET_ENCODE_AS_UTF_8 (charset)) 
     {
       encode_unicode_char (charset, c & charmask, 0,	
-			   dst, UNICODE_UTF_8, 0);		
+			   dst, UNICODE_UTF_8, 0, 0); 
     } 
   else							
     {							
@@ -123,7 +123,7 @@
       encode_unicode_char (charset,				
 			   ch & charmask,			
 			   c & charmask, dst,		
-			   UNICODE_UTF_8, 0);		
+			   UNICODE_UTF_8, 0, 0); 
     }							
   else							
     {							
@@ -969,6 +969,7 @@

   /* Used for handling UTF-8. */
   unsigned char counter;  
+  unsigned char indicated_length;
 };

 static const struct memory_description ccs_description_1[] =
@@ -1804,6 +1805,39 @@
     }
 }

+/* Note that this name conflicts with a function in unicode.c. */
+static void
+decode_unicode_char (int ucs, unsigned_char_dynarr *dst)
+{
+  Ibyte work[MAX_ICHAR_LEN];
+  int len;
+  Lisp_Object chr;
+
+  chr = Funicode_to_char(make_int(ucs), Qnil);
+  assert (!NILP(chr));
+  len = set_itext_ichar (work, XCHAR(chr));
+  Dynarr_add_many (dst, work, len);
+}
+
+#define DECODE_ERROR_OCTET(octet, dst) \
+  decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, dst)
+
+static inline void
+indicate_invalid_utf_8 (unsigned char indicated_length,
+                        unsigned char counter,
+                        int ch, unsigned_char_dynarr *dst)
+{
+  Binbyte stored = indicated_length - counter; 
+  Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length];
+
+  while (stored > 0)
+    {
+      DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask,
+                          dst);
+      mask = 0x80, stored--;
+    }
+}
+
 /* Convert ISO2022-format data to internal format. */

 static Bytecount
@@ -1907,9 +1941,7 @@
       else if (flags & ISO_STATE_UTF_8)
 	{
 	  unsigned char counter = data->counter; 
-	  Ibyte work[MAX_ICHAR_LEN];
-	  int len;
-	  Lisp_Object chr;
+          unsigned char indicated_length = data->indicated_length;

 	  if (ISO_CODE_ESC == c)
 	    {
@@ -1919,74 +1951,123 @@
 	      data->esc_bytes_index = 1;
 	      continue;
 	    }
-
-	  switch (counter)
-	    {
-	    case 0:
-	      if (c >= 0xfc)
-		{
-		  ch = c & 0x01;
-		  counter = 5;
-		}
-	      else if (c >= 0xf8)
-		{
-		  ch = c & 0x03;
-		  counter = 4;
-		}
-	      else if (c >= 0xf0)
-		{
-		  ch = c & 0x07;
-		  counter = 3;
-		}
-	      else if (c >= 0xe0)
-		{
-		  ch = c & 0x0f;
-		  counter = 2;
-		}
-	      else if (c >= 0xc0)
-		{
-		  ch = c & 0x1f;
-		  counter = 1;
-		}
-	      else
-		/* ASCII, or the lower control characters.
-                   
-                   Perhaps we should signal an error if the character is in
-                   the range 0x80-0xc0; this is illegal UTF-8. */
-                Dynarr_add (dst, (c & 0x7f));
-
-	      break;
-	    case 1:
-	      ch = (ch << 6) | (c & 0x3f);
-	      chr = Funicode_to_char(make_int(ch), Qnil);			
-
-	      if (!NILP (chr))						
-		{								
-		  assert(CHARP(chr));					
-		  len = set_itext_ichar (work, XCHAR(chr));		
-		  Dynarr_add_many (dst, work, len);			
-		}								
-	      else							
-		{								
-		  /* Shouldn't happen, this code should only be enabled in
-		     XEmacsen with support for all of Unicode. */
-		  Dynarr_add (dst, LEADING_BYTE_JAPANESE_JISX0208);	
-		  Dynarr_add (dst, 34 + 128);				
-		  Dynarr_add (dst, 46 + 128);				
-		}								
-
-	      ch = 0;
-	      counter = 0;
-	      break;
-	    default:
-	      ch = (ch << 6) | (c & 0x3f);
-	      counter--;
-	    }

-	  if (str->eof)
-	    DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
+          if (0 == counter)
+            {
+              if (0 == (c & 0x80))
+                {
+                  /* ASCII. */
+                  decode_unicode_char (c, dst);
+                }
+              else if (0 == (c & 0x40))
+                {
+                  /* Highest bit set, second highest not--there's
+                     something wrong. */
+                  DECODE_ERROR_OCTET (c, dst);
+                }
+              else if (0 == (c & 0x20))
+                {
+                  ch = c & 0x1f; 
+                  counter = 1;
+                  indicated_length = 2;
+                }
+              else if (0 == (c & 0x10))
+                {
+                  ch = c & 0x0f;
+                  counter = 2;
+                  indicated_length = 3;
+                }
+              else if (0 == (c & 0x08))
+                {
+                  ch = c & 0x0f;
+                  counter = 3;
+                  indicated_length = 4;
+                }
+              /* We support lengths longer than 4 here, since we want to
+                 represent UTF-8 error chars as distinct from the
+                 corresponding ISO 8859-1 characters in escape-quoted.
+
+                 However, we can't differentiate UTF-8 error chars as
+                 written to disk, and UTF-8 errors in escape-quoted.  This
+                 is not a big problem;
+                 non-Unicode-chars-encoded-as-UTF-8-in-ISO-2022 is not
+                 deployed, in practice, so if such a sequence of octets
+                 occurs, XEmacs generated it.  */
+              else if (0 == (c & 0x04))
+                {
+                  ch = c & 0x03;
+                  counter = 4;
+                  indicated_length = 5;
+                }
+              else if (0 == (c & 0x02))
+                {
+                  ch = c & 0x01;
+                  counter = 5;
+                  indicated_length = 6;
+                }
+              else
+                {
+                  /* #xFF is not a valid leading byte in any form of
+                     UTF-8. */
+                  DECODE_ERROR_OCTET (c, dst);
+
+                }
+            }
+          else
+            {
+              /* counter != 0 */
+              if ((0 == (c & 0x80)) || (0 != (c & 0x40)))
+                {
+                  indicate_invalid_utf_8(indicated_length, 
+                                         counter, 
+                                         ch, dst);
+                  if (c & 0x80)
+                    {
+                      DECODE_ERROR_OCTET (c, dst);
+                    }
+                  else
+                    {
+                      /* The character just read is ASCII. Treat it as
+                         such.  */
+                      decode_unicode_char (c, dst);
+                    }
+                  ch = 0;
+                  counter = 0;
+                }
+              else 
+                {
+                  ch = (ch << 6) | (c & 0x3f);
+                  counter--;
+                  /* Just processed the final byte. Emit the character,
+                     avoiding over-long sequences. */
+                  if (!counter)
+                    {
+                      if ((ch < 0x80) ||
+                          ((ch < 0x800) && indicated_length > 2) || 
+                          ((ch < 0x1000) && indicated_length > 3) || 
+                          ((ch < 0x10000) && indicated_length > 4))
+                        {
+                          indicate_invalid_utf_8(indicated_length, 
+                                                 counter, 
+                                                 ch, dst);
+                        }
+                      else
+                        {
+                          decode_unicode_char (ch, dst);
+                        }
+                      ch = 0;
+                    }
+                }
+            }
+
+          if (str->eof && ch)
+            {
+              DECODE_ERROR_OCTET (ch, dst);
+              ch  = 0;
+            }

 	  data->counter = counter;
+	  data->indicated_length = indicated_length;
 	}
       else if (byte_c0_p (c) || byte_c1_p (c))
 	{ /* Control characters */
Index: src/unicode.c
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/src/unicode.c,v
retrieving revision 1.37
diff -u -u -r1.37 unicode.c
--- src/unicode.c	2007/05/13 11:11:30	1.37
+++ src/unicode.c	2007/07/21 15:05:45
@@ -146,13 +146,6 @@
    (1) User-defined charsets: It would be inconvenient to require all
    dumped user-defined charsets to be reloaded at init time.

-   (2) Starting up in a non-ISO-8859-1 directory.  If we load at run-time,
-   we don't load the tables until after we've parsed the current
-   directories, and we run into a real bootstrapping problem, if the
-   directories themselves are non-ISO-8859-1.  This is potentially fixable
-   once we switch to using Unicode internally, so we don't have to do any
-   conversion (other than the automatic kind, e.g. UTF-16 to UTF-8).
-
    NB With run-time loading, we load in init-mule-at-startup, in
    mule-cmds.el.  This is called from startup.el, which is quite late in
    the initialization process -- but data-directory isn't set until then.
@@ -1703,6 +1696,7 @@
 {
   /* decode */
   unsigned char counter;
+  unsigned char indicated_length;
   int seen_char;
   /* encode */
   Lisp_Object current_charset;
@@ -1716,11 +1710,6 @@

 DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (unicode);

-/* Decode a UCS-2 or UCS-4 character into a buffer.  If the lookup fails, use
-   <GETA MARK> (U+3013) of JIS X 0208, which means correct character
-   is not found, instead.
-   #### do something more appropriate (use blob?)
-        Danger, Will Robinson!  Data loss.  Should we signal user? */
 static void
 decode_unicode_char (int ch, unsigned_char_dynarr *dst,
 		     struct unicode_coding_stream *data,
@@ -1755,9 +1744,32 @@
   data->seen_char = 1;
 }

+#define DECODE_ERROR_OCTET(octet, dst, data, ignore_bom) \
+  decode_unicode_char ((octet) + UNICODE_ERROR_OCTET_RANGE_START, \
+                       dst, data, ignore_bom)
+
+static inline void
+indicate_invalid_utf_8 (unsigned char indicated_length,
+                        unsigned char counter,
+                        int ch, unsigned_char_dynarr *dst,
+                        struct unicode_coding_stream *data,
+                        unsigned int ignore_bom)
+{
+  Binbyte stored = indicated_length - counter; 
+  Binbyte mask = "\x00\x00\xC0\xE0\xF0\xF8\xFC"[indicated_length];
+
+  while (stored > 0)
+    {
+      DECODE_ERROR_OCTET (((ch >> (6 * (stored - 1))) & 0x3f) | mask,
+                        dst, data, ignore_bom);
+      mask = 0x80, stored--;
+    }
+}
+
 static void
 encode_unicode_char_1 (int code, unsigned_char_dynarr *dst,
-		       enum unicode_type type, unsigned int little_endian)
+		       enum unicode_type type, unsigned int little_endian,
+                       int write_error_characters_as_such)
 {
   switch (type)
     {
@@ -1768,16 +1780,25 @@
 	    Dynarr_add (dst, (unsigned char) (code & 255));
 	    Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
 	  } else {
-	    /* Little endian; least significant byte first. */
-	    int first, second;
-
-	    CODE_TO_UTF_16_SURROGATES(code, first, second);
-
-	    Dynarr_add (dst, (unsigned char) (first & 255));
-	    Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
-
-	    Dynarr_add (dst, (unsigned char) (second & 255));
-	    Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
+            if (write_error_characters_as_such && 
+                code >= UNICODE_ERROR_OCTET_RANGE_START &&
+                code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+              {
+                Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+              }
+            else
+              {
+                /* Little endian; least significant byte first. */
+                int first, second;
+
+                CODE_TO_UTF_16_SURROGATES(code, first, second);
+
+                Dynarr_add (dst, (unsigned char) (first & 255));
+                Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
+
+                Dynarr_add (dst, (unsigned char) (second & 255));
+                Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
+              }
 	  }
 	}
       else
@@ -1786,16 +1807,25 @@
 	    Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
 	    Dynarr_add (dst, (unsigned char) (code & 255));
 	  } else {
-	    /* Big endian; most significant byte first. */
-	    int first, second;
-
-	    CODE_TO_UTF_16_SURROGATES(code, first, second);
-
-	    Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
-	    Dynarr_add (dst, (unsigned char) (first & 255));
-
-	    Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
-	    Dynarr_add (dst, (unsigned char) (second & 255));
+            if (write_error_characters_as_such && 
+                code >= UNICODE_ERROR_OCTET_RANGE_START &&
+                code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+              {
+                Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+              }
+            else
+              {
+                /* Big endian; most significant byte first. */
+                int first, second;
+
+                CODE_TO_UTF_16_SURROGATES(code, first, second);
+
+                Dynarr_add (dst, (unsigned char) ((first >> 8) & 255));
+                Dynarr_add (dst, (unsigned char) (first & 255));
+
+                Dynarr_add (dst, (unsigned char) ((second >> 8) & 255));
+                Dynarr_add (dst, (unsigned char) (second & 255));
+              }
 	  }
 	}
       break;
@@ -1803,17 +1833,35 @@
     case UNICODE_UCS_4:
       if (little_endian)
 	{
-	  Dynarr_add (dst, (unsigned char) (code & 255));
-	  Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
-	  Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
-	  Dynarr_add (dst, (unsigned char) (code >> 24));
+          if (write_error_characters_as_such && 
+              code >= UNICODE_ERROR_OCTET_RANGE_START &&
+              code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+            {
+              Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+            }
+          else
+            {
+              Dynarr_add (dst, (unsigned char) (code & 255));
+              Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+              Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
+              Dynarr_add (dst, (unsigned char) (code >> 24));
+            }
 	}
       else
 	{
-	  Dynarr_add (dst, (unsigned char) (code >> 24));
-	  Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
-	  Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
-	  Dynarr_add (dst, (unsigned char) (code & 255));
+          if (write_error_characters_as_such && 
+              code >= UNICODE_ERROR_OCTET_RANGE_START &&
+              code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+            {
+              Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+            }
+          else
+            {
+              Dynarr_add (dst, (unsigned char) (code >> 24));
+              Dynarr_add (dst, (unsigned char) ((code >> 16) & 255));
+              Dynarr_add (dst, (unsigned char) ((code >> 8) & 255));
+              Dynarr_add (dst, (unsigned char) (code & 255));
+            }
 	}
       break;

@@ -1842,11 +1890,25 @@
 	}
       else if (code <= 0x3ffffff)
 	{
-	  Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8));
-	  Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80));
-	  Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80));
-	  Dynarr_add (dst, (unsigned char) (((code >>  6) & 0x3f) | 0x80));
-	  Dynarr_add (dst, (unsigned char) ((code        & 0x3f) | 0x80));
+
+#if !(UNICODE_ERROR_OCTET_RANGE_START > 0x1fffff \
+          && UNICODE_ERROR_OCTET_RANGE_START < 0x3ffffff)
+#error "This code needs to be rewritten. " 
+#endif
+          if (write_error_characters_as_such && 
+              code >= UNICODE_ERROR_OCTET_RANGE_START &&
+              code < (UNICODE_ERROR_OCTET_RANGE_START + 0x100))
+            {
+              Dynarr_add (dst, (unsigned char) ((code & 0xFF)));
+            }
+          else 
+            {
+              Dynarr_add (dst, (unsigned char) ((code >> 24) | 0xf8));
+              Dynarr_add (dst, (unsigned char) (((code >> 18) & 0x3f) | 0x80));
+              Dynarr_add (dst, (unsigned char) (((code >> 12) & 0x3f) | 0x80));
+              Dynarr_add (dst, (unsigned char) (((code >>  6) & 0x3f) | 0x80));
+              Dynarr_add (dst, (unsigned char) ((code        & 0x3f) | 0x80));
+            }
 	}
       else
 	{
@@ -1870,7 +1932,8 @@
 void
 encode_unicode_char (Lisp_Object USED_IF_MULE (charset), int h,
 		     int USED_IF_MULE (l), unsigned_char_dynarr *dst,
-		     enum unicode_type type, unsigned int little_endian)
+		     enum unicode_type type, unsigned int little_endian,
+                     int write_error_characters_as_such)
 {
 #ifdef MULE
   int code = ichar_to_unicode (make_ichar (charset, h & 127, l & 127));
@@ -1896,7 +1959,8 @@
   int code = h;
 #endif /* MULE */

-  encode_unicode_char_1 (code, dst, type, little_endian);
+  encode_unicode_char_1 (code, dst, type, little_endian, 
+                         write_error_characters_as_such);
 }

 static Bytecount
@@ -1915,6 +1979,8 @@
   if (str->direction == CODING_DECODE)
     {
       unsigned char counter = data->counter;
+      unsigned char indicated_length
+        = data->indicated_length;

       while (n--)
 	{
@@ -1923,46 +1989,91 @@
 	  switch (type)
 	    {
 	    case UNICODE_UTF_8:
-	      switch (counter)
-		{
-		case 0:
-		  if (c >= 0xfc)
-		    {
-		      ch = c & 0x01;
-		      counter = 5;
-		    }
-		  else if (c >= 0xf8)
-		    {
-		      ch = c & 0x03;
-		      counter = 4;
-		    }
-		  else if (c >= 0xf0)
-		    {
-		      ch = c & 0x07;
-		      counter = 3;
-		    }
-		  else if (c >= 0xe0)
-		    {
-		      ch = c & 0x0f;
-		      counter = 2;
-		    }
-		  else if (c >= 0xc0)
-		    {
-		      ch = c & 0x1f;
-		      counter = 1;
-		    }
-		  else
-		    decode_unicode_char (c, dst, data, ignore_bom);
-		  break;
-		case 1:
-		  ch = (ch << 6) | (c & 0x3f);
-		  decode_unicode_char (ch, dst, data, ignore_bom);
-		  ch = 0;
-		  counter = 0;
-		  break;
-		default:
-		  ch = (ch << 6) | (c & 0x3f);
-		  counter--;
+              if (0 == counter)
+                {
+                  if (0 == (c & 0x80))
+                    {
+                      /* ASCII. */
+                      decode_unicode_char (c, dst, data, ignore_bom);
+                    }
+                  else if (0 == (c & 0x40))
+                    {
+                      /* Highest bit set, second highest not--there's
+                         something wrong. */
+                      DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
+                    }
+                  else if (0 == (c & 0x20))
+                    {
+                      ch = c & 0x1f; 
+                      counter = 1;
+                      indicated_length = 2;
+                    }
+                  else if (0 == (c & 0x10))
+                    {
+                      ch = c & 0x0f;
+                      counter = 2;
+                      indicated_length = 3;
+                    }
+                  else if (0 == (c & 0x08))
+                    {
+                      ch = c & 0x0f;
+                      counter = 3;
+                      indicated_length = 4;
+                    }
+                  else
+                    {
+                      /* We don't support lengths longer than 4 in
+                         external-format data. */
+                      DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
+
+                    }
+                }
+              else
+                {
+                  /* counter != 0 */
+                  if ((0 == (c & 0x80)) || (0 != (c & 0x40)))
+                    {
+                      indicate_invalid_utf_8(indicated_length, 
+                                             counter, 
+                                             ch, dst, data, ignore_bom);
+                      if (c & 0x80)
+                        {
+                          DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
+                        }
+                      else
+                        {
+                          /* The character just read is ASCII. Treat it as
+                             such.  */
+                          decode_unicode_char (c, dst, data, ignore_bom);
+                        }
+                      ch = 0;
+                      counter = 0;
+                    }
+                  else 
+                    {
+                      ch = (ch << 6) | (c & 0x3f);
+                      counter--;
+                      /* Just processed the final byte. Emit the character,
+                         avoiding over-long sequences. */
+                      if (!counter)
+                        {
+                          if ((ch < 0x80) ||
+                              ((ch < 0x800) && indicated_length > 2) || 
+                              ((ch < 0x1000) && indicated_length > 3) || 
+                              ((ch < 0x10000) && indicated_length > 4))
+                            {
+                              indicate_invalid_utf_8(indicated_length, 
+                                                     counter, 
+                                                     ch, dst, data,
+                                                     ignore_bom);
+                            }
+                          else
+                            {
+                              decode_unicode_char (ch, dst, data, ignore_bom);
+                            }
+                          ch = 0;
+                        }
+                    }
 		}
 	      break;

@@ -1987,20 +2098,27 @@
 	      if (counter == 32)
 		{
 		  int tempch;
-		  /* #### Signalling an error may be a bit extreme. Should
-		     we try and read it in anyway? */
+
 		  if (!valid_utf_16_first_surrogate(ch >> 16) 
 		      || !valid_utf_16_last_surrogate(ch & 0xFFFF))
 		    {
-		      signal_error(Qtext_conversion_error, 
-				   "Invalid UTF-16 surrogate sequence", 
-				   Qunbound);
+                      DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
+                                        ignore_bom);
+                      DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+                                        ignore_bom);
+                      DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+                                        ignore_bom);
+                      DECODE_ERROR_OCTET (ch & 0xFF, dst, data,
+                                        ignore_bom);
 		    }
-		  tempch = utf_16_surrogates_to_code((ch >> 16), 
-						     (ch & 0xffff));
+                  else 
+                    {
+                      tempch = utf_16_surrogates_to_code((ch >> 16), 
+                                                         (ch & 0xffff));
+                      decode_unicode_char(tempch, dst, data, ignore_bom);
+                    }
 		  ch = 0;
 		  counter = 0;
-		  decode_unicode_char(tempch, dst, data, ignore_bom);
 		}
 	      break;

@@ -2012,15 +2130,37 @@
 	      counter += 8;
 	      if (counter == 32)
 		{
-		  int tempch = ch;
-		  ch = 0;
-		  counter = 0;
-		  if (tempch < 0)
+		  if (ch < 0)
 		    {
-		      /* !!#### indicate an error */
-		      tempch = '~';
+                      if (little_endian)
+                        {
+                          DECODE_ERROR_OCTET (ch & 0xFF, dst, data, 
+                                            ignore_bom);
+                          DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+                                            ignore_bom);
+                          DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+                                            ignore_bom);
+                          DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
+                                            ignore_bom);
+                        }
+                      else
+                        {
+                          DECODE_ERROR_OCTET ((ch >> 24) & 0xFF, dst, data,
+                                            ignore_bom);
+                          DECODE_ERROR_OCTET ((ch >> 16) & 0xFF, dst, data,
+                                            ignore_bom);
+                          DECODE_ERROR_OCTET ((ch >> 8) & 0xFF, dst, data,
+                                            ignore_bom);
+                          DECODE_ERROR_OCTET (ch & 0xFF, dst, data, 
+                                            ignore_bom);
+                        }
 		    }
-		  decode_unicode_char (tempch, dst, data, ignore_bom);
+                  else
+                    {
+                      decode_unicode_char (ch, dst, data, ignore_bom);
+                    }
+		  ch = 0;
+		  counter = 0;
 		}
 	      break;

@@ -2032,10 +2172,14 @@
 	    }

 	}
-      if (str->eof)
-	DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
+      if (str->eof && ch)
+        {
+          DECODE_ERROR_OCTET (ch, dst, data, ignore_bom);
+          ch  = 0;
+        }

       data->counter = counter;
+      data->indicated_length = indicated_length;
     }
   else
     {
@@ -2054,7 +2198,7 @@

      if (XCODING_SYSTEM_UNICODE_NEED_BOM (str->codesys) && !data->wrote_bom)
 	{
-	  encode_unicode_char_1 (0xFEFF, dst, type, little_endian);
+	  encode_unicode_char_1 (0xFEFF, dst, type, little_endian, 1);
 	  data->wrote_bom = 1;
 	}

@@ -2068,7 +2212,7 @@
 	    {			/* Processing ASCII character */
 	      ch = 0;
 	      encode_unicode_char (Vcharset_ascii, c, 0, dst, type,
-				   little_endian);
+				   little_endian, 1);

 	      char_boundary = 1;
 	    }
@@ -2092,20 +2236,20 @@
 		   for the rationale behind subtracting #xa0 from the
 		   character's code. */
 		encode_unicode_char (Vcharset_control_1, c - 0xa0, 0, dst,
-				     type, little_endian);
+				     type, little_endian, 1);
 	      else
 		{
 		  switch (XCHARSET_REP_BYTES (charset))
 		    {
 		    case 2:
 		      encode_unicode_char (charset, c, 0, dst, type,
-					   little_endian);
+					   little_endian, 1);
 		      break;
 		    case 3:
 		      if (XCHARSET_PRIVATE_P (charset))
 			{
 			  encode_unicode_char (charset, c, 0, dst, type,
-					       little_endian);
+					       little_endian, 1);
 			  ch = 0;
 			}
 		      else if (ch)
@@ -2119,7 +2263,7 @@
 				     handle this yet. */
 				  encode_unicode_char (Vcharset_ascii, '~', 0,
 						       dst, type,
-						       little_endian);
+						       little_endian, 1);
 				}
 			      else
 				{
@@ -2138,7 +2282,7 @@
 			  else
 #endif /* ENABLE_COMPOSITE_CHARS */
 			    encode_unicode_char (charset, ch, c, dst, type,
-						 little_endian);
+						 little_endian, 1);
 			  ch = 0;
 			}
 		      else
@@ -2151,7 +2295,7 @@
 		      if (ch)
 			{
 			  encode_unicode_char (charset, ch, c, dst, type,
-					       little_endian);
+					       little_endian, 1);
 			  ch = 0;
 			}
 		      else

-- 
On the quay of the little Black Sea port, where the rescued pair came once
more into contact with civilization, Dobrinton was bitten by a dog which was
assumed to be mad, though it may only have been indiscriminating. (Saki)

_______________________________________________
XEmacs-Patches mailing list
XEmacs-Patches(a)xemacs.org
http://calypso.tux.org/cgi-bin/mailman/listinfo/xemacs-patches

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

2008

2007

2006

2005

2004

2003

[PATCH] Handle UTF-8 more robustly; pass through information about incorrect sequences