[XEMACS PATCH] Have coding systems do some bytecount->charcount work

Thursday, 19 December 2013

With large no-conversion-unix files (in particular, VM buffers), 10% of the
time needed to read them in is spent in bytecount_to_charcount_func(),
working out the byte-character correspondence for the buffer code.

The coding systems generally know exactly where the character boundaries
are, though, and if they record that information, there’s no need for the
buffer insertion code to redo the work. The patch below gives noticeably
snappier performance for me when loading large files with reasonable amounts
of non-ASCII characters. It is very much not ready to commit; I am posting it
to show the idea and because I don’t anticipate finishing it this month.

diff -r 94a6b8fbd56e src/file-coding.c
--- a/src/file-coding.c	Tue Dec 17 20:49:52 2013 +0200
+++ b/src/file-coding.c	Thu Dec 19 10:47:01 2013 +0000
＠＠ -1990,6 +1990,14 ＠＠
   return Lstream_seekable_p (str->other_end);
 }

+static Charcount
+coding_character_tell (Lstream *stream)
+{
+  struct coding_stream *str = CODING_STREAM_DATA (stream);
+  /* Presumably yields -1 when the codesys has no character_tell method. */
+  return XCODESYSMETH_OR_GIVEN (str->codesys, character_tell, (str), -1);
+}
+
 static int
 coding_flusher (Lstream *stream)
 {
＠＠ -2823,7 +2831,31 ＠＠

    #### Shouldn't we _call_ it that, then?  And while we're at it,
    separate it into "to_internal" and "to_external"? */
-DEFINE_CODING_SYSTEM_TYPE (no_conversion);
+
+
+struct no_conversion_coding_system
+{
+};
+
+struct no_conversion_coding_stream
+{
+  Charcount characters_seen; /* Grown by orign in the decode path. */
+};
+
+static const struct memory_description no_conversion_coding_system_description[] = {
+  { XD_END }
+};
+
+static const struct memory_description no_conversion_coding_stream_description_1 [] = {
+  { XD_INT, offsetof (struct no_conversion_coding_stream, characters_seen) },
+  { XD_END }
+};
+
+const struct sized_memory_description no_conversion_coding_stream_description = {
+  sizeof (struct no_conversion_coding_stream), no_conversion_coding_stream_description_1
+};
+
+DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (no_conversion);

 /* This is used when reading in "binary" files -- i.e. files that may
    contain all 256 possible byte values and that are not to be
＠＠ -2846,12 +2878,14 ＠＠
 	  DECODE_ADD_BINARY_CHAR (c, dst);
 	}

+      CODING_STREAM_TYPE_DATA (str, no_conversion)->characters_seen
+        += orign;
+
       if (str->eof)
 	DECODE_OUTPUT_PARTIAL_CHAR (ch, dst);
     }
   else
     {
-
       while (n--)
 	{
 	  c = *src++;
＠＠ -2893,6 +2927,13 ＠＠
   return orign;
 }

+static Charcount
+no_conversion_character_tell (struct coding_stream *str)
+{
+#warning "examine the coding character mode too"
+  return CODING_STREAM_TYPE_DATA (str, no_conversion)->characters_seen;
+}
+
 DEFINE_DETECTOR (no_conversion);
 DEFINE_DETECTOR_CATEGORY (no_conversion, no_conversion);

＠＠ -4645,6 +4686,7 ＠＠
   LSTREAM_HAS_METHOD (coding, writer);
   LSTREAM_HAS_METHOD (coding, rewinder);
   LSTREAM_HAS_METHOD (coding, seekable_p);
+  LSTREAM_HAS_METHOD (coding, character_tell);
   LSTREAM_HAS_METHOD (coding, marker);
   LSTREAM_HAS_METHOD (coding, flusher);
   LSTREAM_HAS_METHOD (coding, closer);
＠＠ -4686,9 +4728,10 ＠＠
   dump_add_opaque_int (&coding_detector_count);
   dump_add_opaque_int (&coding_detector_category_count);

-  INITIALIZE_CODING_SYSTEM_TYPE (no_conversion,
-				 "no-conversion-coding-system-p");
+  INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (no_conversion,
+                                           "no-conversion-coding-system-p");
   CODING_SYSTEM_HAS_METHOD (no_conversion, convert);
+  CODING_SYSTEM_HAS_METHOD (no_conversion, character_tell);

   INITIALIZE_DETECTOR (no_conversion);
   DETECTOR_HAS_METHOD (no_conversion, detect);
diff -r 94a6b8fbd56e src/file-coding.h
--- a/src/file-coding.h	Tue Dec 17 20:49:52 2013 +0200
+++ b/src/file-coding.h	Thu Dec 19 10:47:01 2013 +0000
＠＠ -353,6 +353,9 ＠＠
      a result of the stream being rewound.  Optional. */
   void (*rewind_coding_stream_method) (struct coding_stream *str);

+  /* Return the number of characters processed. Optional. */
+  Charcount (*character_tell_method) (struct coding_stream *str);
+
   /* Finalize coding stream method: Clean up the type-specific data
      attached to the coding stream (i.e. in struct TYPE_coding_stream).
      Happens when the Lstream is deleted using Lstream_delete() or is
＠＠ -1109,4 +1112,3 ＠＠
 			    int given);

 #endif /* INCLUDED_file_coding_h_ */
-
diff -r 94a6b8fbd56e src/fileio.c
--- a/src/fileio.c	Tue Dec 17 20:49:52 2013 +0200
+++ b/src/fileio.c	Thu Dec 19 10:47:01 2013 +0000
＠＠ -3196,7 +3196,8 ＠＠
     while (1)
       {
 	Bytecount this_len;
-	Charcount cc_inserted;
+	Charcount last_tell
+          = Lstream_character_tell (XLSTREAM (stream)), cc_inserted;

 	QUIT;
 	this_len = Lstream_read (XLSTREAM (stream), read_buf,
＠＠ -3209,10 +3210,13 ＠＠
 	    break;
 	  }

-	cc_inserted = buffer_insert_raw_string_1 (buf, cur_point, read_buf,
-						  this_len,
-						  !NILP (visit)
-						  ? INSDEL_NO_LOCKING : 0);
+	cc_inserted
+          = buffer_insert_string_1 (buf, cur_point, read_buf, Qnil,
+                                    0, this_len, last_tell > 0
+                                    ? Lstream_character_tell (XLSTREAM
+                                                              (stream))
+                                    - last_tell : -1,
+                                    !NILP (visit) ? INSDEL_NO_LOCKING : 0);
 	inserted  += cc_inserted;
 	cur_point += cc_inserted;
       }
diff -r 94a6b8fbd56e src/insdel.c
--- a/src/insdel.c	Tue Dec 17 20:49:52 2013 +0200
+++ b/src/insdel.c	Thu Dec 19 10:47:01 2013 +0000
＠＠ -1061,13 +1061,12 ＠＠
 buffer_insert_string_1 (struct buffer *buf, Charbpos pos,
 			const Ibyte *nonreloc, Lisp_Object reloc,
 			Bytecount offset, Bytecount length,
-			int flags)
+                        Charcount cclen, int flags)
 {
   /* This function can GC */
   struct gcpro gcpro1;
   Bytebpos bytepos;
   Bytecount length_in_buffer;
-  Charcount cclen;
   int move_point = 0;
   struct buffer *mbuf;
   Lisp_Object bufcons;
＠＠ -1118,14 +1117,30 ＠＠

   bytepos = charbpos_to_bytebpos (buf, pos);

-  /* string may have been relocated up to this point */
-  if (STRINGP (reloc))
+  if (cclen < 0)
     {
-      cclen = string_offset_byte_to_char_len (reloc, offset, length);
-      nonreloc = XSTRING_DATA (reloc);
+      /* string may have been relocated up to this point */
+      if (STRINGP (reloc))
+        {
+          cclen = string_offset_byte_to_char_len (reloc, offset, length);
+          nonreloc = XSTRING_DATA (reloc);
+        }
+      else
+        cclen = bytecount_to_charcount (nonreloc + offset, length);
     }
+#ifdef ERROR_CHECK_TEXT
   else
-    cclen = bytecount_to_charcount (nonreloc + offset, length);
+    {
+      text_checking_assert (cclen
+                            == (STRINGP (reloc) ?
+                                string_offset_byte_to_char_len (reloc,
+                                                                offset, length)
+: bytecount_to_charcount (nonreloc + offset,
+                                                          length)));
+                                
+    }
+#endif
+
   /* &&#### Here we check if the text can't fit into the format of the buffer,
      and if so convert it to another format (either default or 32-bit-fixed,
      according to some flag; if no flag, use default). */
＠＠ -1286,7 +1301,7 ＠＠
 {
   /* This function can GC */
   return buffer_insert_string_1 (buf, pos, nonreloc, Qnil, 0, length,
-				 flags);
+				 -1, flags);
 }

 Charcount
＠＠ -1295,8 +1310,7 ＠＠
 {
   /* This function can GC */
   return buffer_insert_string_1 (buf, pos, 0, str, 0,
-				 XSTRING_LENGTH (str),
-				 flags);
+				 XSTRING_LENGTH (str), -1, flags);
 }

 /* Insert the null-terminated string S (in external format). */
＠＠ -1309,7 +1323,7 ＠＠
   const CIbyte *translated = GETTEXT (s);
   ASSERT_ASCTEXT_ASCII (s);
   return buffer_insert_string_1 (buf, pos, (const Ibyte *) translated, Qnil,
-				 0, strlen (translated), flags);
+				 0, strlen (translated), -1, flags);
 }

 Charcount
＠＠ -1319,7 +1333,7 ＠＠
   /* This function can GC */
   Ibyte str[MAX_ICHAR_LEN];
   Bytecount len = set_itext_ichar (str, ch);
-  return buffer_insert_string_1 (buf, pos, str, Qnil, 0, len, flags);
+  return buffer_insert_string_1 (buf, pos, str, Qnil, 0, len, -1, flags);
 }

 Charcount
＠＠ -1339,7 +1353,7 ＠＠
   /* This function can GC */
   Lisp_Object str = make_string_from_buffer (buf2, pos2, length);
   return buffer_insert_string_1 (buf, pos, 0, str, 0,
-				 XSTRING_LENGTH (str), flags);
+				 XSTRING_LENGTH (str), -1, flags);
 }

＠＠ -1674,7 +1688,7 ＠＠
        * backward so that it now equals the insertion point.
        */
       buffer_insert_string_1 (buf, (movepoint ? -1 : pos),
-			      newstr, Qnil, 0, newlen, 0);
+			      newstr, Qnil, 0, newlen, -1, 0);
     }
 }

diff -r 94a6b8fbd56e src/insdel.h
--- a/src/insdel.h	Tue Dec 17 20:49:52 2013 +0200
+++ b/src/insdel.h	Thu Dec 19 10:47:01 2013 +0000
＠＠ -38,7 +38,7 ＠＠
 Charcount buffer_insert_string_1 (struct buffer *buf, Charbpos pos,
 				  const Ibyte *nonreloc, Lisp_Object reloc,
 				  Bytecount offset, Bytecount length,
-				  int flags);
+				  Charcount clen, int flags);
 Charcount buffer_insert_raw_string_1 (struct buffer *buf, Charbpos pos,
 				      const Ibyte *nonreloc,
 				      Bytecount length, int flags);
＠＠ -58,7 +58,7 ＠＠
    All of these can GC. */

 #define buffer_insert_string(buf, nonreloc, reloc, offset, length) \
-  buffer_insert_string_1 (buf, -1, nonreloc, reloc, offset, length, 0)
+  buffer_insert_string_1 (buf, -1, nonreloc, reloc, offset, length, -1, 0)
 #define buffer_insert_raw_string(buf, string, length) \
   buffer_insert_raw_string_1 (buf, -1, string, length, 0)
 #define buffer_insert_ascstring(buf, s) \
diff -r 94a6b8fbd56e src/lstream.c
--- a/src/lstream.c	Tue Dec 17 20:49:52 2013 +0200
+++ b/src/lstream.c	Thu Dec 19 10:47:01 2013 +0000
＠＠ -735,6 +735,11 ＠＠
   return Lstream_read_1 (lstr, data, size, 0);
 }

+Charcount
+Lstream_character_tell (Lstream *lstr)
+{
+  return lstr->imp->character_tell ? lstr->imp->character_tell (lstr) : -1;
+}

 /* Push back SIZE bytes of DATA onto the input queue.  The next call
    to Lstream_read() with the same size will read the same bytes back.
diff -r 94a6b8fbd56e src/lstream.h
--- a/src/lstream.h	Tue Dec 17 20:49:52 2013 +0200
+++ b/src/lstream.h	Thu Dec 19 10:47:01 2013 +0000
＠＠ -181,6 +181,8 ＠＠
      method.  If this method is not present, the result is determined
      by whether a rewind method is present. */
   int (*seekable_p) (Lstream *stream);
+
+  Charcount (*character_tell) (Lstream *stream);
   /* Perform any additional operations necessary to flush the
      data in this stream. */
   int (*flusher) (Lstream *stream);
＠＠ -297,8 +299,8 ＠＠
 int Lstream_fputc (Lstream *lstr, int c);
 int Lstream_fgetc (Lstream *lstr);
 void Lstream_fungetc (Lstream *lstr, int c);
-Bytecount Lstream_read (Lstream *lstr, void *data,
-				 Bytecount size);
+Bytecount Lstream_read (Lstream *lstr, void *data, Bytecount size);
+Charcount Lstream_character_tell (Lstream *);
 int Lstream_write (Lstream *lstr, const void *data,
 		   Bytecount size);
 int Lstream_was_blocked_p (Lstream *lstr);
diff -r 94a6b8fbd56e src/print.c
--- a/src/print.c	Tue Dec 17 20:49:52 2013 +0200
+++ b/src/print.c	Thu Dec 19 10:47:01 2013 +0000
＠＠ -514,7 +514,7 ＠＠

       buffer_insert_string_1 (XMARKER (function)->buffer,
 			      spoint, nonreloc, reloc, offset, len,
-			      0);
+			      -1, 0);
       Fset_marker (function, make_fixnum (spoint + cclen),
 		   Fmarker_buffer (function));
     }
diff -r 94a6b8fbd56e src/unicode.c
--- a/src/unicode.c	Tue Dec 17 20:49:52 2013 +0200
+++ b/src/unicode.c	Thu Dec 19 10:47:01 2013 +0000
＠＠ -1707,6 +1707,7 ＠＠
   unsigned char counter;
   unsigned char indicated_length;
   int seen_char;
+  Charcount characters_seen;
   /* encode */
   Lisp_Object current_charset;
   int current_char_boundary;
＠＠ -1988,6 +1989,18 ＠＠
                          write_error_characters_as_such);
 }

+static Charcount
+unicode_character_tell (struct coding_stream *str)
+{
+#warning "examine the coding character mode too"
+  if (CODING_STREAM_TYPE_DATA (str, unicode)->counter == 0)
+    {
+      return CODING_STREAM_TYPE_DATA (str, unicode)->characters_seen;
+    }
+  /* counter != 0: presumably mid-sequence, so no exact count -- TODO confirm. */
+  return -1;
+}
+
 static Bytecount
 unicode_convert (struct coding_stream *str, const UExtbyte *src,
 		 unsigned_char_dynarr *dst, Bytecount n)
＠＠ -2006,6 +2019,7 ＠＠
       unsigned char counter = data->counter;
       unsigned char indicated_length
         = data->indicated_length;
+      Charcount characters_seen = data->characters_seen;

       while (n--)
 	{
＠＠ -2020,12 +2034,15 ＠＠
                     {
                       /* ASCII. */
                       decode_unicode_char (c, dst, data, ignore_bom);
+                      characters_seen++;
                     }
                   else if (0 == (c & 0x40))
                     {
                       /* Highest bit set, second highest not--there's
                          something wrong. */
                       DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
+                      /* This is a character in the buffer. */
+                      characters_seen++;
                     }
                   else if (0 == (c & 0x20))
                     {
＠＠ -2050,7 +2067,7 ＠＠
                       /* We don't supports lengths longer than 4 in
                          external-format data. */
                       DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
-
+                      characters_seen++;
                     }
                 }
               else
＠＠ -2061,15 +2078,20 ＠＠
                       indicate_invalid_utf_8(indicated_length, 
                                              counter, 
                                              ch, dst, data, ignore_bom);
+                      /* These are characters our receiver will see, not
+                         actual characters we've seen in the input. */
+                      characters_seen += (indicated_length - counter);
                       if (c & 0x80)
                         {
                           DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
+                          characters_seen++;
                         }
                       else
                         {
                           /* The character just read is ASCII. Treat it as
                              such.  */
                           decode_unicode_char (c, dst, data, ignore_bom);
+                          characters_seen++;
                         }
                       ch = 0;
                       counter = 0;
＠＠ -2092,10 +2114,12 ＠＠
                                                      counter, 
                                                      ch, dst, data,
                                                      ignore_bom);
+                              characters_seen += (indicated_length - counter);
                             }
                           else
                             {
                               decode_unicode_char (ch, dst, data, ignore_bom);
+                              characters_seen++;
                             }
                           ch = 0;
                         }
＠＠ -2242,6 +2266,7 ＠＠
               indicate_invalid_utf_8(indicated_length, 
                                      counter, ch, dst, data, 
                                      ignore_bom);
+              characters_seen += (indicated_length - counter);
               break;

             case UNICODE_UTF_16:
＠＠ -2295,6 +2320,7 ＠＠

       data->counter = counter;
       data->indicated_length = indicated_length;
+      data->characters_seen = characters_seen;
     }
   else
     {
＠＠ -3177,6 +3203,8 ＠＠
   CODING_SYSTEM_HAS_METHOD (unicode, putprop);
   CODING_SYSTEM_HAS_METHOD (unicode, getprop);

+  CODING_SYSTEM_HAS_METHOD (unicode, character_tell);
+
   INITIALIZE_DETECTOR (utf_8);
   DETECTOR_HAS_METHOD (utf_8, detect);
   INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8);

-- 
‘Liston operated so fast that he once accidentally amputated an assistant’s
fingers along with a patient’s leg, […] The patient and the assistant both
died of sepsis, and a spectator reportedly died of shock, resulting in the
only known procedure with a 300% mortality.’ (Atul Gawande, NEJM, 2012)

_______________________________________________
XEmacs-Patches mailing list
XEmacs-Patches(a)xemacs.org
http://lists.xemacs.org/mailman/listinfo/xemacs-patches

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

2008

2007

2006

2005

2004

2003

[XEMACS PATCH] Have coding systems do some bytecount->charcount work