Stephen, I need to talk to you on the phone;
that way I'll have a reasonable chance of answering your questions before my
hands fall off.
Let me know your phone number and the hours when you are available.
BTW, I've posted my implementation so far.
/* ------------------------------ */
/* (E) For working with Eistrings */
/* ------------------------------ */
/* Note: Unfortunately, we have to write most of the Eistring functions as
   macros, because of the use of alloca().  The principle used below to assure
   no conflict in local variables is to prefix all local variables with "ei"
   plus a number, which should be unique among macros.  In practice, when
   finding a new number, use one greater than all existing numbers. */
/* Working storage for one Eistring.  Instances are declared with
   Eistring (name), which copies the_eistring_zero_init. */
typedef struct
{
  /* The text itself.  EI_ALLOC_ stores a '\0' at data[bytelen]. */
  void *data;
  /* Size in bytes of the block DATA points to, as recorded by the macros
     that alloca() it. */
  Bytecount max_size_allocated;
  /* Length of DATA in bytes, not counting the terminating zero. */
  Bytecount bytelen;
  /* Length of DATA in characters. */
  Charcount charlen;
  /* When nonzero, EI_ALLOC_ resizes DATA with xrealloc() rather than
     alloca(). */
  int mallocp;
  /* Result of the most recent eito_external(): converted data and its
     length. */
  void *extdata;
  Extcount extlen;
} Eistring_;
/* Template copied into every new Eistring.  Nothing in this section ever
   assigns to it. */
Eistring_ the_eistring_zero_init;
/* Declare an Eistring_ variable NAME initialized from the template above. */
#define Eistring(name) Eistring_ name = the_eistring_zero_init
/*   ----- Initialization -----   */
/* Make sure we can hold NEWBYTELEN bytes plus a zero terminator, and record
   NEWCHARLEN and NEWBYTELEN as the lengths of EI.  Preserve existing data as
   much as possible.

   The macro parameters are deliberately not called `charlen' and `bytelen':
   a parameter that shares its name with an Eistring_ member is substituted
   into every `(ei).member' reference in the body as well, which breaks the
   expansion.  EI is expanded several times, so it must be an lvalue without
   side effects; the two length arguments are evaluated exactly once. */
#define EI_ALLOC_(ei, newcharlen, newbytelen)                           \
do {                                                                    \
  int ei1oldeibytelen = (ei).bytelen;                                   \
  int ei1newbytelen = (newbytelen);                                     \
  int ei1newcharlen = (newcharlen);                                     \
                                                                        \
  (ei).charlen = ei1newcharlen;                                         \
  (ei).bytelen = ei1newbytelen;                                         \
                                                                        \
  /* Also come through here when there is no storage at all yet, so     \
     that even an empty Eistring ends up with a terminated buffer. */   \
  if (ei1oldeibytelen != ei1newbytelen || !(ei).data)                   \
    {                                                                   \
      if ((ei).mallocp)                                                 \
        /* xrealloc always preserves existing data as much as possible */ \
        (ei).data = xrealloc ((ei).data, ei1newbytelen + 1);            \
      else if (ei1newbytelen + 1 > (ei).max_size_allocated)             \
        {                                                               \
          /* We don't have realloc, so just use the existing allocation \
             if it's big enough; but remember how big it really is. */  \
          void *ei1oldeidata = (ei).data;                               \
          (ei).max_size_allocated = ei1newbytelen + 1;                  \
          (ei).data = alloca ((ei).max_size_allocated);                 \
          /* Nothing to carry over on the very first allocation. */     \
          if (ei1oldeibytelen > 0)                                      \
            memcpy ((ei).data, ei1oldeidata, ei1oldeibytelen);          \
        }                                                               \
      ((char *) (ei).data)[ei1newbytelen] = '\0';                       \
    }                                                                   \
} while (0)
/* Resize EI through EI_ALLOC_ to SRCCHARLEN characters / SRCBYTELEN bytes,
   then copy that many bytes from SRCDATA into it.

   The parameters are not called `data', `charlen' or `bytelen' because a
   parameter named like an Eistring_ member would also be substituted into
   the `(ei).data' and `(ei).bytelen' references below. */
#define EI_ALLOC_AND_COPY_(ei, srcdata, srccharlen, srcbytelen) \
do {                                                            \
  EI_ALLOC_ (ei, srccharlen, srcbytelen);                       \
  /* Skip the copy for an empty source; either pointer may be   \
     null in that case. */                                      \
  if ((ei).bytelen > 0)                                         \
    memcpy ((ei).data, (srcdata), (ei).bytelen);                \
} while (0)
/* eicpy_ei (ei, eicpy): make EI hold a copy of the Eistring EICPY -- its
   data and both of its lengths.  EICPY is evaluated once. */
#define eicpy_ei(ei, eicpy)                                     \
do {                                                            \
  Eistring_ *ei2from = &(eicpy);                                \
                                                                \
  EI_ALLOC_AND_COPY_ (ei,                                       \
                      ei2from->data,                            \
                      ei2from->charlen,                         \
                      ei2from->bytelen);                        \
} while (0)
/* eicpy_str (ei, lisp_string): set EI from the Lisp string LISP_STRING,
   taking the data pointer, character length and byte length from the
   XSTRING_* accessors.  LISP_STRING is evaluated once. */
#define eicpy_str(ei, lisp_string)                              \
do {                                                            \
  Lisp_Object ei3str = (lisp_string);                           \
                                                                \
  EI_ALLOC_AND_COPY_ (ei,                                       \
                      XSTRING_DATA (ei3str),                    \
                      XSTRING_CHAR_LENGTH (ei3str),             \
                      XSTRING_LENGTH (ei3str));                 \
} while (0)
/* EI_ASSERT_ASCII_ (ptr, len): when ERROR_CHECK_BUFPOS is defined, assert
   that each of the LEN bytes starting at PTR lies in the range 0x20 through
   0x7E; otherwise expand to nothing. */
#ifdef ERROR_CHECK_BUFPOS
#define EI_ASSERT_ASCII_(ptr, len)                                      \
do {                                                                    \
  int ei5;                                                              \
                                                                        \
  /* we use PTR and LEN multiply; we assume the callers have            \
     macro-protected them. */                                           \
  for (ei5 = 0; ei5 < (len); ei5++)                                     \
    assert ((ptr)[ei5] >= 0x20 && (ptr)[ei5] < 0x7F);                   \
} while (0)
#else
#define EI_ASSERT_ASCII_(ptr, len)
#endif
/* eicpy_c (ei, c_string): set EI from the zero-terminated C string
   C_STRING.  The string is first run through EI_ASSERT_ASCII_ and then
   handed to eicpy_ext () with Qbinary.  C_STRING is evaluated once. */
#define eicpy_c(ei, c_string)                   \
do {                                            \
  char *ei4 = (char *) (c_string);              \
                                                \
  EI_ASSERT_ASCII_ (ei4, strlen (ei4));         \
  /* Pass the local declared above; the name    \
     `ei4c' used here before does not exist. */ \
  eicpy_ext (ei, ei4, Qbinary);                 \
} while (0)
/* eicpy_c_len (ei, c_string, c_len): like eicpy_c, but the length of
   C_STRING is given explicitly as C_LEN instead of being found with
   strlen ().  Both arguments are evaluated once. */
#define eicpy_c_len(ei, c_string, c_len)                \
do {                                                    \
  char *ei6str = (char *) (c_string);                   \
  int ei6count = (c_len);                               \
                                                        \
  EI_ASSERT_ASCII_ (ei6str, ei6count);                  \
  eicpy_ext_len (ei, ei6str, ei6count, Qbinary);        \
} while (0)
/* eicpy_ext_len (ei, extdata, extlen, coding_system): set EI from EXTLEN
   bytes of external data at EXTDATA, converted with TO_INTERNAL_FORMAT
   under CODING_SYSTEM.  The conversion stores straight into EI's data and
   bytelen members; the allocation size and the character length are then
   derived from the result.
   NOTE(review): the previous data pointer is overwritten without looking at
   mallocp -- confirm this is never used on a malloc()ed Eistring. */
#define eicpy_ext_len(ei, extdata, extlen, coding_system)               \
do {                                                                    \
  char *ei7src = (char *) (extdata);                                    \
  int ei7srclen = (extlen);                                             \
                                                                        \
  TO_INTERNAL_FORMAT (DATA, (ei7src, ei7srclen),                        \
                      ALLOCA, ((ei).data, (ei).bytelen),                \
                      coding_system);                                   \
  (ei).charlen = bytecount_to_charcount ((ei).data, (ei).bytelen);      \
  (ei).max_size_allocated = (ei).bytelen + 1;                           \
} while (0)
/* eicpy_ext (ei, extdata, coding_system): like eicpy_ext_len, for
   zero-terminated external data; the length is found with strlen ().
   EXTDATA is evaluated once. */
#define eicpy_ext(ei, extdata, coding_system)                           \
do {                                                                    \
  char *ei8src = (char *) (extdata);                                    \
  eicpy_ext_len (ei, ei8src, strlen (ei8src), coding_system);           \
} while (0)
/*
   eicpy_str_off (eistr, lisp_string, charpos, charlen):
        ... from a section of a Lisp_Object string
   eicpy_str_off_byte (eistr, lisp_string, bytepos, bytelen):
        ... from a section of a Lisp_Object string, with offset and length
        specified in bytes rather than chars
   eicpy_buf (eistr, lisp_buf, charpos, charlen):
        ... from a Lisp_Object buffer
   eicpy_buf_byte (eistr, lisp_buf, bytepos, bytelen):
        ... from a Lisp_Object buffer, with offset and length specified in
        bytes rather than chars
   eicpy_raw (eistr, intdata, intlen, intfmt):
        ... from raw internal-format data in the specified format
   eicpy_lstream (eistr, lstream):
        ... from an lstream; reads data till eof.  Data must be in default
        internal format; otherwise, interpose a decoding lstream.
*/
/*   ----- Getting the data out of the Eistring -----   */
/* eirawdata (ei): the data member of EI (a void *), without any copying. */
#define eirawdata(ei) ((ei).data)
/*
   eimake_string (eistr):
   eimake_string_sect (eistr, charpos, charlen):
   eimake_string_sect_byte (eistr, bytepos, bytelen):
   eicpyout_raw_alloca (eistr, intfmt, intlen_out):
   eicpyout_raw_malloc (eistr, intfmt, intlen_out):
   eicpyout_c_alloca (eistr):
   eicpyout_c_malloc (eistr):
   eicpyout_c_len_alloca (eistr, len_out):
   eicpyout_c_len_malloc (eistr, len_out):
*/
/*   ----- Moving to the heap -----   */
/*
   eito_malloc (eistr):
   eifree (eistr):
   eito_alloca (eistr):
*/
/*   ----- Retrieving the length -----   */
/* eilen (ei): length of EI in characters (its charlen member). */
#define eilen(ei) ((ei).charlen)
/* eilen_byte (ei): length of EI in bytes (its bytelen member). */
#define eilen_byte(ei) ((ei).bytelen)
/*   ----- Working with positions -----   */
/* eicharpos_to_bytepos (ei, charpos): pass EI's data and CHARPOS to
   charcount_to_bytecount () and yield its result. */
#define eicharpos_to_bytepos(ei, charpos) \
  charcount_to_bytecount ((ei).data, charpos)
/* eibytepos_to_charpos (ei, bytepos): pass EI's data and BYTEPOS to
   bytecount_to_charcount () and yield its result. */
#define eibytepos_to_charpos(ei, bytepos) \
  bytecount_to_charcount ((ei).data, bytepos)
/*   ----- Getting the character at a position -----   */
/* eiref (ei, charpos): result of charptr_emchar_n () on EI's data with
   the character position CHARPOS. */
#define eiref(ei, charpos) charptr_emchar_n ((ei).data, charpos)
/* eiref_byte (ei, bytepos): result of charptr_emchar () on the address
   BYTEPOS bytes into EI's data. */
#define eiref_byte(ei, bytepos) \
  charptr_emchar ((char *) ((ei).data) + (bytepos))
/*   ----- Concatenation -----   */
/* eicat_ei (ei, ei2): append the contents of the Eistring EI2 to EI.  EI is
   grown through EI_ALLOC_ and EI2's bytes are copied in after EI's old
   contents.  EI2 is evaluated once; EI is expanded several times. */
#define eicat_ei(ei, ei2)                                       \
do {                                                            \
  Eistring_ *ei9 = &(ei2);                                      \
  int ei9oldeibytelen = (ei).bytelen;                           \
  /* Latch the source length now: when EI2 is EI itself,        \
     EI_ALLOC_ is about to change it. */                        \
  int ei9srcbytelen = ei9->bytelen;                             \
                                                                \
  EI_ALLOC_ (ei, (ei).charlen + ei9->charlen,                   \
             (ei).bytelen + ei9srcbytelen);                     \
  /* An empty source may have no data pointer at all. */        \
  if (ei9srcbytelen > 0)                                        \
    memcpy ((char *) (ei).data + ei9oldeibytelen, ei9->data,    \
            ei9srcbytelen);                                     \
} while (0)
/* eicat_c (ei, c_string): append the C string C_STRING to EI, by loading
   it into a temporary Eistring with eicpy_c and appending that with
   eicat_ei. */
#define eicat_c(ei, c_string)                                   \
do {                                                            \
  Eistring_ ei10tmp = the_eistring_zero_init;                   \
                                                                \
  eicpy_c (ei10tmp, c_string);                                  \
  eicat_ei (ei, ei10tmp);                                       \
} while (0)
/*   ----- Replacement -----   */
/*
   eisub_* (eistr, charoff, charlen, ...):
   eisub_*_byte (eistr, byteoff, bytelen, ...):
        Replace a section of the Eistring.
   eisub_ei (eistr, charoff, charlen, eistr2):
   eisub_ei_byte (eistr, byteoff, bytelen, eistr2):
   eisub_c (eistr, charoff, charlen, c_string):
   eisub_c_byte (eistr, byteoff, bytelen, c_string):
*/
/*   ----- Converting to an external format -----   */
/* eito_external (ei, coding_system): run EI's data and byte length through
   TO_EXTERNAL_FORMAT with CODING_SYSTEM, naming EI's extdata and extlen
   members as the destination.  Read the result with eiextdata () and
   eiextlen ().
   NOTE(review): the ALLOCA sink presumably places the result in alloca()ed
   storage -- confirm against the definition of TO_EXTERNAL_FORMAT. */
#define eito_external(ei, coding_system)   \
do {        \
  TO_EXTERNAL_FORMAT (DATA, ((ei).data, (ei).bytelen),  \
        ALLOCA, ((ei).extdata, (ei).extlen), \
        coding_system);    \
} while (0)
/* eiextdata (ei): the extdata member of EI, which eito_external () fills
   in.  This macro performs no conversion itself. */
#define eiextdata(ei) ((ei).extdata)
/* eiextlen (ei): the extlen member of EI, set together with extdata. */
#define eiextlen(ei) ((ei).extlen)
/*   ----- Searching in the Eistring for a character -----   */
/*
   eichr (eistr, chr):
   eichr_byte (eistr, chr):
   eichr_off (eistr, chr, charpos):
   eichr_off_byte (eistr, chr, bytepos):
   eirchr (eistr, chr):
   eirchr_byte (eistr, chr):
   eirchr_off (eistr, chr, charpos):
   eirchr_off_byte (eistr, chr, bytepos):
*/
/*   ----- Searching in the Eistring for a string -----   */
/*
   eistr_ei (eistr, eistr2):
   eistr_ei_byte (eistr, eistr2):
   eistr_ei_off (eistr, eistr2, charpos):
   eistr_ei_off_byte (eistr, eistr2, bytepos):
   eirstr_ei (eistr, eistr2):
   eirstr_ei_byte (eistr, eistr2):
   eirstr_ei_off (eistr, eistr2, charpos):
   eirstr_ei_off_byte (eistr, eistr2, bytepos):
   eistr_c (eistr, c_string):
   eistr_c_byte (eistr, c_string):
   eistr_c_off (eistr, c_string, charpos):
   eistr_c_off_byte (eistr, c_string, bytepos):
   eirstr_c (eistr, c_string):
   eirstr_c_byte (eistr, c_string):
   eirstr_c_off (eistr, c_string, charpos):
   eirstr_c_off_byte (eistr, c_string, bytepos):
*/
/*   ----- Comparison -----   */
/*
   eicmp_* (eistr, ...):
   eicmp_off_* (eistr, charoff, charlen, ...):
   eicmp_off_*_byte (eistr, byteoff, bytelen, ...):
   eicasecmp_* (eistr, ...):
   eicasecmp_off_* (eistr, charoff, charlen, ...):
   eicasecmp_off_*_byte (eistr, byteoff, bytelen, ...):
        Compare the Eistring with the other data.  Return value same as
        from strcmp.
   eicmp_ei (eistr, eistr2):
   eicmp_off_ei (eistr, charoff, charlen, eistr2):
   eicmp_off_ei_byte (eistr, byteoff, bytelen, eistr2):
   eicasecmp_ei (eistr, eistr2):
   eicasecmp_off_ei (eistr, charoff, charlen, eistr2):
   eicasecmp_off_ei_byte (eistr, byteoff, bytelen, eistr2):
   eicmp_c (eistr, c_string):
   eicmp_off_c (eistr, charoff, charlen, c_string):
   eicmp_off_c_byte (eistr, byteoff, bytelen, c_string):
   eicasecmp_c (eistr, c_string):
   eicasecmp_off_c (eistr, charoff, charlen, c_string):
   eicasecmp_off_c_byte (eistr, byteoff, bytelen, c_string):
*/
/*   ----- Case-changing the Eistring -----   */
/* Helper behind EI_CASECHANGE_; only declared here.  EI_CASECHANGE_ passes
   the Eistring's data and byte length as OLDDATA and LEN and a scratch
   buffer as NEWDATA.  It treats a nonzero return value as the byte length
   of the text now in NEWDATA, and a zero return value as "leave the
   Eistring as it is".  DOWNP is 1 when called through eilwr and 0 when
   called through eiupr. */
int eistr_casefiddle_1 (Bufbyte *olddata, Bytecount len, Bufbyte *newdata,
   int downp);
/* EI_CASECHANGE_ (ei, downp): case-convert EI.  A scratch buffer with room
   for charlen * MAX_EMCHAR_LEN + 1 bytes is alloca()ed and handed to
   eistr_casefiddle_1 together with EI's current data.  If that returns
   nonzero, the scratch buffer becomes EI's data and the return value its
   new byte length; if it returns zero, EI is left untouched. */
#define EI_CASECHANGE_(ei, downp)                                       \
do {                                                                    \
  int ei11bufsize = (ei).charlen * MAX_EMCHAR_LEN + 1;                  \
  Bufbyte *ei11buf = alloca_array (Bufbyte, ei11bufsize);               \
  int ei11outlen =                                                      \
    eistr_casefiddle_1 ((ei).data, (ei).bytelen, ei11buf, downp);       \
                                                                        \
  if (ei11outlen != 0)                                                  \
    {                                                                   \
      (ei).data = ei11buf;                                              \
      (ei).bytelen = ei11outlen;                                        \
      (ei).max_size_allocated = ei11bufsize;                            \
      /* charlen is the same. */                                        \
    }                                                                   \
} while (0)
/* eilwr (ei): EI_CASECHANGE_ with DOWNP = 1. */
#define eilwr(ei) EI_CASECHANGE_ (ei, 1)
/* eiupr (ei): EI_CASECHANGE_ with DOWNP = 0. */
#define eiupr(ei) EI_CASECHANGE_ (ei, 0)
"Stephen J. Turnbull" wrote:
 Very long.  But if you get the idea that I am dumb, and sufficiently
 so as to discount your skills, I'd better be _very_ explicit.
 >>>>> "Ben" == Ben Wing <ben(a)666.com> writes:
 "Stephen J. Turnbull" wrote:
 >   Eistring (outline);
 >   eicpy_c (outline, "\033$B");
 >   eicpy_c (outline, "<+J,$G$7$g$&!#");  /* this should come from an
 >                                            lstream or something     */
 >   eicpy_c (outline, "\033(B\n");
     Ben> Eistrings are supposed to hold Mule-internal data inside
     Ben> them.  They're definitely not supposed to hold random string
     Ben> stuff in them.  So I'm not quite sure of what you're trying
     Ben> to do.  Could you explain it in words?  Then I'll tell you
     Ben> how you could do it best, using Eistrings and/or other
     Ben> interfaces.
 What I want to do (in this example), basically, is to implement
 "encode-coding-region".  (I stated that, explicitly but in the wrong
 place, in the message you responded to.)  I have some Japanese text, I
 want to translate it to external format ISO-2022-JP.
 This requires translating the Japanese from internal encoding to 7-bit
 JIS X 0208 (presumably not the responsibility of an Eistring), then
 adding appropriate escape sequences (which are octets, not in any
 character set, if you take the ISO 2022 document seriously) before and
 after the Japanese (and the newline of course is ASCII, not JIS).
     Ben> If you want to copy or concatenate JIS format text, you need
     Ben> to mark it as JIS.  For example, you could imagine
     Ben> Eistring (outline);
     Ben> eicpy_ext (outline, "\033$B", Qjapanese_jis_0208);
 This is not at all what I have in mind; that string is the ISO-2022
 registered escape sequence for "designating JIS X 0208 to G0 and
 invoking G0 to GL."  I assumed you'd recognize one of those; if you
 don't, why should I just "trust you" to design an interface that will
 be used to implement handlers for them?  It is not itself JIS X 0208,
 and in fact that usage should cause XEmacs to barf() and abort().
     Ben> eicat_ext (outline, "<+J,$G$7$g$&!#", Qjapanese_jis_0208);
 I don't know what this code means.  That's not Japanese, that's a
 stream of octets for squirting out on some wire.  Ie, Qbinary.  I have
 no guess at what you think I'm thinking, here.
     Ben> eicat_ext (outline, "\033(B\n", Qjapanese_jis_0208);
     Ben> But the real problem here is that the data conversion that
     Ben> goes on inside the Eistrings is EXTREMELY SIMPLE and
     Ben> stateless.  In fact, it just uses TO_INTERNAL_FORMAT() and
     Ben> TO_EXTERNAL_FORMAT() for all conversions, which assume a
     Ben> complete encoding block.
 I was assuming (and said so) that the conversion to desired format of
 the second string was already done (eg by a "decoding lstream"); I
 just wanted to add some literals (the kanji-in and kanji-out escape
 sequences).
     Ben> In your example above, you'd need to concatenate the Japanese
     Ben> strings together separately, using some other mechanism, and
     Ben> then feed the whole thing to eicat_ext(), so that it would
     Ben> correctly encode it into Japanese.
 No, there's no Japanese there ("Japanese" doesn't really exist in a
 coherent way at the level Eistrings work at, IMO) ...
     Ben> Or alternatively, you might be wanting to feed the bytes
     Ben> directly into the string?
 ... as Lucy van Pelt would say, "THAT'S IT!!!!"
     Ben> Then you can go ahead, but use Qbinary, e.g.
     Ben> Eistring (outline);
     Ben> eicpy_ext (outline, "\033$B", Qbinary);
     Ben> eicat_ext (outline, "<+J,$G$7$g$&!#", Qbinary);
     Ben> eicat_ext (outline, "\033(B\n", Qbinary);
 Um, that's exactly what I wrote in the first place.  Except that I
 miswrote "cpy" for "cat".  And used your "convenience function".  Not
 very convenient, after all, since my usage is illegal, it seems.  YMMV.
 (And if ESC is arguably ASCII, the related usage for inserting the C1
 (== 8-bit control) characters SS3 and SS4 (used as prefixes for JIS X
 0212 and 0201 characters in EUC-JP) would certainly be forbidden.)
     Ben> but from the Eistring's perspective, you don't have Japanese
     Ben> in here; you have gobbledygook which could be decoded into
     Ben> Japanese.  But that's not part of the design of the Eistring
     Ben> -- it's supposed to entirely keep internally-formatted text
     Ben> in it, and provide routines to manipulate this
     Ben> internally-formatted text.
 Right!  Yes!  Exactly!  For example, implementing the Lisp functions
 that a MIME-capable mailer would use.  Or the lstream functions that
 those functions would call.  Or is that not what you have in mind?
     Ben> It seems that you're trying to rather ad-hoc-ish extend the
     Ben> design to do something else, and then you start complaining
     Ben> when there are problems!
 I'm trying to use the interface as my best guess suggests you intend
 it to be used.
     Ben> IMHO, my design is extremely well thought out.
 I've noticed.  But-but-but ...
     Ben> Take a look at the example function I just posted to Hrvoje
     Ben> showing how this all should be used.
 ... I did.
 What is the specification of eiextdata()?  I'd guess it just defaults
 the coding_system argument for eito_external().  But I can't think of
 a sensible way to do it, not even in the usage in your example.  For
 Unix, it has to involve getting the current filesystem coding-system,
 but that need not be correct (it would be quite possible to have a
 directory structure .../dir/, .../dir/JP/, .../dir/TW/ where the
 filenames under JP are encoded in EUC-JP and those under TW are in
 Big5).  And I'm not even sure that Win32 guarantees such a filesystem
 can't be constructed (say, using the Linux vfat filesystem---I know
 some users who would do that, and I sympathize with their goals,
 although not that implementation, they should be using Unicode ;),
 maybe even under Windows 9x by dual-booting Japanese and Taiwanese.
 And I assume that the documentation for mswindows_get_files specifies
 that the programmer _must_ ensure that that "dirfile" contains only
 characters that Win32 can handle (we already have stock XEmacsen that
 can[1] handle all 2^31 UCS-4 characters, you know---AFAIK, Win32 does
 not, at best it handles UTF-16), since presumably Eistrings don't know
 about that?
 The point not being that Eistrings should nursemaid Win32 APIs; of
 course they shouldn't.  Rather that it's still very easy to write code
 that may be "mule-correct" and "mule-safe" as those terms are defined
 in Winglish, but is quite capable of aiding or abetting undefined
 behavior (I've given two plausible examples in your sample function).
 Which is neither "correct" nor "safe", at least not in plain English.
 So can you define what "correct" and "safe" mean so that we can decide
 where the interface is going to help those goals by making intuitive
 distinctions between what we do and don't need to worry about?  (I
 think that "checking what's in dirfile" is easily seen to be not
 Eistring's job.  Good design!)  Or on the other hand, is it going to
 make it easy to write unobviously buggy code?  (I suspect that is true
 of eiextdata(), per the example above.  Doubleplusungood!)
 Footnotes:
 [1]  Trusting Morioka's off-hand remark.  If not all of them, at least
 say 2^24 or 2^28 of them.  Lots more than the 17*2^16 in UTF-16.
 --
 University of Tsukuba                Tennodai 1-1-1 Tsukuba 305-8573 JAPAN
 Institute of Policy and Planning Sciences       Tel/fax: +81 (298) 53-5091
 _________________  _________________  _________________  _________________
 What are those straight lines for?  "XEmacs rules." 
--
Ben
In order to save my hands, I am cutting back on my mail.  I also write
as succinctly as possible -- please don't be offended.  If you send me
mail, you _will_ get a response, but please be patient, especially for
XEmacs-related mail.  If you need an immediate response and it is not
apparent in your message, please say so.  Thanks for your understanding.
See also http://www.666.com/ben/typing.html.