stephen, i need to talk to you on the phone
that way i'll have a reasonable chance of answering your ?'s before my hands
fall off.
let me know your # and your available hours.
btw i've posted my implementation so far.
/* ------------------------------ */
/* (E) For working with Eistrings */
/* ------------------------------ */
/* Note: Unfortunately, we have to write most of the Eistring functions as
macros, because of the use of alloca(). The principle used below to assure
no conflict in local variables is to prefix all local variables with "ei"
plus a number, which should be unique among macros. In practice, when
finding a new number, use one greater than all existing numbers. */
/* Backing record for an Eistring.  Variables of this type are meant to be
declared with the Eistring() macro below so that they start out zeroed;
all access goes through the ei* macros that follow. */
typedef struct
{
/* Text buffer.  EI_ALLOC_ keeps a zero terminator at data[bytelen]. */
void *data;
/* Size recorded for the alloca()ed buffer behind `data'; EI_ALLOC_ only
consults it when `mallocp' is zero. */
Bytecount max_size_allocated;
/* Length of the text in bytes, not counting the terminator. */
Bytecount bytelen;
/* Length of the text in characters. */
Charcount charlen;
/* Nonzero: EI_ALLOC_ resizes `data' with xrealloc() rather than alloca(). */
int mallocp;
/* Result buffer and length filled in by eito_external(). */
void *extdata;
Extcount extlen;
} Eistring_;
/* File-scope object with no initializer, hence all-zero; used as the
initial value for every new Eistring.
NOTE(review): if this fragment lives in a header, this is a definition
in every includer -- confirm whether it should be `extern' here. */
Eistring_ the_eistring_zero_init;
/* Declare an Eistring variable NAME, initialized by copying the zero
template.  The initializer is not a constant expression, so in C this
form is only valid for block-scope variables. */
#define Eistring(name) Eistring_ name = the_eistring_zero_init
/* ----- Initialization ----- */
/* EI_ALLOC_ (ei, new_charlen, new_bytelen):
   Make sure EI can hold NEW_BYTELEN bytes plus a zero terminator, and
   record NEW_CHARLEN / NEW_BYTELEN as its new lengths.  Existing data is
   preserved as much as possible.

   The parameter names deliberately differ from every Eistring_ member
   name.  The preprocessor substitutes parameters everywhere in the body,
   including after `.', so a parameter called `bytelen' would turn
   `(ei).bytelen' into `(ei).<argument text>'.

   EI is expanded many times and must be a side-effect-free lvalue.  The
   two length arguments are evaluated exactly once, before EI is
   modified, so they may refer to EI's current lengths. */
#define EI_ALLOC_(ei, new_charlen, new_bytelen)                         \
do {                                                                    \
  int ei1oldeibytelen = (ei).bytelen;                                   \
  int ei1newbytelen = (new_bytelen);                                    \
  int ei1newcharlen = (new_charlen);                                    \
                                                                        \
  (ei).charlen = ei1newcharlen;                                         \
  (ei).bytelen = ei1newbytelen;                                         \
                                                                        \
  /* Also take this path for a never-allocated Eistring, so that even   \
     an empty string ends up with a valid, terminated buffer. */        \
  if (ei1oldeibytelen != (ei).bytelen || !(ei).data)                    \
    {                                                                   \
      if ((ei).mallocp)                                                 \
        {                                                               \
          /* xrealloc always preserves existing data as much as         \
             possible */                                                \
          (ei).data = xrealloc ((ei).data, (ei).bytelen + 1);           \
        }                                                               \
      else if ((ei).bytelen + 1 > (ei).max_size_allocated)              \
        {                                                               \
          /* We don't have realloc, so just use the existing allocation \
             if it's big enough; but remember how big it really is. */  \
          void *ei1oldeidata = (ei).data;                               \
          (ei).max_size_allocated = (ei).bytelen + 1;                   \
          (ei).data = alloca ((ei).max_size_allocated);                 \
          /* The old buffer may be NULL on first use; never hand a      \
             null pointer to memcpy, even with a zero length. */        \
          if (ei1oldeibytelen > 0)                                      \
            memcpy ((ei).data, ei1oldeidata, ei1oldeibytelen);          \
        }                                                               \
      ((char *) (ei).data)[(ei).bytelen] = '\0';                        \
    }                                                                   \
} while (0)
/* EI_ALLOC_AND_COPY_ (ei, src_data, src_charlen, src_bytelen):
   Resize EI via EI_ALLOC_ to SRC_CHARLEN characters / SRC_BYTELEN bytes,
   then copy that many bytes from SRC_DATA into it.

   The parameter names must not match any Eistring_ member name:
   parameters are substituted even after `.', so a parameter named `data'
   would corrupt `(ei).data'.  SRC_DATA is evaluated after the resize. */
#define EI_ALLOC_AND_COPY_(ei, src_data, src_charlen, src_bytelen)     \
do {                                                                    \
  EI_ALLOC_ (ei, src_charlen, src_bytelen);                             \
  /* Nothing to copy for an empty source, whose pointer may be NULL. */ \
  if ((ei).bytelen > 0)                                                 \
    memcpy ((ei).data, (src_data), (ei).bytelen);                       \
} while (0)
/* eicpy_ei (ei, eicpy): set EI from another Eistring, EICPY, by handing
EICPY's data pointer, character length and byte length to
EI_ALLOC_AND_COPY_.  EICPY must be an lvalue, since its address is taken;
it is evaluated once. */
#define eicpy_ei(ei, eicpy) \
do { \
Eistring_ *ei2 = &(eicpy); \
EI_ALLOC_AND_COPY_ (ei, ei2->data, ei2->charlen, ei2->bytelen); \
} while (0)
/* eicpy_str (ei, lisp_string): set EI from a Lisp string, using
XSTRING_DATA for the bytes, XSTRING_CHAR_LENGTH for the character count
and XSTRING_LENGTH for the byte count.  LISP_STRING is evaluated once,
into the local ei3. */
#define eicpy_str(ei, lisp_string) \
do { \
Lisp_Object ei3 = (lisp_string); \
EI_ALLOC_AND_COPY_ (ei, XSTRING_DATA (ei3), XSTRING_CHAR_LENGTH (ei3), \
XSTRING_LENGTH (ei3)); \
} while (0)
/* EI_ASSERT_ASCII_ (ptr, len):
   When ERROR_CHECK_BUFPOS is defined, assert that each of the LEN bytes
   starting at PTR lies in the range 0x20 .. 0x7E.  Otherwise expands to
   nothing. */
#ifdef ERROR_CHECK_BUFPOS
#define EI_ASSERT_ASCII_(ptr, len)                                      \
do {                                                                    \
  int ei5;                                                              \
  /* Evaluate LEN only once; callers pass things like strlen (p). */    \
  int ei5len = (len);                                                   \
                                                                        \
  /* PTR is still used repeatedly; we assume the callers have           \
     macro-protected it. */                                             \
  for (ei5 = 0; ei5 < ei5len; ei5++)                                    \
    assert ((ptr)[ei5] >= 0x20 && (ptr)[ei5] < 0x7F);                   \
} while (0)
#else
#define EI_ASSERT_ASCII_(ptr, len)
#endif
/* eicpy_c (ei, c_string):
   Set EI from a zero-terminated C string.  The string is checked with
   EI_ASSERT_ASCII_ and then passed to eicpy_ext with Qbinary as the
   coding system.  C_STRING is evaluated once, into the local ei4. */
#define eicpy_c(ei, c_string)                                           \
do {                                                                    \
  char *ei4 = (char *) (c_string);                                      \
                                                                        \
  EI_ASSERT_ASCII_ (ei4, strlen (ei4));                                 \
  eicpy_ext (ei, ei4, Qbinary);                                         \
} while (0)
/* eicpy_c_len (ei, c_string, c_len): like eicpy_c, but with an explicit
length C_LEN instead of relying on a zero terminator.  The bytes are
checked with EI_ASSERT_ASCII_ and passed to eicpy_ext_len with Qbinary.
Both arguments are evaluated once. */
#define eicpy_c_len(ei, c_string, c_len) \
do { \
char *ei6 = (char *) (c_string); \
int ei6len = (c_len); \
\
EI_ASSERT_ASCII_ (ei6, ei6len); \
eicpy_ext_len (ei, ei6, ei6len, Qbinary); \
} while (0)
/* eicpy_ext_len (ei, extdata, extlen, coding_system): set EI from EXTLEN
bytes of external data at EXTDATA, converted by TO_INTERNAL_FORMAT with
CODING_SYSTEM.  The result is delivered through the ALLOCA sink straight
into EI's data/bytelen; max_size_allocated and charlen are then
recomputed from the new byte length.
NOTE(review): mallocp is not consulted here, so a heap-backed EI would
have its data pointer replaced by alloca()ed storage -- confirm intended.
NOTE(review): max_size_allocated = bytelen + 1 presumes the sink reserves
room for a terminator -- verify against TO_INTERNAL_FORMAT. */
#define eicpy_ext_len(ei, extdata, extlen, coding_system) \
do { \
char *ei7 = (char *) (extdata); \
int ei7len = (extlen); \
\
TO_INTERNAL_FORMAT (DATA, (ei7, ei7len), \
ALLOCA, ((ei).data, (ei).bytelen), \
coding_system); \
(ei).max_size_allocated = (ei).bytelen + 1; \
(ei).charlen = bytecount_to_charcount ((ei).data, (ei).bytelen); \
} while (0)
/* eicpy_ext (ei, extdata, coding_system): like eicpy_ext_len, with the
length taken from strlen (EXTDATA); the external data must therefore be
zero-terminated.  EXTDATA is evaluated once, into the local ei8. */
#define eicpy_ext(ei, extdata, coding_system) \
do { \
char *ei8 = (char *) (extdata); \
\
eicpy_ext_len (ei, ei8, strlen (ei8), coding_system); \
} while (0)
/*
eicpy_str_off (eistr, lisp_string, charpos, charlen):
... from a section of a Lisp_Object string
eicpy_str_off_byte (eistr, lisp_string, bytepos, bytelen):
... from a section of a Lisp_Object string, with offset and length
specified in bytes rather than chars
eicpy_buf (eistr, lisp_buf, charpos, charlen):
... from a Lisp_Object buffer
eicpy_buf_byte (eistr, lisp_buf, bytepos, bytelen):
... from a Lisp_Object buffer, with offset and length specified in
bytes rather than chars
eicpy_raw (eistr, intdata, intlen, intfmt):
... from raw internal-format data in the specified format
eicpy_lstream (eistr, lstream):
... from an lstream; reads data till eof. Data must be in default
internal format; otherwise, interpose a decoding lstream.
*/
/* ----- Getting the data out of the Eistring ----- */
/* eirawdata (ei): the Eistring's `data' member (a void *). */
#define eirawdata(ei) ((ei).data)
/*
eimake_string (eistr):
eimake_string_sect (eistr, charpos, charlen):
eimake_string_sect_byte (eistr, bytepos, bytelen):
eicpyout_raw_alloca (eistr, intfmt, intlen_out):
eicpyout_raw_malloc (eistr, intfmt, intlen_out):
eicpyout_c_alloca (eistr):
eicpyout_c_malloc (eistr):
eicpyout_c_len_alloca (eistr, len_out):
eicpyout_c_len_malloc (eistr, len_out):
*/
/* ----- Moving to the heap ----- */
/*
eito_malloc (eistr):
eifree (eistr):
eito_alloca (eistr):
*/
/* ----- Retrieving the length ----- */
/* eilen (ei): length in characters (the `charlen' member).
eilen_byte (ei): length in bytes (the `bytelen' member). */
#define eilen(ei) ((ei).charlen)
#define eilen_byte(ei) ((ei).bytelen)
/* ----- Working with positions ----- */
/* Convert between character and byte positions within EI by applying
charcount_to_bytecount / bytecount_to_charcount to EI's data.  No range
check against EI's length is made here. */
#define eicharpos_to_bytepos(ei, charpos) \
charcount_to_bytecount ((ei).data, charpos)
#define eibytepos_to_charpos(ei, bytepos) \
bytecount_to_charcount ((ei).data, bytepos)
/* ----- Getting the character at a position ----- */
/* eiref (ei, charpos): character at character position CHARPOS, obtained
with charptr_emchar_n on EI's data.
eiref_byte (ei, bytepos): character starting at byte offset BYTEPOS,
obtained with charptr_emchar.
Neither macro checks the position against EI's length. */
#define eiref(ei, charpos) charptr_emchar_n ((ei).data, charpos)
#define eiref_byte(ei, bytepos) \
charptr_emchar ((char *) ((ei).data) + (bytepos))
/* ----- Concatenation ----- */
/* eicat_ei (ei, ei2):
   Append the contents of the Eistring EI2 to EI.  EI2 must be an lvalue,
   since its address is taken.

   Both byte lengths are captured before EI_ALLOC_ runs.  When EI and EI2
   are the same object the resize changes the length seen through ei9, so
   re-reading it afterwards would copy twice as much as intended. */
#define eicat_ei(ei, ei2)                                               \
do {                                                                    \
  Eistring_ *ei9 = &(ei2);                                              \
  int ei9oldeibytelen = (ei).bytelen;                                   \
  int ei9addbytelen = ei9->bytelen;                                     \
  EI_ALLOC_ (ei, (ei).charlen + ei9->charlen,                           \
             (ei).bytelen + ei9addbytelen);                             \
  memcpy ((char *) (ei).data + ei9oldeibytelen, ei9->data,              \
          ei9addbytelen);                                               \
} while (0)
/* eicat_c (ei, c_string): append a zero-terminated C string to EI.  A
temporary Eistring, ei10, is filled from C_STRING with eicpy_c and then
appended with eicat_ei, so the restrictions of eicpy_c apply to
C_STRING. */
#define eicat_c(ei, c_string) \
do { \
Eistring (ei10); \
\
eicpy_c (ei10, c_string); \
eicat_ei (ei, ei10); \
} while (0)
/* ----- Replacement ----- */
/*
eisub_* (eistr, charoff, charlen, ...):
eisub_*_byte (eistr, byteoff, bytelen, ...):
Replace a section of the Eistring.
eisub_ei (eistr, charoff, charlen, eistr2):
eisub_ei_byte (eistr, byteoff, bytelen, eistr2):
eisub_c (eistr, charoff, charlen, c_string):
eisub_c_byte (eistr, byteoff, bytelen, c_string):
*/
/* ----- Converting to an external format ----- */
/* eito_external (ei, coding_system): run EI's data (bytelen bytes)
through TO_EXTERNAL_FORMAT with CODING_SYSTEM, storing the result via the
ALLOCA sink in EI's extdata/extlen members.  The internal text in `data'
is not modified by this macro.  Read the result back with eiextdata() and
eiextlen(). */
#define eito_external(ei, coding_system) \
do { \
TO_EXTERNAL_FORMAT (DATA, ((ei).data, (ei).bytelen), \
ALLOCA, ((ei).extdata, (ei).extlen), \
coding_system); \
} while (0)
/* eiextdata (ei) / eiextlen (ei): the `extdata' and `extlen' members,
which eito_external() fills in.  These are plain member reads; they
perform no conversion themselves. */
#define eiextdata(ei) ((ei).extdata)
#define eiextlen(ei) ((ei).extlen)
/* ----- Searching in the Eistring for a character ----- */
/*
eichr (eistr, chr):
eichr_byte (eistr, chr):
eichr_off (eistr, chr, charpos):
eichr_off_byte (eistr, chr, bytepos):
eirchr (eistr, chr):
eirchr_byte (eistr, chr):
eirchr_off (eistr, chr, charpos):
eirchr_off_byte (eistr, chr, bytepos):
*/
/* ----- Searching in the Eistring for a string ----- */
/*
eistr_ei (eistr, eistr2):
eistr_ei_byte (eistr, eistr2):
eistr_ei_off (eistr, eistr2, charpos):
eistr_ei_off_byte (eistr, eistr2, bytepos):
eirstr_ei (eistr, eistr2):
eirstr_ei_byte (eistr, eistr2):
eirstr_ei_off (eistr, eistr2, charpos):
eirstr_ei_off_byte (eistr, eistr2, bytepos):
eistr_c (eistr, c_string):
eistr_c_byte (eistr, c_string):
eistr_c_off (eistr, c_string, charpos):
eistr_c_off_byte (eistr, c_string, bytepos):
eirstr_c (eistr, c_string):
eirstr_c_byte (eistr, c_string):
eirstr_c_off (eistr, c_string, charpos):
eirstr_c_off_byte (eistr, c_string, bytepos):
*/
/* ----- Comparison ----- */
/*
eicmp_* (eistr, ...):
eicmp_off_* (eistr, charoff, charlen, ...):
eicmp_off_*_byte (eistr, byteoff, bytelen, ...):
eicasecmp_* (eistr, ...):
eicasecmp_off_* (eistr, charoff, charlen, ...):
eicasecmp_off_*_byte (eistr, byteoff, bytelen, ...):
Compare the Eistring with the other data. Return value same as
from strcmp.
eicmp_ei (eistr, eistr2):
eicmp_off_ei (eistr, charoff, charlen, eistr2):
eicmp_off_ei_byte (eistr, byteoff, bytelen, eistr2):
eicasecmp_ei (eistr, eistr2):
eicasecmp_off_ei (eistr, charoff, charlen, eistr2):
eicasecmp_off_ei_byte (eistr, byteoff, bytelen, eistr2):
eicmp_c (eistr, c_string):
eicmp_off_c (eistr, charoff, charlen, c_string):
eicmp_off_c_byte (eistr, byteoff, bytelen, c_string):
eicasecmp_c (eistr, c_string):
eicasecmp_off_c (eistr, charoff, charlen, c_string):
eicasecmp_off_c_byte (eistr, byteoff, bytelen, c_string):
*/
/* ----- Case-changing the Eistring ----- */
/* Helper behind EI_CASECHANGE_, defined elsewhere.  As used below, it
   reads LEN bytes from OLDDATA, writes the case-converted text to NEWDATA
   and returns the new byte length; a zero return makes the caller leave
   the Eistring untouched.
   NOTE(review): exact contract (meaning of DOWNP, whether NEWDATA is
   zero-terminated) is not visible here -- confirm against the
   definition. */
int eistr_casefiddle_1 (Bufbyte *olddata, Bytecount len, Bufbyte *newdata,
                        int downp);
/* EI_CASECHANGE_ (ei, downp):
   Case-convert EI via eistr_casefiddle_1.  The conversion is done into a
   scratch buffer sized for the worst case (charlen * MAX_EMCHAR_LEN + 1
   bytes).

   A heap-backed Eistring (mallocp nonzero) keeps its data on the heap:
   the result is copied back through xrealloc.  Pointing `data' at the
   alloca()ed scratch buffer instead would leak the heap block and leave
   EI_ALLOC_ calling xrealloc on stack memory later. */
#define EI_CASECHANGE_(ei, downp)                                       \
do {                                                                    \
  int ei11new_allocmax = (ei).charlen * MAX_EMCHAR_LEN + 1;             \
  Bufbyte *ei11storage = alloca_array (Bufbyte, ei11new_allocmax);      \
  int ei11newlen = eistr_casefiddle_1 ((ei).data, (ei).bytelen,         \
                                       ei11storage, downp);             \
                                                                        \
  if (ei11newlen)                                                       \
    {                                                                   \
      if ((ei).mallocp)                                                 \
        {                                                               \
          (ei).data = xrealloc ((ei).data, ei11newlen + 1);             \
          memcpy ((ei).data, ei11storage, ei11newlen);                  \
          ((char *) (ei).data)[ei11newlen] = '\0';                      \
        }                                                               \
      else                                                              \
        {                                                               \
          /* Adopt the scratch buffer as the new storage. */            \
          (ei).max_size_allocated = ei11new_allocmax;                   \
          (ei).data = ei11storage;                                      \
        }                                                               \
      (ei).bytelen = ei11newlen;                                        \
      /* charlen is the same. */                                        \
    }                                                                   \
} while (0)
/* eilwr / eiupr: EI_CASECHANGE_ with DOWNP set to 1 and 0 respectively. */
#define eilwr(ei) EI_CASECHANGE_ (ei, 1)
#define eiupr(ei) EI_CASECHANGE_ (ei, 0)
"Stephen J. Turnbull" wrote:
Very long. But if you get the idea that I am dumb, and sufficiently
so as to discount your skills, I'd better be _very_ explicit.
>>>>> "Ben" == Ben Wing <ben(a)666.com> writes:
"Stephen J. Turnbull" wrote:
> Eistring (outline);
> eicpy_c (outline, "\033$B");
> eicpy_c (outline, "<+J,$G$7$g$&!#"); /* this should come from an
> lstream or something */
> eicpy_c (outline, "\033(B\n");
Ben> Eistrings are supposed to hold Mule-internal data inside
Ben> them. They're definitely not supposed to hold random string
Ben> stuff in them. So I'm not quite sure of what you're trying
Ben> to do. Could you explain it in words? Then I'll tell you
Ben> how you could do it best, using Eistrings and/or other
Ben> interfaces.
What I want to do (in this example), basically, is to implement
"encode-coding-region". (I stated that, explicitly but in the wrong
place, in the message you responded to.) I have some Japanese text, I
want to translate it to external format ISO-2022-JP.
This requires translating the Japanese from internal encoding to 7-bit
JIS X 0208 (presumably not the responsibility of an Eistring), then
adding appropriate escape sequences (which are octets, not in any
character set, if you take the ISO 2022 document seriously) before and
after the Japanese (and the newline of course is ASCII, not JIS).
Ben> If you want to copy or concatenate JIS format text, you need
Ben> to mark it as JIS. For example, you could imagine
Ben> Eistring (outline);
Ben> eicpy_ext (outline, "\033$B", Qjapanese_jis_0208);
This is not at all what I have in mind; that string is the ISO-2022
registered escape sequence for "designating JIS X 0208 to G0 and
invoking G0 to GL." I assumed you'd recognize one of those; if you
don't, why should I just "trust you" to design an interface that will
be used to implement handlers for them? It is not itself JIS X 0208,
and in fact that usage should cause XEmacs to barf() and abort().
Ben> eicat_ext (outline, "<+J,$G$7$g$&!#", Qjapanese_jis_0208);
I don't know what this code means. That's not Japanese, that's a
stream of octets for squirting out on some wire. Ie, Qbinary. I have
no guess at what you think I'm thinking, here.
Ben> eicat_ext (outline, "\033(B\n", Qjapanese_jis_0208);
Ben> But the real problem here is that the data conversion that
Ben> goes on inside the Eistrings is EXTREMELY SIMPLE and
Ben> stateless. In fact, it just uses TO_INTERNAL_FORMAT() and
Ben> TO_EXTERNAL_FORMAT() for all conversions, which assume a
Ben> complete encoding block.
I was assuming (and said so) that the conversion to desired format of
the second string was already done (eg by a "decoding lstream"); I
just wanted to add some literals (the kanji-in and kanji-out escape
sequences).
Ben> In your example above, you'd need to concatenate the Japanese
Ben> strings together separately, using some other mechanism, and
Ben> then feed the whole thing to eicat_ext(), so that it would
Ben> correctly encode it into Japanese.
No, there's no Japanese there ("Japanese" doesn't really exist in a
coherent way at the level Eistrings work at, IMO) ...
Ben> Or alternatively, you might be wanting to feed the bytes
Ben> directly into the string?
... as Lucy van Pelt would say, "THAT'S IT!!!!"
Ben> Then you can go ahead, but use Qbinary, e.g.
Ben> Eistring (outline);
Ben> eicpy_ext (outline, "\033$B", Qbinary);
Ben> eicat_ext (outline, "<+J,$G$7$g$&!#", Qbinary);
Ben> eicat_ext (outline, "\033(B\n", Qbinary);
Um, that's exactly what I wrote in the first place. Except that I
miswrote "cpy" for "cat". And used your "convenience function". Not
very convenient, after all, since my usage is illegal, it seems. YMMV.
(And if ESC is arguably ASCII, the related usage for inserting the C1
(== 8-bit control) characters SS3 and SS4 (used as prefixes for JIS X
0212 and 0201 characters in EUC-JP) would certainly be forbidden.)
Ben> but from the Eistring's perspective, you don't have Japanese
Ben> in here; you have gobbledygook which could be decoded into
Ben> Japanese. But that's not part of the design of the Eistring
Ben> -- it's supposed to entirely keep internally-formatted text
Ben> in it, and provide routines to manipulate this
Ben> internally-formatted text.
Right! Yes! Exactly! For example, implementing the Lisp functions
that a MIME-capable mailer would use. Or the lstream functions that
those functions would call. Or is that not what you have in mind?
Ben> It seems that you're trying to rather ad-hoc-ish extend the
Ben> design to do something else, and then you start complaining
Ben> when there are problems!
I'm trying to use the interface as my best guess suggests you intend
it to be used.
Ben> IMHO, my design is extremely well thought out.
I've noticed. But-but-but ...
Ben> Take a look at the example function I just posted to Hrvoje
Ben> showing how this all should be used.
... I did.
What is the specification of eiextdata()? I'd guess it just defaults
the coding_system argument for eito_external(). But I can't think of
a sensible way to do it, not even in the usage in your example. For
Unix, it has to involve getting the current filesystem coding-system,
but that need not be correct (it would be quite possible to have a
directory structure .../dir/, .../dir/JP/, .../dir/TW/ where the
filenames under JP are encoded in EUC-JP and those under TW are in
Big5). And I'm not even sure that Win32 guarantees such a filesystem
can't be constructed (say, using the Linux vfat filesystem---I know
some users who would do that, and I sympathize with their goals,
although not that implementation, they should be using Unicode ;),
maybe even under Windows 9x by dual-booting Japanese and Taiwanese.
And I assume that the documentation for mswindows_get_files specifies
that the programmer _must_ ensure that that "dirfile" contains only
characters that Win32 can handle (we already have stock XEmacsen that
can[1] handle all 2^31 UCS-4 characters, you know---AFAIK, Win32 does
not, at best it handles UTF-16), since presumably Eistrings don't know
about that?
The point not being that Eistrings should nursemaid Win32 APIs; of
course they shouldn't. Rather that it's still very easy to write code
that may be "mule-correct" and "mule-safe" as those terms are defined
in Winglish, but is quite capable of aiding or abetting undefined
behavior (I've given two plausible examples in your sample function).
Which is neither "correct" nor "safe", at least not in plain English.
So can you define what "correct" and "safe" mean so that we can decide
where the interface is going to help those goals by making intuitive
distinctions between what we do and don't need to worry about? (I
think that "checking what's in dirfile" is easily seen to be not
Eistring's job. Good design!) Or on the other hand, is it going to
make it easy to write unobviously buggy code? (I suspect that is true
of eiextdata(), per the example above. Doubleplusungood!)
Footnotes:
[1] Trusting Morioka's off-hand remark. If not all of them, at least
say 2^24 or 2^28 of them. Lots more than the 17*2^16 in UTF-16.
--
University of Tsukuba Tennodai 1-1-1 Tsukuba 305-8573 JAPAN
Institute of Policy and Planning Sciences Tel/fax: +81 (298) 53-5091
_________________ _________________ _________________ _________________
What are those straight lines for? "XEmacs rules."
--
Ben
In order to save my hands, I am cutting back on my mail. I also write
as succinctly as possible -- please don't be offended. If you send me
mail, you _will_ get a response, but please be patient, especially for
XEmacs-related mail. If you need an immediate response and it is not
apparent in your message, please say so. Thanks for your understanding.
See also
http://www.666.com/ben/typing.html.