Recently Tomo Morioka has been doing some work on using UTF-8 as the
Mule string and buffer encoding, complementary to Olivier Galibert's
work on use of UCS characters (including Unicode).
The appended patch swaps the Lisp integer with the Lisp character
representations, so that characters have 31 bits of precision. This
allows Lisp characters to represent the entire UCS-4 code space. I
don't see that loss of one bit of precision in integers hurts; where
it matters (time values, etc) we need 32 bits anyway, and classical
Emacsen didn't have even 30 bits.
The patch has one known bug (== lack of ./configure code; I just
compile with "-DBASKIN_ROBBINS_CHAR" in CFLAGS, but I think this
change should just be made and be done with), and a couple of points
I'm unsure about.
First, in src/data.c and src/print.c I changed "switch (XTYPE(obj))
..." to "if (LRECORDP(obj)) ... else if (CHARP(obj) ..." so that these
control structures don't depend on the representation of Lisp
character and Lisp integer types. I think the efficiency hit should
be negligible, but I'd like a qualified opinion.
Second, I tracked down all uses of Lisp_Type_* and am fairly confident
that these are correct in the patch. I have this nagging feeling I'm
missing something that I should be verifying, though.
Third, I defined an enum member in terms of a previously defined
member of the same enum (Lisp_Type). Is this portable? It makes it
clearer what's going on, but I could use a comment instead.
The changes to src/alloc.c, src/data.c, and src/print.c are
unconditional. The changes to src/lisp.h, src/lisp-union.h, and
src/lisp-disunion.h are #ifdef'd with BASKIN_ROBBINS_CHAR. If that is
not #define'd, the current MINIMAL_TAGBITS behavior is preserved.
As I mentioned above, I think we should Just Do It; UCS-4 is coming.
Comments?
1999-06-22 Stephen J. Turnbull <turnbull(a)sk.tsukuba.ac.jp>
* data.c (Ftype_of), print.c (print_internal): Substitute use
of predicate macros (eg INTP) for enum Lisp_Type values.
* alloc.c, lisp.h: Move definition of Lisp_Type_Int to lisp.h.
* lisp.h, lisp-union.h, lisp-disunion.h: Allow 31-bit
characters, 30-bit integers when #ifdef BASKIN_ROBBINS_CHAR
(otherwise reversed == standard MINIMAL_TAGBITS behavior).
Lisp_Type_Wider_Bit substituted for Lisp_Type_Int_Bit (applies
to characters when they are 31 bits).
Lisp_Type_Char_{Even,Odd}: new enum Lisp_Type members.
Remove uses of [CON]CHECK_NONRECORD.
* lrecord.h: #if 0 definition of [CON]CHECK_NONRECORD.
Index: src/alloc.c
===================================================================
RCS file: /usr/CVSroot/XEmacs/xemacs-20/src/alloc.c,v
retrieving revision 1.42.2.11
diff -u -r1.42.2.11 alloc.c
--- alloc.c 1999/05/24 00:32:44 1.42.2.11
+++ alloc.c 1999/06/22 06:56:05
@@ -512,6 +512,7 @@
about expressions in src/gdbinit. See src/gdbinit or src/dbxrc to
see how this is used. */
+/* #### Need to check how this is applied to chars vs. ints for UCS-4. */
EMACS_UINT dbg_valmask = ((1UL << VALBITS) - 1) << GCBITS;
EMACS_UINT dbg_typemask = (1UL << GCTYPEBITS) - 1;
@@ -521,7 +522,6 @@
unsigned char dbg_USE_UNION_TYPE = 0;
#endif
-unsigned char Lisp_Type_Int = 100;
unsigned char Lisp_Type_Cons = 101;
unsigned char Lisp_Type_String = 102;
unsigned char Lisp_Type_Vector = 103;
Index: src/data.c
===================================================================
RCS file: /usr/CVSroot/XEmacs/xemacs-20/src/data.c,v
retrieving revision 1.24.2.3
diff -u -r1.24.2.3 data.c
--- data.c 1999/04/22 08:39:08 1.24.2.3
+++ data.c 1999/06/22 06:56:05
@@ -543,15 +543,14 @@
*/
(object))
{
- switch (XTYPE (object))
- {
- case Lisp_Type_Record:
+ if (LRECORDP (object))
return intern (XRECORD_LHEADER_IMPLEMENTATION (object)->name);
- case Lisp_Type_Char: return Qcharacter;
+ else if (CHARP (object))
+ return Qcharacter;
- default: return Qinteger;
- }
+ else
+ return Qinteger;
}
Index: src/lisp-disunion.h
===================================================================
RCS file: /usr/CVSroot/XEmacs/xemacs-20/src/lisp-disunion.h,v
retrieving revision 1.9.2.2
diff -u -r1.9.2.2 lisp-disunion.h
--- lisp-disunion.h 1999/04/22 07:27:23 1.9.2.2
+++ lisp-disunion.h 1999/06/22 06:56:06
@@ -30,7 +30,9 @@
--------------------------------
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVTT
- Integers are treated specially, and look like this:
+ Either integers (in the original MINIMAL_TAGBITS format) or
+ characters (when BASKIN_ROBBINS_CHAR) are treated specially, and look like
+ this:
3 2 1 0
bit 10987654321098765432109876543210
@@ -42,9 +44,13 @@
The object is obtained by masking off the type and mark bits.
Bit 1 is used as a value bit by splitting the Lisp integer type
- into two subtypes, Lisp_Type_Int_Even and Lisp_Type_Int_Odd. By
- this trickery we get 31 bits for integers instead of 30.
+ (UCS-4 character type) into two subtypes, Lisp_Type_Int_Even
+ (Lisp_Type_Char_Even) and Lisp_Type_Int_Odd (Lisp_Type_Char_Odd).
+ By this trickery we get 31 bits for integers (characters) instead
+ of 30.
+ The wider type is tested for by anding with Lisp_Type_Wider_Bit.
+
For non-integral types, the value bits of a Lisp_Object contain
a pointer to a structure containing the object. The pointer is
obtained by masking off the type and mark bits.
@@ -56,6 +62,7 @@
hardware. Because of this, Lisp_Object pointers don't have
to be masked and are full-sized.
+
There are no mark bits.
Integers and characters don't need to be marked. All other types
are lrecord-based, which means they get marked by incrementing
@@ -69,24 +76,39 @@
XREALINT The value bits of a Lisp_Object storing an integer, signed
XUINT The value bits of a Lisp_Object storing an integer, unsigned
INTP Non-zero if this Lisp_Object an integer?
+ CHARP Non-zero if this Lisp_Object an character?
Qzero Lisp Integer 0
EQ Non-zero if two Lisp_Objects are identical
- GC_EQ Version of EQ used during garbage collection */
+ GC_EQ Version of EQ used during garbage collection */
typedef EMACS_INT Lisp_Object;
-#define Lisp_Type_Int_Bit (Lisp_Type_Int_Even & Lisp_Type_Int_Odd)
#define make_obj(vartype, x) ((Lisp_Object) (x))
-#define make_int(x) ((Lisp_Object) (((x) << INT_GCBITS) | Lisp_Type_Int_Bit))
-#define make_char(x) ((Lisp_Object) (((x) << GCBITS) | Lisp_Type_Char))
+#ifndef BASKIN_ROBBINS_CHAR
+#define Lisp_Type_Wider_Bit (Lisp_Type_Int_Even & Lisp_Type_Int_Odd)
+#define make_int(x) ((Lisp_Object) (((x) << INT_GCBITS) | Lisp_Type_Wider_Bit))
+#define make_char(x) ((Lisp_Object) (((x) << CHAR_GCBITS) | Lisp_Type_Char))
+#else
+#define Lisp_Type_Wider_Bit (Lisp_Type_Char_Even & Lisp_Type_Char_Odd)
+#define make_int(x) ((Lisp_Object) (((x) << INT_GCBITS) | Lisp_Type_Int))
+#define make_char(x) ((Lisp_Object) (((x) << CHAR_GCBITS) | Lisp_Type_Wider_Bit))
+#endif
#define VALMASK (((1UL << VALBITS) - 1UL) << GCTYPEBITS)
#define XTYPE(x) ((enum Lisp_Type) (((EMACS_UINT)(x)) & ~VALMASK))
#define XPNTRVAL(x) (x) /* This depends on Lisp_Type_Record == 0 */
-#define XCHARVAL(x) ((x) >> GCBITS)
+#define XCHARVAL(x) ((x) >> CHAR_GCBITS)
#define GC_EQ(x,y) EQ (x,y)
#define XREALINT(x) ((x) >> INT_GCBITS)
#define XUINT(x) ((EMACS_UINT)(x) >> INT_GCBITS)
-#define INTP(x) ((EMACS_UINT)(x) & Lisp_Type_Int_Bit)
+#ifdef BASKIN_ROBBINS_CHAR
+#define INTP(x) ((EMACS_UINT)(x) & Lisp_Type_Wider_Bit)
+#define CHARP(x) (XTYPE (x) == Lisp_Type_Char)
+#define GC_CHARP(x) (XGCTYPE (x) == Lisp_Type_Char)
+#else
+#define INTP(x) ((EMACS_UINT)(x) == Lisp_Type_Int)
+#define CHARP(x) (XTYPE (x) & Lisp_Type_Wider_Bit)
+#define GC_CHARP(x) (XGCTYPE (x) & Lisp_Type_Wider_Bit)
+#endif
#define Qzero make_int (0)
#define Qnull_pointer ((Lisp_Object) 0)
Index: src/lisp-union.h
===================================================================
RCS file: /usr/CVSroot/XEmacs/xemacs-20/src/lisp-union.h,v
retrieving revision 1.8.2.2
diff -u -r1.8.2.2 lisp-union.h
--- lisp-union.h 1999/04/22 07:27:23 1.8.2.2
+++ lisp-union.h 1999/06/22 06:56:06
@@ -48,7 +48,13 @@
EMACS_UINT val : INT_VALBITS;
unsigned int bits : INT_GCBITS;
} u;
-#else /* non-valbits are at higher addresses */
+
+ struct
+ {
+ EMACS_UINT val : CHAR_VALBITS;
+ unsigned int bits : CHAR_GCBITS;
+ } c;
+#else /* non-valbits are at higher addresses */
struct
{
enum_field (Lisp_Type) type : GCTYPEBITS;
@@ -67,6 +73,11 @@
EMACS_UINT val : INT_VALBITS;
} u;
+ struct
+ {
+ unsigned int bits : CHAR_GCBITS;
+ EMACS_UINT val : CHAR_VALBITS;
+ } c;
#endif /* non-valbits are at higher addresses */
EMACS_UINT ui;
@@ -80,8 +91,9 @@
}
Lisp_Object;
-#define XCHARVAL(x) ((x).gu.val)
+#define XCHARVAL(x) ((x).c.val)
+#ifndef BASKIN_ROBBINS_CHAR
# define XSETINT(var, value) do { \
EMACS_INT xset_value = (value); \
Lisp_Object *xset_var = &(var); \
@@ -94,6 +106,21 @@
xset_var->gu.type = Lisp_Type_Char; \
xset_var->gu.val = xset_value; \
} while (0)
+#else
+# define XSETINT(var, value) do { \
+ EMACS_INT xset_value = (value); \
+ Lisp_Object *xset_var = &(var); \
+ xset_var->s.bits = Lisp_Type_Int; \
+ xset_var->s.val = xset_value; \
+} while (0)
+# define XSETCHAR(var, value) do { \
+ Emchar xset_value = (value); \
+ Lisp_Object *xset_var = &(var); \
+ xset_var->c.bits = 1; \
+ xset_var->c.val = xset_value; \
+} while (0)
+#endif
+
# define XSETOBJ(var, vartype, value) do { \
EMACS_UINT xset_value = (EMACS_UINT) (value); \
(var).ui = xset_value; \
@@ -126,7 +153,16 @@
#define XGCTYPE(x) XTYPE (x)
#define EQ(x,y) ((x).v == (y).v)
+#ifndef BASKIN_ROBBINS_CHAR
#define INTP(x) ((x).s.bits)
+#define CHARP(x) (XTYPE (x) == Lisp_Type_Char)
+#define GC_CHARP(x) (XGCTYPE (x) == Lisp_Type_Char)
+#else
+#define INTP(x) (XTYPE (x) == Lisp_Type_Int)
+#define CHARP(x) ((x).c.bits)
+#define GC_CHARP(x) ((x).c.bits)
+#endif
+
#define GC_EQ(x,y) EQ (x, y)
/* Convert between a (void *) and a Lisp_Object, as when the
Index: src/lisp.h
===================================================================
RCS file: /usr/CVSroot/XEmacs/xemacs-20/src/lisp.h,v
retrieving revision 1.38.2.13
diff -u -r1.38.2.13 lisp.h
--- lisp.h 1999/06/13 21:45:15 1.38.2.13
+++ lisp.h 1999/06/22 06:56:07
@@ -510,16 +510,34 @@
/* Define the fundamental Lisp data structures */
-/* This is the set of Lisp data types */
+/* This is the set of Lisp data types
+ Lisp_Type_Record is invariant due to the undesirability of masking
+ pointers before use. The others, and their usage, change with
+ BASKIN_ROBBINS_CHAR and should not be used outside of the lisp*.h files.
+ Use the constructor and predicate macros instead. */
+
+/* #### Is the use of enum constants within the enum definition portable? */
enum Lisp_Type
{
Lisp_Type_Record,
+#ifndef BASKIN_ROBBINS_CHAR
+ Lisp_Type_Int_Even,
+ Lisp_Type_Char_Even,
+ Lisp_Type_Int_Odd,
+ Lisp_Type_Char_Odd = Lisp_Type_Char_Even
+#else
+ Lisp_Type_Char_Even,
Lisp_Type_Int_Even,
- Lisp_Type_Char,
- Lisp_Type_Int_Odd
+ Lisp_Type_Char_Odd,
+ Lisp_Type_Int_Odd = Lisp_Type_Int_Even
+#endif
};
+/* For debugger. Avoid introducing BASKIN_ROBBINS_CHAR to alloc.c. */
+#define Lisp_Type_Int (Lisp_Type_Int_Even & Lisp_Type_Int_Odd)
+#define Lisp_Type_Char (Lisp_Type_Char_Even & Lisp_Type_Char_Odd)
+
#define POINTER_TYPE_P(type) ((type) == Lisp_Type_Record)
/* EMACS_INT is the underlying integral type into which a Lisp_Object must fit.
@@ -556,9 +574,16 @@
#define GCMARKBITS 0
#define GCTYPEBITS 2
#define GCBITS 2
+#ifndef BASKIN_ROBBINS_CHAR
#define INT_GCBITS 1
+#define CHAR_GCBITS 2
+#else
+#define INT_GCBITS 2
+#define CHAR_GCBITS 1
+#endif
#define INT_VALBITS (BITS_PER_EMACS_INT - INT_GCBITS)
+#define CHAR_VALBITS (BITS_PER_EMACS_INT - CHAR_GCBITS)
#define VALBITS (BITS_PER_EMACS_INT - GCBITS)
#define EMACS_INT_MAX ((1UL << INT_VALBITS) -1UL)
@@ -1212,9 +1237,6 @@
/*********** char ***********/
-#define CHARP(x) (XTYPE (x) == Lisp_Type_Char)
-#define GC_CHARP(x) (XGCTYPE (x) == Lisp_Type_Char)
-
#ifdef ERROR_CHECK_TYPECHECK
INLINE Emchar XCHAR (Lisp_Object obj);
@@ -1231,9 +1253,17 @@
#endif
-#define CHECK_CHAR(x) CHECK_NONRECORD (x, Lisp_Type_Char, Qcharacterp)
-#define CONCHECK_CHAR(x) CONCHECK_NONRECORD (x, Lisp_Type_Char, Qcharacterp)
+/* CONCHECK_CHAR (etc) used to be defined via the CONCHECK_NONRECORD
+ (resp.) macros, but that's just obfuscatory now */
+#define CHECK_CHAR(x) do { \
+ if (!CHARP (x)) \
+ dead_wrong_type_argument (Qintegerp, x); \
+} while (0)
+#define CONCHECK_CHAR(x) do { \
+ if (!CHARP (x)) \
+ x = wrong_type_argument (Qintegerp, x); \
+} while (0)
/*********** float ***********/
Index: src/lrecord.h
===================================================================
RCS file: /usr/CVSroot/XEmacs/xemacs-20/src/lrecord.h,v
retrieving revision 1.8.2.3
diff -u -r1.8.2.3 lrecord.h
--- lrecord.h 1999/04/23 20:57:15 1.8.2.3
+++ lrecord.h 1999/06/22 06:56:07
@@ -342,18 +342,24 @@
if (!RECORD_TYPEP (x, &lrecord_##c_name)) \
x = wrong_type_argument (Q##c_name##p, x); \
} while (0)
-#define CONCHECK_NONRECORD(x, lisp_enum, predicate) do {\
- if (XTYPE (x) != lisp_enum) \
- x = wrong_type_argument (predicate, x); \
- } while (0)
#define CHECK_RECORD(x, c_name) do { \
if (!RECORD_TYPEP (x, &lrecord_##c_name)) \
dead_wrong_type_argument (Q##c_name##p, x); \
} while (0)
+
+/* CONCHECK_NONRECORD etc only works for one of Lisp_Type_Char and
+ Lisp_Type_Int, and which one depends on BASKIN_ROBBINS_CHAR. Let's
+ not use them. */
+#if 0
+#define CONCHECK_NONRECORD(x, lisp_enum, predicate) do {\
+ if (XTYPE (x) != lisp_enum) \
+ x = wrong_type_argument (predicate, x); \
+ } while (0)
#define CHECK_NONRECORD(x, lisp_enum, predicate) do { \
if (XTYPE (x) != lisp_enum) \
dead_wrong_type_argument (predicate, x); \
} while (0)
+#endif
void *alloc_lcrecord (size_t size, CONST struct lrecord_implementation *);
Index: src/print.c
===================================================================
RCS file: /usr/CVSroot/XEmacs/xemacs-20/src/print.c,v
retrieving revision 1.27.2.5
diff -u -r1.27.2.5 print.c
--- print.c 1999/05/21 05:32:05 1.27.2.5
+++ print.c 1999/06/22 06:56:07
@@ -1106,18 +1106,15 @@
if (print_depth > PRINT_CIRCLE)
error ("Apparently circular structure being printed");
- switch (XTYPE (obj))
- {
- case Lisp_Type_Int_Even:
- case Lisp_Type_Int_Odd:
+ { /* harmless compound statement avoids unnecessary reformatting. */
+ if (INTP (obj))
{
char buf[24];
long_to_string (buf, XINT (obj));
write_c_string (buf, printcharfun);
- break;
}
- case Lisp_Type_Char:
+ else if (CHARP (obj))
{
/* God intended that this be #\..., you know. */
char buf[16];
@@ -1154,38 +1151,36 @@
else
p += set_charptr_emchar ((Bufbyte *)p, ch);
output_string (printcharfun, (Bufbyte *)buf, Qnil, 0, p - buf);
- break;
}
- case Lisp_Type_Record:
+ else if (LRECORDP (obj))
{
struct lrecord_header *lheader = XRECORD_LHEADER (obj);
struct gcpro gcpro1, gcpro2;
- if (CONSP (obj) || VECTORP(obj))
- {
+ if ((CONSP (obj) || VECTORP(obj))
/* If deeper than spec'd depth, print placeholder. */
- if (INTP (Vprint_level)
- && print_depth > XINT (Vprint_level))
- {
- GCPRO2 (obj, printcharfun);
- write_c_string ("...", printcharfun);
- UNGCPRO;
- break;
- }
+ && INTP (Vprint_level)
+ && print_depth > XINT (Vprint_level))
+ {
+ GCPRO2 (obj, printcharfun);
+ write_c_string ("...", printcharfun);
+ UNGCPRO;
}
- GCPRO2 (obj, printcharfun);
- if (LHEADER_IMPLEMENTATION (lheader)->printer)
- ((LHEADER_IMPLEMENTATION (lheader)->printer)
- (obj, printcharfun, escapeflag));
else
- default_object_printer (obj, printcharfun, escapeflag);
- UNGCPRO;
- break;
+ {
+ GCPRO2 (obj, printcharfun);
+ if (LHEADER_IMPLEMENTATION (lheader)->printer)
+ ((LHEADER_IMPLEMENTATION (lheader)->printer)
+ (obj, printcharfun, escapeflag));
+ else
+ default_object_printer (obj, printcharfun, escapeflag);
+ UNGCPRO;
+ }
}
- default:
+ else
{
#ifdef ERROR_CHECK_TYPECHECK
abort ();
@@ -1203,7 +1198,6 @@
(" Save your buffers immediately and please report this bug>",
printcharfun);
#endif /* not ERROR_CHECK_TYPECHECK */
- break;
}
}
--
University of Tsukuba Tennodai 1-1-1 Tsukuba 305-8573 JAPAN
Institute of Policy and Planning Sciences Tel/fax: +81 (298) 53-5091
__________________________________________________________________________
__________________________________________________________________________
What are those two straight lines for? "Free software rules."