APPROVE COMMIT
NOTE: This patch has been committed.
# HG changeset patch
# User Aidan Kehoe <kehoea(a)parhasard.net>
# Date 1509006891 -3600
# Thu Oct 26 09:34:51 2017 +0100
# Node ID 32aa4c87fa7f91e059717f5a88300469d7c3e9d7
# Parent 3fa6bfc3ea4bb6bf9f2b354b9c5927232e23d7b0
Don't encode private code-points by default in UTF-8, #'encode-coding-region
src/ChangeLog addition:
2017-10-26 Aidan Kehoe <kehoea(a)parhasard.net>
* mule-coding.c (iso2022_encode):
Pass the new ALLOW_PRIVATE argument to encode_unicode_to_dynarr().
* unicode.c: Declare Qallow_private.
* unicode.c (struct unicode_coding_system):
Add a new allow_private field here, respected on encoding and
decoding.
* unicode.c (CODING_SYSTEM_UNICODE_ALLOW_PRIVATE):
New macro.
* unicode.c (decode_utf_8):
Make ALLOW_PRIVATE a Boolint in this function.
* unicode.c (encode_unicode_to_dynarr):
Take a new ALLOW_PRIVATE argument.
* unicode.c (unicode_decode):
* unicode.c (unicode_encode):
Use the ALLOW_PRIVATE coding system property in these two functions.
* unicode.c (unicode_putprop):
Implement the ALLOW_PRIVATE property.
* unicode.c (syms_of_unicode):
Make Qallow_private available.
* unicode.h:
Declare encode_unicode_to_dynarr() as having an ALLOW_PRIVATE
argument.
diff -r 3fa6bfc3ea4b -r 32aa4c87fa7f src/ChangeLog
--- a/src/ChangeLog Thu Oct 19 06:47:13 2017 +0100
+++ b/src/ChangeLog Thu Oct 26 09:34:51 2017 +0100
@@ -1,3 +1,28 @@
+2017-10-26 Aidan Kehoe <kehoea(a)parhasard.net>
+
+ * mule-coding.c (iso2022_encode):
+ Pass the new ALLOW_PRIVATE argument to encode_unicode_to_dynarr().
+ * unicode.c: Declare Qallow_private.
+ * unicode.c (struct unicode_coding_system):
+ Add a new allow_private field here, respected on encoding and
+ decoding.
+ * unicode.c (CODING_SYSTEM_UNICODE_ALLOW_PRIVATE):
+ New macro.
+ * unicode.c (decode_utf_8):
+ Make ALLOW_PRIVATE a Boolint in this function.
+ * unicode.c (encode_unicode_to_dynarr):
+ Take a new ALLOW_PRIVATE argument.
+ * unicode.c (unicode_decode):
+ * unicode.c (unicode_encode):
+ Use the ALLOW_PRIVATE coding system property in these two functions.
+ * unicode.c (unicode_putprop):
+ Implement the ALLOW_PRIVATE property.
+ * unicode.c (syms_of_unicode):
+ Make Qallow_private available.
+ * unicode.h:
+ Declare encode_unicode_to_dynarr() as having an ALLOW_PRIVATE
+ argument.
+
2017-10-19 Aidan Kehoe <kehoea(a)parhasard.net>
* symbols.c (Fapropos_internal):
diff -r 3fa6bfc3ea4b -r 32aa4c87fa7f src/mule-coding.c
--- a/src/mule-coding.c Thu Oct 19 06:47:13 2017 +0100
+++ b/src/mule-coding.c Thu Oct 26 09:34:51 2017 +0100
@@ -3102,7 +3102,8 @@
int code = ichar_to_unicode (ich, CONVERR_FAIL);
if (encode_unicode_to_dynarr
(code, str, src, dst, UNICODE_UTF_8, 0,
- XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys)) < 0)
+ XCODING_SYSTEM_ISO2022_ESCAPE_QUOTED (codesys),
+ 1) < 0)
{
ENCODING_ERROR_RETURN_OR_CONTINUE (str, src);
}
diff -r 3fa6bfc3ea4b -r 32aa4c87fa7f src/unicode.c
--- a/src/unicode.c Thu Oct 19 06:47:13 2017 +0100
+++ b/src/unicode.c Thu Oct 26 09:34:51 2017 +0100
@@ -202,7 +202,7 @@
Lisp_Object Qunicode;
Lisp_Object Qutf_16, Qutf_8, Qucs_4, Qutf_7, Qutf_32;
-Lisp_Object Qneed_bom;
+Lisp_Object Qneed_bom, Qallow_private;
Lisp_Object Qutf_16_little_endian, Qutf_16_bom;
Lisp_Object Qutf_16_little_endian_bom;
@@ -2705,6 +2705,7 @@
enum unicode_encoding_type type;
unsigned int little_endian :1;
unsigned int need_bom :1;
+ unsigned int allow_private :1;
};
#define CODING_SYSTEM_UNICODE_TYPE(codesys) \
@@ -2719,6 +2720,10 @@
(CODING_SYSTEM_TYPE_DATA (codesys, unicode)->need_bom)
#define XCODING_SYSTEM_UNICODE_NEED_BOM(codesys) \
CODING_SYSTEM_UNICODE_NEED_BOM (XCODING_SYSTEM (codesys))
+#define CODING_SYSTEM_UNICODE_ALLOW_PRIVATE(codesys) \
+ (CODING_SYSTEM_TYPE_DATA (codesys, unicode)->allow_private)
+#define XCODING_SYSTEM_UNICODE_ALLOW_PRIVATE(codesys) \
+ CODING_SYSTEM_UNICODE_ALLOW_PRIVATE (XCODING_SYSTEM (codesys))
static const struct memory_description unicode_coding_system_description[] = {
{ XD_END }
@@ -2777,7 +2782,7 @@
void
decode_utf_8 (struct unicode_coding_stream *data, unsigned_char_dynarr *dst,
- UExtbyte c, int ignore_bom, int allow_private)
+ UExtbyte c, int ignore_bom, Boolint allow_private)
{
if (0 == data->counter)
{
@@ -2942,7 +2947,8 @@
unsigned_char_dynarr *dst,
enum unicode_encoding_type type,
int little_endian,
- int preserve_error_characters)
+ Boolint preserve_error_characters,
+ Boolint allow_private)
{
int err = 0;
if (code == -1)
@@ -3024,11 +3030,25 @@
register int bytes;
register unsigned char *dstp;
+ reconsider_length:
if (code <= 0x7ff) bytes = 2;
else if (code <= 0xffff) bytes = 3;
- else if (code <= 0x1fffff) bytes = 4;
- else if (code <= 0x3ffffff) bytes = 5;
- else bytes = 6;
+ else if (code <= UNICODE_OFFICIAL_MAX) bytes = 4;
+ else if (allow_private)
+ {
+ if (code <= 0x1fffff) bytes = 4;
+ else if (code <= 0x3ffffff) bytes = 5;
+ else bytes = 6;
+ }
+ else
+ {
+ /* Not valid Unicode. Pass the replacement char (U+FFFD). */
+ handle_encoding_error_before_output (str, src, dst, 1,
+ CODING_UNENCODABLE);
+ err = -1;
+ code = CANT_CONVERT_CHAR_WHEN_ENCODING_UNICODE;
+ goto reconsider_length;
+ }
Dynarr_add_many (dst, 0, bytes);
dstp = Dynarr_past_lastp (dst);
@@ -3091,6 +3111,7 @@
int little_endian =
XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (str->codesys);
int ignore_bom = XCODING_SYSTEM_UNICODE_NEED_BOM (str->codesys);
+ Boolint allow_private = XCODING_SYSTEM_UNICODE_ALLOW_PRIVATE (str->codesys);
Bytecount orign = n;
int counter = data->counter;
@@ -3103,7 +3124,7 @@
while (n--)
{
UExtbyte c = *src++;
- decode_utf_8 (data, dst, c, ignore_bom, 0);
+ decode_utf_8 (data, dst, c, ignore_bom, allow_private);
}
counter = data->counter;
ch = data->ch;
@@ -3339,6 +3360,8 @@
XCODING_SYSTEM_UNICODE_TYPE (str->codesys);
int little_endian =
XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (str->codesys);
+ const Boolint allow_private
+ = XCODING_SYSTEM_UNICODE_ALLOW_PRIVATE (str->codesys);
const Ibyte *srcend = src + n;
#ifdef ENABLE_COMPOSITE_CHARS
@@ -3353,8 +3376,9 @@
if (XCODING_SYSTEM_UNICODE_NEED_BOM (str->codesys) && !data->wrote_bom)
{
- assert (encode_unicode_to_dynarr (0xFEFF, str, src, dst, type,
- little_endian, 0) >= 0);
+ text_checking_assert (encode_unicode_to_dynarr (0xFEFF, str, src, dst,
+ type, little_endian,
+ 0, allow_private) >= 0);
data->wrote_bom = 1;
}
@@ -3366,8 +3390,10 @@
if (byte_ascii_p (c))
#endif /* MULE */
{
- assert (encode_unicode_to_dynarr (c, str, src, dst, type,
- little_endian, 0) >= 0);
+ text_checking_assert (encode_unicode_to_dynarr (c, str, src, dst,
+ type, little_endian,
+ 0, allow_private)
+ >= 0);
src++;
}
#ifdef MULE
@@ -3383,7 +3409,7 @@
#ifdef UNICODE_INTERNAL
if (encode_unicode_to_dynarr (ich, str, src, dst, type,
- little_endian, 0) < 0)
+ little_endian, 0, allow_private) < 0)
{
ENCODING_ERROR_RETURN_OR_CONTINUE (str, src);
}
@@ -3398,7 +3424,8 @@
/* #### Bother! We don't know how to
handle this yet. */
encode_unicode_to_dynarr (-1, str, src, dst,
- type, little_endian, 0);
+ type, little_endian, 0,
+ allow_private);
ENCODING_ERROR_RETURN_OR_CONTINUE (str, src);
}
else
@@ -3423,7 +3450,8 @@
charset_codepoint_to_unicode
(charset, c1, c2, CONVERR_FAIL);
if (encode_unicode_to_dynarr (code, str, src, dst, type,
- little_endian, 0) < 0)
+ little_endian, 0,
+ allow_private) < 0)
{
ENCODING_ERROR_RETURN_OR_CONTINUE (str, src);
}
@@ -3807,6 +3835,8 @@
XCODING_SYSTEM_UNICODE_LITTLE_ENDIAN (codesys) = !NILP (value);
else if (EQ (key, Qneed_bom))
XCODING_SYSTEM_UNICODE_NEED_BOM (codesys) = !NILP (value);
+ else if (EQ (key, Qallow_private))
+ XCODING_SYSTEM_UNICODE_ALLOW_PRIVATE (codesys) = !NILP (value);
else
return 0;
return 1;
@@ -3844,6 +3874,8 @@
write_ascstring (printcharfun, ", little-endian");
if (XCODING_SYSTEM_UNICODE_NEED_BOM (cs))
write_ascstring (printcharfun, ", need-bom");
+ if (XCODING_SYSTEM_UNICODE_ALLOW_PRIVATE (cs))
+ write_ascstring (printcharfun, ", allow-private");
write_ascstring (printcharfun, ")");
}
@@ -3918,6 +3950,7 @@
DEFSYMBOL (Qutf_7);
DEFSYMBOL (Qneed_bom);
+ DEFSYMBOL (Qallow_private);
DEFSYMBOL (Qutf_16);
DEFSYMBOL (Qutf_16_little_endian);
diff -r 3fa6bfc3ea4b -r 32aa4c87fa7f src/unicode.h
--- a/src/unicode.h Thu Oct 19 06:47:13 2017 +0100
+++ b/src/unicode.h Thu Oct 26 09:34:51 2017 +0100
@@ -94,10 +94,11 @@
unsigned_char_dynarr *dst,
enum unicode_encoding_type type,
int little_endian,
- int preserve_error_characters);
+ Boolint preserve_error_characters,
+ Boolint allow_private);
void decode_utf_8 (struct unicode_coding_stream *data,
unsigned_char_dynarr *dst, UExtbyte c, int ignore_bom,
- int allow_private);
+ Boolint allow_private);
void decode_unicode_to_dynarr (int ucs, unsigned_char_dynarr *dst);
--
‘As I sat looking up at the Guinness ad, I could never figure out /
How your man stayed up on the surfboard after forty pints of stout’
(C. Moore)