CVS update by aidan xemacs/lisp ...
xemacs-cvs at xemacs.org
xemacs-cvs at xemacs.org
Tue Oct 2 06:33:06 EDT 2007
User: aidan
Date: 07/10/02 12:33:06
Modified: xemacs/lisp ChangeLog unicode.el
Log:
Fix some bugs in the Unicode error sequence Lisp API.
Revision Changes Path
1.845 +17 -0 XEmacs/xemacs/lisp/ChangeLog
Index: ChangeLog
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/lisp/ChangeLog,v
retrieving revision 1.844
retrieving revision 1.845
diff -u -p -r1.844 -r1.845
--- ChangeLog 2007/10/02 07:54:07 1.844
+++ ChangeLog 2007/10/02 10:33:04 1.845
@@ -1,3 +1,20 @@
+2007-09-09 Aidan Kehoe <kehoea at parhasard.net>
+
+ * unicode.el (unicode-error-default-translation-table):
+ New. A char table mapping from Unicode error octets (as
+ represented in XEmacs) to ASCII, Control-1 and latin-iso8859-1.
+
+ * unicode.el (unicode-error-sequence-regexp-range):
+ Correct it, add a build-time check that it matches every possible
+ error octet.
+
+ * unicode.el (frob-unicode-errors-region): New.
+ Apply a supplied function to each set of error sequences in a
+ region.
+ * unicode.el (unicode-error-translate-region): New.
+ Translate the error octets in a region to the corresponding
+ ASCII, control-1 and latin-1 characters.
+
2007-10-02 Didier Verna <didier at xemacs.org>
* objects.el (make-face-boolean-specifier): Fix docstring, really
1.24 +98 -30 XEmacs/xemacs/lisp/unicode.el
Index: unicode.el
===================================================================
RCS file: /pack/xemacscvs/XEmacs/xemacs/lisp/unicode.el,v
retrieving revision 1.23
retrieving revision 1.24
diff -u -p -r1.23 -r1.24
--- unicode.el 2007/08/26 20:00:31 1.23
+++ unicode.el 2007/10/02 10:33:05 1.24
@@ -496,41 +496,109 @@ The second argument must be 'ucs, the th
;; Create all the Unicode error sequences, normally as jit-ucs-charset-0
;; characters starting at U+200000 (which isn't a valid Unicode code
- ;; point).
- (loop
- for i from #x00 to #xFF
- ;; #xd800 is the first leading surrogate; trailing surrogates must be in
- ;; the range #xdc00-#xdfff. These examples are not, so we intentionally
- ;; provoke an error sequence.
- do (decode-coding-string (format "\xd8\x00\x01%c" i) 'utf-16-be))
-
- ;; Make them available to user code.
- (defvar unicode-error-sequence-zero
- (aref (decode-coding-string "\xd8\x00\x01\x00" 'utf-16-be) 3)
- "The XEmacs character representing an invalid zero octet in Unicode.
-
-Subtract this character from each XEmacs character in an invalid sequence to
-get the octet on disk. E.g.
-
-\(- (aref (decode-coding-string ?\\x80 'utf-8) 0)
- unicode-error-characters-zero)
-=> ?\\x80
+ ;; point). Make them available to user code.
+ (defvar unicode-error-default-translation-table
+ (loop
+ with char-table = (make-char-table 'char)
+ for i from ?\x00 to ?\xFF
+ do
+ (put-char-table (aref
+ ;; #xd800 is the first leading surrogate;
+ ;; trailing surrogates must be in the range
+ ;; #xdc00-#xdfff. These examples are not, so we
+ ;; intentionally provoke an error sequence.
+ (decode-coding-string (format "\xd8\x00\x00%c" i)
+ 'utf-16-be)
+ 3)
+ i
+ char-table)
+ finally return char-table)
+ "Translation table mapping Unicode error sequences to Latin-1 chars.
-You can search for invalid sequences using
-`unicode-error-sequence-regexp-range', which see. ")
+To transform XEmacs Unicode error sequences to the Latin-1 characters that
+correspond to the octets on disk, you can use this variable. ")
(defvar unicode-error-sequence-regexp-range
- (format "%c-%c"
- (aref (decode-coding-string "\xd8\x00\x01\x00" 'utf-16-be) 3)
- (aref (decode-coding-string "\xd8\x00\x01\xFF" 'utf-16-be) 3))
+ (format "%c%c-%c"
+ (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 0)
+ (aref (decode-coding-string "\xd8\x00\x00\x00" 'utf-16-be) 3)
+ (aref (decode-coding-string "\xd8\x00\x00\xFF" 'utf-16-be) 3))
"Regular expression range to match Unicode error sequences in XEmacs.
-Invalid Unicode sequences on input are represented as XEmacs characters with
-values starting at `unicode-error-sequence-zero', one character for each
-invalid octet. Use this variable (with `re-search-forward' or
-`skip-chars-forward') to search for such characters; use
-`unicode-error-sequence-zero' from such characters to get a character
-corresponding to the octet on disk. "))
+Invalid Unicode sequences on input are represented as XEmacs
+characters with values stored as the keys in
+`unicode-error-default-translation-table', one character for each
+invalid octet. You can use this variable (with `re-search-forward' or
+`skip-chars-forward') to search for such characters; see also
+`unicode-error-translate-region'. ")
+
+ ;; Check that the lookup table is correct, and that all the actual error
+ ;; sequences are caught by the regexp.
+ (with-temp-buffer
+ (loop
+ for i from ?\x00 to ?\xFF
+ with to-check = (make-string 20 ?\x20)
+ with res = t
+ do
+ (delete-region (point-min) (point-max))
+ (insert to-check)
+ (goto-char 10)
+ (insert (decode-coding-string (format "\xd8\x00\x00%c" i)
+ 'utf-16-be))
+ (backward-char)
+ (assert (= i (get-char-table (char-after (point))
+ unicode-error-default-translation-table))
+ (format "Char ?\\x%x not the expected error sequence!"
+ i))
+
+ (goto-char (point-min))
+ (assert (re-search-forward (concat "["
+ unicode-error-sequence-regexp-range
+ "]"))
+ nil
+ (format "Could not find char ?\\x%x in buffer" i))))
+
+ (defun frob-unicode-errors-region (frob-function begin end &optional buffer)
+ "Call FROB-FUNCTION on the Unicode error sequences between BEGIN and END.
+
+Optional argument BUFFER specifies the buffer that should be examined for
+such sequences. "
+ (check-argument-type #'functionp frob-function)
+ (check-argument-range begin (point-min buffer) (point-max buffer))
+ (check-argument-range end (point-min buffer) (point-max buffer))
+ (save-excursion
+ (save-restriction
+ (if buffer (set-buffer buffer))
+ (narrow-to-region begin end)
+ (goto-char (point-min))
+ (while end
+ (setq begin
+ (progn
+ (skip-chars-forward
+ (concat "^" unicode-error-sequence-regexp-range))
+ (point))
+ end (and (not (= (point) (point-max)))
+ (progn
+ (skip-chars-forward
+ unicode-error-sequence-regexp-range)
+ (point))))
+ (if end
+ (funcall frob-function begin end))))))
+
+ (defun unicode-error-translate-region (begin end &optional buffer table)
+ "Translate the Unicode error sequences in BUFFER between BEGIN and END.
+
+The error sequences are transformed, by default, into the ASCII,
+control-1 and latin-iso8859-1 characters with the numeric values
+corresponding to the incorrect octets encountered. This is achieved
+by using `unicode-error-default-translation-table' (which see) for
+TABLE; you can change this by supplying another character table,
+mapping from the error sequences to the desired characters. "
+ (unless table (setq table unicode-error-default-translation-table))
+ (frob-unicode-errors-region
+ (lambda (start finish)
+ (translate-region start finish table))
+ begin end buffer)))
;; #### UTF-7 is not yet implemented, and it's tricky to do. There's
;; an implementation in appendix A.1 of the Unicode Standard, Version
More information about the XEmacs-CVS
mailing list