commit: Add check-coding-systems-region, test it and others, fix some bugs.

Tuesday, 30 December 2008


        changeset:   4570:e6a7054a9c3063f1257ae9f66c5dfdef7f02bf4c
user:        Aidan Kehoe <kehoea(a)parhasard.net&gt;
date:        Sun Dec 28 22:51:14 2008 +0000
files:       lisp/ChangeLog lisp/coding.el lisp/unicode.el tests/ChangeLog
tests/automated/query-coding-tests.el
description:
Add check-coding-systems-region, test it and others, fix some bugs.

tests/ChangeLog addition:

2008-12-28  Aidan Kehoe  <kehoea(a)parhasard.net&gt;

	* automated/query-coding-tests.el:
	Add tests for #'unencodable-char-position,
	#'check-coding-systems-region, #'encode-coding-char. Remove some
	debugging statements.

lisp/ChangeLog addition:

2008-12-28  Aidan Kehoe  <kehoea(a)parhasard.net&gt;

	* coding.el (query-coding-region):
	(query-coding-string):
	Make these defsubsts, they're short enough and they're called
	explicitly rarely enough that it make some sense. The alternative
	would be compiler macros that avoid the binding of the arguments.
	(unencodable-char-position):
	Document where the docstring and API are from.
	Correct a special case for zero--check-argument-type returns nil
	when it succeeds, we can't usefully chain its result in an and
	here.
	(check-coding-systems-region): New. API taken from GNU; docstring
	and implementation are independent.
	(encode-coding-char):
	Add an optional third argument, as used by recent GNU. Document
	the origen of the docstring.
	(default-query-coding-region): Add a short docstring to the
	non-Mule implementation of this function.
	* unicode.el:
	Don't set the query-coding-function property for unicode coding
	systems if we're on non-mule. Unintern
	unicode-query-coding-region, unicode-query-coding-skip-chars-arg
	in the same context.


diff -r 80e0588fb42f5f7d5530de33cdc64d6cbafdb026 -r
e6a7054a9c3063f1257ae9f66c5dfdef7f02bf4c lisp/ChangeLog
--- a/lisp/ChangeLog	Sun Dec 28 14:55:02 2008 +0000
+++ b/lisp/ChangeLog	Sun Dec 28 22:51:14 2008 +0000
＠＠ -1,3 +1,28 ＠＠ 2008-12-28  Aidan Kehoe  <kehoea＠parhasa
+2008-12-28  Aidan Kehoe  <kehoea(a)parhasard.net&gt;
+
+	* coding.el (query-coding-region): 
+	(query-coding-string): 
+	Make these defsubsts, they're short enough and they're called
+	explicitly rarely enough that it make some sense. The alternative
+	would be compiler macros that avoid the binding of the arguments. 
+	(unencodable-char-position): 
+	Document where the docstring and API are from. 
+	Correct a special case for zero--check-argument-type returns nil
+	when it succeeds, we can't usefully chain its result in an and
+	here. 
+	(check-coding-systems-region): New. API taken from GNU; docstring
+	and implementation are independent. 
+	(encode-coding-char): 
+	Add an optional third argument, as used by recent GNU. Document
+	the origen of the docstring. 
+	(default-query-coding-region): Add a short docstring to the
+	non-Mule implementation of this function. 
+	* unicode.el: 
+	Don't set the query-coding-function property for unicode coding
+	systems if we're on non-mule. Unintern
+	unicode-query-coding-region, unicode-query-coding-skip-chars-arg
+	in the same context. 
+
 2008-12-28  Aidan Kehoe  <kehoea(a)parhasard.net&gt;
 
 	* coding.el (default-query-coding-region): 
diff -r 80e0588fb42f5f7d5530de33cdc64d6cbafdb026 -r
e6a7054a9c3063f1257ae9f66c5dfdef7f02bf4c lisp/coding.el
--- a/lisp/coding.el	Sun Dec 28 14:55:02 2008 +0000
+++ b/lisp/coding.el	Sun Dec 28 22:51:14 2008 +0000
＠＠ -398,7 +398,7 ＠＠ addition, characters that can be safely 
 	    (values nil ranges)
 	  (values t nil))))))
 
-(defun query-coding-region (start end coding-system &optional buffer
+(defsubst query-coding-region (start end coding-system &optional buffer
                                errorp highlight)
   "Work out whether CODING-SYSTEM can losslessly encode a region.
 
＠＠ -423,7 +423,7 ＠＠ describing the positions of the unencoda
                #'default-query-coding-region)
            start end coding-system buffer errorp highlight))
 
-(defun query-coding-string (string coding-system &optional errorp highlight)
+(defsubst query-coding-string (string coding-system &optional errorp highlight)
   "Work out whether CODING-SYSTEM can losslessly encode STRING.
 CODING-SYSTEM is the coding system to check.
 
＠＠ -446,6 +446,7 ＠＠ describing the positions of the unencoda
                          ;; ### Will highlight work here?
                          errorp highlight)))
 
+;; Function docstring and API are taken from GNU coding.c version 1.353, GPLv2. 
 (defun unencodable-char-position  (start end coding-system
                                    &optional count string) 
   "Return position of first un-encodable character in a region.
＠＠ -486,9 +487,9 ＠＠ in the string."
     (check-argument-type #'integer-or-marker-p start)
     (check-argument-type #'integer-or-marker-p end)
     (check-coding-system coding-system)
-    (and count (check-argument-type #'natnump count)
-	 ;; Special-case zero, sigh. 
-	 (if (zerop count) (setq count 1)))
+    (when count (check-argument-type #'natnump count)
+	  ;; Special-case zero, sigh. 
+	  (if (zerop count) (setq count 1)))
     (and string (check-argument-type #'stringp string))
     (if string
 	(with-temp-buffer
＠＠ -496,9 +497,64 ＠＠ in the string."
 	  (funcall thunk start end coding-system count))
       (funcall thunk start end coding-system count))))
 
-(defun encode-coding-char (char coding-system)
+;; XEmacs; this is a GPLv3 function in coding.c in GNU. This is why we have
+;; both a very divergent docstring and a very divergent implementation.
+(defun check-coding-systems-region (begin end coding-system-list)
+  "Can coding systems in CODING-SYSTEM-LIST encode text in a region?
+
+CODING-SYSTEM-LIST must be a list of coding systems.  BEGIN and END are
+normally buffer positions delimiting the region.  If some coding system in
+CODING-SYSTEM-LIST cannot encode the entire region, the return value of this
+function is an alist mapping coding system names to lists of individual
+buffer positions (not ranges) that the individual coding systems cannot
+encode.
+
+If all coding systems in CODING-SYSTEM-LIST can encode the region,
+this function returns t.  This conflicts with the documented, but not
+with the observed, GNU behavior.
+
+If BEGIN is a string, `check-coding-systems-region' ignores END, and checks
+whether the coding systems can encode BEGIN.  The alist that is returned
+uses zero-based string indices, not one-based buffer positions.
+
+This function is for GNU compatibility.  See also `query-coding-region'."
+  (let ((thunk
+	 #'(lambda (begin end coding-system-list stringp)
+	     (loop
+               for coding-system in coding-system-list
+               with result = nil
+               with intermediate = nil
+               with range-lambda = (if stringp
+                                       #'(lambda (begin end value)
+                                           (while (< begin end)
+                                             (push (1- begin) intermediate)
+                                             (incf begin)))
+                                     #'(lambda (begin end value)
+                                         (while (< begin end)
+                                           (push begin intermediate)
+                                           (incf begin))))
+               do (setq coding-system (check-coding-system coding-system))
+               (multiple-value-bind (encoded ranges)
+		   (query-coding-region begin end coding-system)
+                 (unless encoded
+                   (setq intermediate (list (coding-system-name coding-system)))
+                   (map-range-table range-lambda ranges)
+                   (push (nreverse intermediate) result)))
+               finally return (or result t)))))
+  (if (stringp begin)
+      (with-temp-buffer
+	(insert begin)
+	(funcall thunk (point-min) (point-max) coding-system-list t))
+    (check-argument-type #'integer-or-marker-p begin)
+    (check-argument-type #'integer-or-marker-p end)
+    (funcall thunk begin end coding-system-list nil))))
+
+;; XEmacs; docstring taken from GNU, international/mule-cmds.el, revision
+;; 1.311, GPLv2. 
+(defun encode-coding-char (char coding-system &optional charset)
   "Encode CHAR by CODING-SYSTEM and return the resulting string.
-If CODING-SYSTEM can't safely encode CHAR, return nil."
+If CODING-SYSTEM can't safely encode CHAR, return nil.
+The optional third argument CHARSET is, for the moment, ignored."
   (check-argument-type #'characterp char)
   (multiple-value-bind (succeededp)
       (query-coding-string char coding-system)
＠＠ -509,7 +565,9 ＠＠ If CODING-SYSTEM can't safely encode CHA
   ;; If we're under non-Mule, every XEmacs character can be encoded
   ;; with every XEmacs coding system.
   (fset #'default-query-coding-region
-	#'(lambda (&rest ignored) (values t nil)))
+	#'(lambda (&rest ignored)
+	    "Stub `query-coding-region' implementation. Always succeeds."
+	    (values t nil)))
   (unintern 'default-query-coding-region-safe-charset-skip-chars-map))
 
 ;;; coding.el ends here
diff -r 80e0588fb42f5f7d5530de33cdc64d6cbafdb026 -r
e6a7054a9c3063f1257ae9f66c5dfdef7f02bf4c lisp/unicode.el
--- a/lisp/unicode.el	Sun Dec 28 14:55:02 2008 +0000
+++ b/lisp/unicode.el	Sun Dec 28 22:51:14 2008 +0000
＠＠ -678,6 +678,7 ＠＠ mapping from the error sequences to the 
 
 (loop
   for coding-system in (coding-system-list)
+  initially (unless (featurep 'mule) (return))
   do (when (eq 'unicode (coding-system-type coding-system))
        (coding-system-put coding-system 'query-coding-function
 			  #'unicode-query-coding-region)))
＠＠ -691,7 +692,8 ＠＠ mapping from the error sequences to the 
   (mapcar #'unintern
           '(ccl-encode-to-ucs-2 unicode-error-default-translation-table
             unicode-invalid-regexp-range frob-unicode-errors-region
-            unicode-error-translate-region)))
+            unicode-error-translate-region unicode-query-coding-region
+            unicode-query-coding-skip-chars-arg)))
 
 ;; #### UTF-7 is not yet implemented, and it's tricky to do.  There's
 ;; an implementation in appendix A.1 of the Unicode Standard, Version
diff -r 80e0588fb42f5f7d5530de33cdc64d6cbafdb026 -r
e6a7054a9c3063f1257ae9f66c5dfdef7f02bf4c tests/ChangeLog
--- a/tests/ChangeLog	Sun Dec 28 14:55:02 2008 +0000
+++ b/tests/ChangeLog	Sun Dec 28 22:51:14 2008 +0000
＠＠ -1,3 +1,10 ＠＠ 2008-12-28  Aidan Kehoe  <kehoea＠parhasa
+2008-12-28  Aidan Kehoe  <kehoea(a)parhasard.net&gt;
+
+	* automated/query-coding-tests.el: 
+	Add tests for #'unencodable-char-position,
+	#'check-coding-systems-region, #'encode-coding-char. Remove some
+	debugging statements.
+
 2008-12-28  Aidan Kehoe  <kehoea(a)parhasard.net&gt;
 
 	* automated/query-coding-tests.el: 
diff -r 80e0588fb42f5f7d5530de33cdc64d6cbafdb026 -r
e6a7054a9c3063f1257ae9f66c5dfdef7f02bf4c tests/automated/query-coding-tests.el
--- a/tests/automated/query-coding-tests.el	Sun Dec 28 14:55:02 2008 +0000
+++ b/tests/automated/query-coding-tests.el	Sun Dec 28 22:51:14 2008 +0000
＠＠ -91,58 +91,31 ＠＠
                                                   coding-system))
         (multiple-value-bind (query-coding-succeeded query-coding-table)
             (query-coding-region (point-min) (point-max) coding-system)
-          (q-c-debug "checking type, coding-system, q-c-s, q-c-t %S"
-                     (list (coding-system-type coding-system)
-                           coding-system query-coding-succeeded
-                           query-coding-table))
-          (unless (and (eq t query-coding-succeeded)
-                       (null query-coding-table))
-            (q-c-debug "(eq t query-coding-succeeded) %S, (\
-null query-coding-table) %S" (eq t query-coding-succeeded)
-                             (null query-coding-table)))
-          (Assert (eq t query-coding-succeeded))
-          (Assert (null query-coding-table)))
-        (q-c-debug "testing the ASCII strings for %S" coding-system)
+          (Assert (eq t query-coding-succeeded))
+          (Assert (null query-coding-table)))
         (multiple-value-bind (query-coding-succeeded query-coding-table)
             (query-coding-string ascii-chars-string coding-system)
-          (unless (and (eq t query-coding-succeeded)
-                       (null query-coding-table))
-            (q-c-debug "(eq t query-coding-succeeded) %S, (\
-null query-coding-table) %S" (eq t query-coding-succeeded)
-                             (null query-coding-table)))
           (Assert (eq t query-coding-succeeded))
           (Assert (null query-coding-table))))
-      (q-c-debug "past the loop through the coding systems")
       (delete-region (point-min) (point-max))
       ;; Check for success from the two Latin-1 coding systems 
       (insert latin-1-chars-string)
-      (q-c-debug "point is now %S" (point))
       (multiple-value-bind (query-coding-succeeded query-coding-table)
           (query-coding-region (point-min) (point-max) 'iso-8859-1-unix)
         (Assert (eq t query-coding-succeeded))
         (Assert (null query-coding-table)))
-      (q-c-debug "point is now %S" (point))
       (multiple-value-bind (query-coding-succeeded query-coding-table)
           (query-coding-string (buffer-string) 'iso-8859-1-unix)
         (Assert (eq t query-coding-succeeded))
         (Assert (null query-coding-table)))
-      (q-c-debug "point is now %S" (point))
       (multiple-value-bind (query-coding-succeeded query-coding-table)
           (query-coding-string (buffer-string) 'iso-latin-1-with-esc-unix)
         (Assert (eq t query-coding-succeeded))
         (Assert (null query-coding-table)))
-      (q-c-debug "point is now %S" (point))
       ;; Make it fail, check that it fails correctly
       (insert (decode-char 'ucs #x20AC)) ;; EURO SIGN
       (multiple-value-bind (query-coding-succeeded query-coding-table)
           (query-coding-region (point-min) (point-max) 'iso-8859-1-unix)
-        (unless (and (null query-coding-succeeded)
-                     (equal query-coding-table
-                            #s(range-table type start-closed-end-open data
-                                           ((257 258) t))))
-          (q-c-debug "dealing with %S" 'iso-8859-1-unix)
-          (q-c-debug "query-coding-succeeded not null, query-coding-table \
-%S" query-coding-table))
         (Assert (null query-coding-succeeded))
         (Assert (equal query-coding-table
                        #s(range-table type start-closed-end-open data
＠＠ -153,12 +126,6 ＠＠ null query-coding-table) %S" (eq t query
         ;; Stupidly, this succeeds. The behaviour is compatible with
         ;; GNU, though, and we encourage people not to use
         ;; iso-latin-1-with-esc-unix anyway:
-
-        (unless (and query-coding-succeeded
-                     (null query-coding-table))
-          (q-c-debug "dealing with %S" 'iso-latin-1-with-esc-unix)
-          (q-c-debug "query-coding-succeeded %S, query-coding-table \
-%S" query-coding-succeeded query-coding-table))
         (Assert query-coding-succeeded)
         (Assert (null query-coding-table)))
       ;; Check that it errors correctly. 
＠＠ -186,13 +153,6 ＠＠ null query-coding-table) %S" (eq t query
       (insert ?\x80)
       (multiple-value-bind (query-coding-succeeded query-coding-table)
           (query-coding-region (point-min) (point-max) 'windows-1252-unix)
-        (unless (and (null query-coding-succeeded)
-                     (equal query-coding-table
-                            #s(range-table type start-closed-end-open data
-                                           ((257 258) t))))
-          (q-c-debug "dealing with %S" 'windows-1252-unix)
-          (q-c-debug "query-coding-succeeded not null, query-coding-table \
-%S" query-coding-table))
         (Assert (null query-coding-succeeded))
         (Assert (equal query-coding-table
                        #s(range-table type start-closed-end-open data
＠＠ -212,17 +172,6 ＠＠ null query-coding-table) %S" (eq t query
         (Assert (null query-coding-table)))
       (multiple-value-bind (query-coding-succeeded query-coding-table)
           (query-coding-region (point-min) (point-max) 'windows-1252-unix)
-        (unless (and (null query-coding-succeeded)
-                     (equal query-coding-table
-                            #s(range-table type start-closed-end-open
-                                           data ((129 131) t (132 133) t
-                                                 (139 140) t (141 146) t
-                                                 (155 156) t (157 161) t
-                                                 (162 170) t (173 176) t
-                                                 (178 187) t (189 192) t
-                                                 (193 257) t))))
-          (q-c-debug "query-coding-succeeded not null, query-coding-table \
-%S" query-coding-table))
         (Assert (null query-coding-succeeded))
         (Assert (equal query-coding-table
                        #s(range-table type start-closed-end-open
＠＠ -290,4 +239,68 ＠＠ null query-coding-table) %S" (eq t query
             (query-coding-region (point-min) 173 coding-system nil t)
           (text-conversion-error
            (setq text-conversion-error-signalled t)))
-        (Assert (null text-conversion-error-signalled))))))
+        (Assert (null text-conversion-error-signalled)))
+
+      ;; Now to test #'encode-coding-char. Most of the functionality was
+      ;; tested in the query-coding-region tests above, so we don't go into
+      ;; as much detail here.
+      (Assert (null (encode-coding-char
+                     (decode-char 'ucs #x20ac) 'iso-8859-1)))
+      (Assert (equal "\x80" (encode-coding-char 
+                             (decode-char 'ucs #x20ac) 'windows-1252)))
+      (delete-region (point-min) (point-max))
+
+      ;; And #'unencodable-char-position. 
+      (insert latin-1-chars-string)
+      (insert (decode-char 'ucs #x20ac))
+      (Assert (= 257 (unencodable-char-position (point-min) (point-max)
+                                                'iso-8859-1)))
+      (Assert (equal '(257) (unencodable-char-position (point-min) (point-max)
+                                                       'iso-8859-1 1)))
+      ;; Compatiblity, sigh: 
+      (Assert (equal '(257) (unencodable-char-position (point-min) (point-max)
+                                                       'iso-8859-1 0)))
+      (dotimes (i 6) (insert (decode-char 'ucs #x20ac)))
+      ;; Check if it stops at one:
+      (Assert (equal '(257) (unencodable-char-position (point-min) (point-max)
+                                                       'iso-8859-1 1)))
+      ;; Check if it stops at four:
+      (Assert (equal '(260 259 258 257)
+                     (unencodable-char-position (point-min) (point-max)
+                                                       'iso-8859-1 4)))
+      ;; Check whether it stops at seven: 
+      (Assert (equal '(263 262 261 260 259 258 257)
+                     (unencodable-char-position (point-min) (point-max)
+                                                       'iso-8859-1 7)))
+      ;; Check that it still stops at seven:
+      (Assert (equal '(263 262 261 260 259 258 257)
+                     (unencodable-char-position (point-min) (point-max)
+                                                       'iso-8859-1 2000)))
+      ;; Now, #'check-coding-systems-region. 
+      ;; UTF-8 should certainly be able to encode these characters:
+      (Assert (eq t (check-coding-systems-region (point-min) (point-max)
+                                                 '(utf-8))))
+      (Assert (equal '((iso-8859-1 257 258 259 260 261 262 263)
+                       (windows-1252 129 131 132 133 134 135 136 137 138 139
+                                     140 141 143 146 147 148 149 150 151 152
+                                     153 154 155 156 157 159 160))
+                       (sort
+                        (check-coding-systems-region (point-min) (point-max)
+                                                     '(utf-8 iso-8859-1
+                                                       windows-1252))
+                        ;; (The sort is to make the algorithm irrelevant.)
+                        #'(lambda (left right)
+                            (string< (car left) (car right))))))
+      ;; Ensure that the indices are all decreased by one when passed a
+      ;; string:
+      (Assert (equal '((iso-8859-1 256 257 258 259 260 261 262)
+                       (windows-1252 128 130 131 132 133 134 135 136 137 138
+                                     139 140 142 145 146 147 148 149 150 151
+                                     152 153 154 155 156 158 159))
+                     (sort
+                      (check-coding-systems-region (buffer-string) nil
+                                                   '(utf-8 iso-8859-1
+                                                     windows-1252))
+                      #'(lambda (left right)
+                          (string< (car left) (car right)))))))))
+


_______________________________________________
XEmacs-Patches mailing list
XEmacs-Patches(a)xemacs.org
http://calypso.tux.org/cgi-bin/mailman/listinfo/xemacs-patches

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

2008

2007

2006

2005

2004

2003

commit: Add check-coding-systems-region, test it and others, fix some bugs.