New subject: [COMMIT] (Revised2) Have VM figure out what MIME charset to use by coverage.

Friday, 1 April 2005

This version of the patch doesn’t load latin-unity or Mule-UCS. In the event
of some utf-8 coding system not being available, it still falls back to
iso-2022-jp. I think this is reasonable, if not ideal; Hrvoje, if you think
it isn’t, approve the patch and add a warning yourself that latin-unity and
perhaps Mule-UCS should be loaded to make it behave more ideally.

xemacs-packages/vm/ChangeLog addition:

2005-04-01  Aidan Kehoe  <kehoea(a)parhasard.net&gt;

	* vm-mime.el (vm-coding-system-priorities): New.
	* vm-mime.el (vm-get-coding-system-priorities): New.
	Add a list of preferred Mule coding systems for outgoing mail;
	write a function to access it, to allow us to have reasonable
	defaults--based on various packages' loaded state--in the event of
	it being nil.

	* vm-mime.el (vm-mime-ucs-list): New.
	* vm-mime.el (vm-get-mime-ucs-list): New.
	Add a list of Mule coding systems that can encode every character;
	use a function to access them, so we can check for utf-8 being
	available on lookup. 

	* vm-mime.el (vm-update-mime-charset-maps): New.
	Function to update the mule coding system -> Mime character set
	maps, called after load for un-define and latin-unity. 

        * vm-mime.el (vm-mime-charset-decode-region): If the TTY coding 
        system we're using can display a MIME charset, go ahead and decode 
        the message.  
        * vm-mime.el (vm-determine-proper-charset): Use latin-unity if 
        available, and other logic if not, to work out the appropriate 
        MIME character set to use for a region.  
        * vm-mime.el (vm-mime-tty-can-display-mime-charset): New. 
        * vm-mime.el (vm-mime-charset-internally-displayable-p): Add a 
        docstring, use vm-mime-tty-can-display-mime-charset now it's 
        available. 

XEmacs Packages source patch:
Diff command:   cvs -q diff -u
Files affected: xemacs-packages/vm/vm-mime.el

Index: xemacs-packages/vm/vm-mime.el
===================================================================
RCS file: /pack/xemacscvs/XEmacs/packages/xemacs-packages/vm/vm-mime.el,v
retrieving revision 1.29
diff -u -u -r1.29 vm-mime.el
--- xemacs-packages/vm/vm-mime.el	2003/09/03 00:43:45	1.29
+++ xemacs-packages/vm/vm-mime.el	2005/04/01 02:06:41
＠＠ -33,6 +33,80 ＠＠
   (put 'vm-mime-error 'error-conditions '(vm-mime-error error))
   (put 'vm-mime-error 'error-message "MIME error"))

+;; A lot of the more complicated MIME character set processing is only
+;; practical under MULE.
+
+(eval-when-compile 
+  (defvar vm-coding-system-priorities)
+  (defvar vm-mime-ucs-list)
+  (defvar latin-unity-ucs-list))
+
+(defcustom vm-coding-system-priorities nil
+  "*List of coding systems for VM to use, for outgoing mail, in order of
+preference. ")
+
+(defun vm-get-coding-system-priorities ()
+  "Return the value of `vm-coding-system-priorities', or a reasonable
+default for it if it's nil.  "
+  (if vm-coding-system-priorities
+      vm-coding-system-priorities
+    (let ((res '(iso-8859-1 iso-8859-2 iso-8859-15 iso-8859-15 utf-8)))
+      (dolist (list-item res)
+	;; Assumes iso-8859-1 is always available, which is reasonable.
+	(unless (coding-system-p (find-coding-system list-item))
+	  (delq list-item res)))
+      res)))
+
+(defcustom vm-mime-ucs-list nil 
+  "*List of coding systems that can encode all chars emacs knows.")
+
+(defun vm-get-mime-ucs-list ()
+  "Return the value of `vm-mime-ucs-list', or a reasonable default for it if
+it's nil.  This is used instead of `vm-mime-ucs-list' directly in order to
+allow runtime checks for optional features like `mule-ucs' or
+`latin-unity'.  "
+  (if vm-mime-ucs-list
+      vm-mime-ucs-list
+    (if (featurep 'latin-unity)
+	latin-unity-ucs-list
+      (if (coding-system-p (find-coding-system 'utf-8))
+	  '(utf-8 iso-2022-jp ctext escape-quoted)
+	'(iso-2022-jp ctext escape-quoted)))))
+
+(defun vm-update-mime-charset-maps ()
+  "Check for the presence of certain Mule coding systems, and add
+information about the corresponding MIME character sets to VM's
+configuration.  "
+  ;; Add some extra charsets that may not have been defined onto the end
+  ;; of vm-mime-mule-charset-to-coding-alist.
+  (mapcar (lambda (x)
+	    (and (coding-system-p (find-coding-system x))
+		 ;; Not using vm-string-assoc because of some quoting
+		 ;; weirdness it's doing. 
+		 (if (not (assoc
+			   (format "%s" x)
+			   vm-mime-mule-charset-to-coding-alist))
+		     (add-to-list 'vm-mime-mule-charset-to-coding-alist 
+				  (list (format "%s" x) x)))))
+	  '(utf-8 iso-8859-15 iso-8859-14 iso-8859-16))
+
+  ;; And make sure that the map back from coding-systems is good for
+  ;; those charsets.
+  (mapcar (lambda (x)
+	    (or (assoc (car (cdr x)) vm-mime-mule-coding-to-charset-alist)
+		(add-to-list 'vm-mime-mule-coding-to-charset-alist
+			     (list (car (cdr x)) (car x)))))
+	  vm-mime-mule-charset-to-coding-alist)
+  ;; Whoops, doesn't get picked up for some reason. 
+  (add-to-list 'vm-mime-mule-coding-to-charset-alist 
+	       '(iso-8859-1 "iso-8859-1")))
+
+(when vm-xemacs-mule-p
+  (unless (coding-system-p (find-coding-system 'utf-8))
+    (eval-after-load "un-define" `(vm-update-mime-charset-maps)))
+  (unless (featurep 'latin-unity)
+    (eval-after-load "latin-unity" `(vm-update-mime-charset-maps))))
+
 (defun vm-make-layout (&rest plist)
   (vector
    (plist-get plist 'type)
＠＠ -236,6 +310,7 ＠＠
   (cond ((or vm-xemacs-mule-p vm-fsfemacs-mule-p)
 	 (if (or (and vm-xemacs-p (memq (device-type) '(x gtk mswindows)))
 		 vm-fsfemacs-p
+		 (vm-mime-tty-can-display-mime-charset charset)
 		 nil)
 	     (let ((buffer-read-only nil)
 		   (cell (cdr (vm-string-assoc
＠＠ -1294,34 +1369,164 ＠＠

 (defvar buffer-file-coding-system)

+;; Possible further work; integrate with the FSF's unify-8859-on-encoding-mode
+;; stuff.
+
 (defun vm-determine-proper-charset (beg end)
+  "Work out what MIME character set to use for sending a message. 
+
+Uses `us-ascii' if the message is entirely ASCII compatible.  If MULE is not
+available, and the message contains contains non-ASCII characters, consults
+the variable `vm-mime-8bit-composition-charset' or uses `iso-8859-1.' if
+that is nil.
+
+Under MULE, `vm-coding-system-priorities' is searched, in order, for a
+coding system that will encode all the characters in the message. If none is
+found, `iso-2022-jp' is used, which will preserve information for all the
+character sets of which Emacs is aware--at the expense of being incompatible
+with the recipient's software, if that recipient is outside of East Asia."
   (save-excursion
     (save-restriction
       (narrow-to-region beg end)
-      (catch 'done
-	(goto-char (point-min))
 	(if (or vm-xemacs-mule-p 
 		(and vm-fsfemacs-mule-p enable-multibyte-characters))
-	    (let ((charsets (delq 'ascii (vm-charsets-in-region
-					  (point-min) (point-max)))))
-	      (cond ((null charsets)
-		     "us-ascii")
-		    ((cdr charsets)
-		     (or (car (cdr
-			       (assq (vm-coding-system-name
-				      buffer-file-coding-system)
-				     vm-mime-mule-coding-to-charset-alist)))
-			 "iso-2022-jp"))
-		    (t
-		     (or (car (cdr
-			       (assoc
-				(car charsets)
-				vm-mime-mule-charset-to-charset-alist)))
-			 "unknown"))))
-	  (and (re-search-forward "[^\000-\177]" nil t)
-	       (throw 'done (or vm-mime-8bit-composition-charset
-				"iso-8859-1")))
-	  (throw 'done vm-mime-7bit-composition-charset))))))
+	    ;; Okay, we're on a MULE build. 
+	    (let ((charsets (delq 'ascii
+				  (vm-charsets-in-region (point-min)
+							 (point-max)))))
+	      (cond 
+	       ;; No non-ASCII chars? Right, that makes it easy for us.
+	       ((null charsets) "us-ascii")
+
+	       ;; Check whether the buffer can be encoded using one of the
+	       ;; vm-coding-system-priorities coding systems.
+	       ((catch 'done
+
+		  ;; We can't really do this intelligently unless latin-unity
+		  ;; is available.
+		  (if (featurep 'latin-unity)
+		      (let ((csetzero charsets)
+			    ;; Check what latin character sets are in the
+			    ;; buffer.
+			    (csets (latin-unity-representations-feasible-region
+				    beg end))
+			    (psets (latin-unity-representations-present-region
+				    beg end))
+			    (systems (vm-get-coding-system-priorities)))
+
+			;; If one of the character sets is outside of latin
+			;; unity's remit, check for a universal character
+			;; set in vm-coding-system-priorities, and pass back
+			;; the first one.
+			;;
+			;; Otherwise, there's no remapping that latin unity
+			;; can do for us, and we should default to something
+			;; iso-2022 based. (Since we're not defaulting to
+			;; Unicode, at the moment.)
+
+			(while csetzero
+			  (if (not (memq 
+				    (car csetzero) latin-unity-character-sets))
+			      (let ((ucs-list (vm-get-mime-ucs-list))
+				    (preapproved
+				     (vm-get-coding-system-priorities)))
+				(while preapproved
+				  (if (memq (car preapproved) ucs-list)
+				      (throw 'done 
+					     (car (cdr (assq 
+							(vm-coding-system-name 
+							 (car preapproved))
+				      vm-mime-mule-coding-to-charset-alist)))))
+				  (setq preapproved (cdr preapproved)))
+				;; Nothing universal in the preapproved list.
+				(throw 'done nil)))
+			  (setq csetzero (cdr csetzero)))
+
+			;; Okay, we're able to remap using latin-unity. Do so.
+			(while systems
+			  (let ((sys (latin-unity-massage-name (car systems)
+					       'buffer-default)))
+			    (when (latin-unity-maybe-remap (point-min) 
+							   (point-max) sys 
+							   csets psets t)
+			      (throw 'done (second (assq 
+						    (vm-coding-system-name sys)
+				    vm-mime-mule-coding-to-charset-alist)))))
+			  (setq systems (cdr systems)))
+			(throw 'done nil))
+
+		    ;; Right, latin-unity isn't available.  If there's only
+		    ;; one non-ASCII character set in the region, and the
+		    ;; corresponding coding system is on the preapproved
+		    ;; list before the first universal character set, pass
+		    ;; it back. Otherwise, if a universal character set is
+		    ;; on the preapproved list, pass the first one of them
+		    ;; back. Otherwise, pass back nil and use the
+		    ;; "iso-2022-jp" entry below.
+
+		    (let ((csetzero charsets)
+			  (preapproved (vm-get-coding-system-priorities))
+			  (ucs-list (vm-get-mime-ucs-list)))
+		      (if (null (cdr csetzero))
+			  (while preapproved 
+			    ;; If we encounter a universal character set on
+			    ;; the preapproved list, pass it back.
+			    (if (memq (car preapproved) ucs-list)
+				(throw 'done (second (assq 
+						      (vm-coding-system-name
+						       (car preapproved))
+				     vm-mime-mule-coding-to-charset-alist))))
+
+			    ;; The preapproved entry isn't universal. Check if
+			    ;; it's related to the single non-ASCII MULE
+			    ;; charset in the buffer (that is, if the
+			    ;; conceptually unordered MULE list of characters
+			    ;; is based on a corresponding ISO character set,
+			    ;; and thus the ordered ISO character set can
+			    ;; encode all the characters in the MIME charset.)
+			    ;;
+			    ;; The string equivalence test is used because we
+			    ;; don't have another mapping that is useful
+			    ;; here. Nnngh.
+
+			    (if (string= 
+				 (car (cdr (assoc (car csetzero)
+				   vm-mime-mule-charset-to-charset-alist)))
+				 (car (cdr (assoc (car preapproved)
+				   vm-mime-mule-coding-to-charset-alist))))
+				(throw 'done 
+				       (car (cdr (assoc (car csetzero)
+				    vm-mime-mule-charset-to-charset-alist)))))
+			    (setq preapproved (cdr preapproved)))
+
+			;; Okay, there's more than one MULE character set in
+			;; the buffer. Check for a universal entry in the
+			;; preapproved list; if it exists pass it back,
+			;; otherwise fall through to the iso-2022-jp below,
+			;; because nothing on the preapproved list is
+			;; appropriate.
+
+			(while preapproved 
+			    ;; If we encounter a universal character set on
+			    ;; the preapproved list, pass it back.
+			    (when (memq (car preapproved) ucs-list)
+				(throw 'done (second (assq 
+						      (vm-coding-system-name
+						       (car preapproved))
+				     vm-mime-mule-coding-to-charset-alist))))
+			    (setq preapproved (cdr preapproved)))))
+		    (throw 'done nil))))
+	       ;; Couldn't do any magic with vm-coding-system-priorities. Pass
+	       ;; back a Japanese iso-2022 MIME character set.
+	       (t "iso-2022-jp")))
+	  ;; If we're non-MULE and there are eight bit characters, use a
+	  ;; sensible default.
+	  (goto-char (point-min))
+	  (if (re-search-forward "[^\000-\177]" nil t)
+	       (or vm-mime-8bit-composition-charset "iso-8859-1")
+	  ;; We're non-MULE and there are purely 7bit characters in the
+	  ;; region. Return vm-mime-7bit-c-c.
+	  vm-mime-7bit-composition-charset)))))

 (defun vm-determine-proper-content-transfer-encoding (beg end)
   (save-excursion
＠＠ -4059,7 +4264,39 ＠＠
   (or (vm-mime-types-match "text" (car (vm-mm-layout-type layout)))
       (vm-mime-types-match "message" (car (vm-mm-layout-type layout)))))

+
+(defun vm-mime-tty-can-display-mime-charset (name)
+  "Can the current TTY correctly display the given MIME character set?"
+  (and (fboundp 'console-tty-output-coding-system)
+       ;; Is this check too paranoid?
+       (coding-system-p (console-tty-output-coding-system))
+       (let 
+	   ;; Nnngh, latin-unity-base-name isn't doing the right thing for
+	   ;; me with MULE-UCS and UTF-8 as the terminal coding system. Of
+	   ;; course, it's not evident that it _can_ do the right thing.
+	   ;;
+	   ;; The intention is that ourtermcs is the version of the
+	   ;; coding-system without line-ending information attached to its
+	   ;; end.
+	   ((ourtermcs (or (car 
+			    (coding-system-get
+			     (console-tty-output-coding-system)
+			     'alias-coding-systems))
+			   (console-tty-output-coding-system))))
+	 (or (eq ourtermcs (car 
+			    (cdr 
+			     (vm-string-assoc 
+			      name vm-mime-mule-charset-to-coding-alist))))
+	     ;; The vm-mime-mule-charset-to-coding-alist check is to make
+	     ;; sure it does the right thing with a nonsense MIME character
+	     ;; set name.
+	     (and (memq ourtermcs vm-mime-ucs-list)
+		  (vm-string-assoc name vm-mime-mule-charset-to-coding-alist) 
+		  t)
+	     (vm-mime-default-face-charset-p name)))))
+
 (defun vm-mime-charset-internally-displayable-p (name)
+  "Can the given MIME charset be displayed within emacs by by VM?"
   (cond ((and vm-xemacs-mule-p (memq (device-type) '(x gtk mswindows)))
 	 (or (vm-string-assoc name vm-mime-mule-charset-to-coding-alist)
 	     (vm-mime-default-face-charset-p name)))
＠＠ -4069,6 +4306,13 ＠＠
 	((vm-multiple-fonts-possible-p)
 	 (or (vm-mime-default-face-charset-p name)
 	     (vm-string-assoc name vm-mime-charset-font-alist)))
+
+	;; If the terminal-coding-system variable is set to something that
+	;; can encode all the characters of the given MIME character set,
+	;; then we can display any message in the given MIME character set
+	;; internally.
+
+	((vm-mime-tty-can-display-mime-charset name))
 	(t
 	 (vm-mime-default-face-charset-p name))))

＠＠ -4975,10 +5219,32 ＠＠
 	   (setq buffer-undo-list (primitive-undo 1 buffer-undo-list))))))

 (defvar enriched-mode)
+
+;; Non-XEmacs specific changes to this function should be made to
+;; vm-mime-fsfemacs-encode-composition as well.

-;; Non-XEmacs specific changes to this function should be
-;; made to vm-mime-fsfemacs-encode-composition as well.
 (defun vm-mime-xemacs-encode-composition ()
+  "Encode the current message using MIME.
+
+The Multipurpose Internet Message Extensions extend the original format of
+Internet mail to allow non-US-ASCII textual messages, non-textual messages,
+multipart message bodies, and non-US-ASCII information in message headers.
+
+This function chooses the MIME character set(s) to use, and transforms the
+message content from the XEmacs-internal encoding to the corresponding
+octets in that MIME character set.
+
+It then applies some transfer encoding to the message. For details of the
+transfer encodings available, see the documentation for
+`vm-mime-8bit-text-transfer-encoding.'
+
+Finally, it creates the headers that are necessary to identify the message
+as one that uses MIME.
+
+Under MULE, it explicitly sets `buffer-file-coding-system' to a binary
+(no-transformation) coding system, to avoid further transformation of the
+message content when it's passed to the MTA (that is, the mail transfer
+agent; under Unix, normally sendmail.)"
   (save-restriction
     (widen)
     (if (not (eq major-mode 'mail-mode))
＠＠ -4992,6 +5258,8 ＠＠
 	  forward-local-refs already-mimed layout e e-list boundary
 	  type encoding charset params description disposition object
 	  opoint-min)
+      ;; Make sure we don't double encode UTF-8 (for example) text.
+      (setq buffer-file-coding-system (vm-binary-coding-system))
       (mail-text)
       (setq e-list (extent-list nil (point) (point-max))
 	    e-list (vm-delete (function
＠＠ -5020,11 +5288,26 ＠＠
 	    (if enriched
 		(let ((enriched-initial-annotation ""))
 		  (enriched-encode (point-min) (point-max))))
+
 	    (setq charset (vm-determine-proper-charset (point-min)
 						       (point-max)))
 	    (if vm-xemacs-mule-p
-		(encode-coding-region (point-min) (point-max)
-				      buffer-file-coding-system))
+		(encode-coding-region 
+		 (point-min) (point-max)
+
+		 ;; What about the case where vm-m-m-c-t-c-a doesn't have an
+		 ;; entry for the given charset? That shouldn't happen, if
+		 ;; vm-mime-mule-coding-to-charset-alist and
+		 ;; vm-mime-mule-charset-to-coding-alist have complete and
+		 ;; matching entries. Admittedly this last is not a
+		 ;; given. Should we make it so on startup? (By setting the
+		 ;; key for any missing entries in
+		 ;; vm-mime-mule-coding-to-charset-alist to being (format
+		 ;; "%s" coding-system), if necessary.)
+
+		 (car (cdr (vm-string-assoc 
+			    charset vm-mime-mule-charset-to-coding-alist)))))
+
 	    (setq encoding (vm-determine-proper-content-transfer-encoding
 			    (point-min)
 			    (point-max))
＠＠ -5058,8 +5341,23 ＠＠
 	    (setq charset (vm-determine-proper-charset (point-min)
 						       (point-max)))
 	    (if vm-xemacs-mule-p
-		(encode-coding-region (point-min) (point-max)
-				      buffer-file-coding-system))
+		(encode-coding-region 
+		 (point-min) (point-max)
+
+		 ;; What about the case where vm-m-m-c-t-c-a doesn't have an
+		 ;; entry for the given charset? That shouldn't happen, if
+		 ;; vm-mime-mule-coding-to-charset-alist and
+		 ;; vm-mime-mule-charset-to-coding-alist have complete and
+
+		 ;; matching entries. Admittedly this last is not a
+		 ;; given. Should we make it so on startup? (By setting the
+		 ;; key for any missing entries in
+		 ;; vm-mime-mule-coding-to-charset-alist to being (format
+		 ;; "%s" coding-system), if necessary.)
+
+		 (car (cdr (vm-string-assoc 
+			    charset vm-mime-mule-charset-to-coding-alist)))))
+
 	    (setq encoding (vm-determine-proper-content-transfer-encoding
 			    (point-min)
 			    (point-max))
＠＠ -5220,8 +5518,22 ＠＠
 	  (setq charset (vm-determine-proper-charset (point)
 						     (point-max)))
 	  (if vm-xemacs-mule-p
-	      (encode-coding-region (point) (point-max)
-				    buffer-file-coding-system))
+	      (encode-coding-region 
+	       (point) (point-max)
+
+	       ;; What about the case where vm-m-m-c-t-c-a doesn't have an
+	       ;; entry for the given charset? That shouldn't happen, if
+	       ;; vm-mime-mule-coding-to-charset-alist and
+	       ;; vm-mime-mule-charset-to-coding-alist have complete and
+	       ;; matching entries. Admittedly this last is not a
+	       ;; given. Should we make it so on startup? (By setting the
+	       ;; key for any missing entries in
+	       ;; vm-mime-mule-coding-to-charset-alist to being (format "%s"
+	       ;; coding-system), if necessary.)
+
+		 (car (cdr (vm-string-assoc 
+			    charset vm-mime-mule-charset-to-coding-alist)))))
+
 	  (setq encoding (vm-determine-proper-content-transfer-encoding
 			  (point)
 			  (point-max))

-- 
“I, for instance, am gung-ho about open source because my family is being
held hostage in Rob Malda’s basement. But who fact-checks me, or Enderle,
when we say something in public? No-one!” -- Danny O’Brien

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

2008

2007

2006

2005

2004

2003

[PATCH] (Revised) Have VM figure out what MIME charset to use by coverage