]> code.delx.au - gnu-emacs/blob - lisp/gnus/rfc2047.el
Merge from emacs--devo--0
[gnu-emacs] / lisp / gnus / rfc2047.el
1 ;;; rfc2047.el --- functions for encoding and decoding rfc2047 messages
2
3 ;; Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004,
4 ;; 2005, 2006, 2007 Free Software Foundation, Inc.
5
6 ;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org>
7 ;; MORIOKA Tomohiko <morioka@jaist.ac.jp>
8 ;; This file is part of GNU Emacs.
9
10 ;; GNU Emacs is free software; you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation; either version 2, or (at your option)
13 ;; any later version.
14
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
19
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the
22 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
23 ;; Boston, MA 02110-1301, USA.
24
25 ;;; Commentary:
26
27 ;; RFC 2047 is "MIME (Multipurpose Internet Mail Extensions) Part
28 ;; Three: Message Header Extensions for Non-ASCII Text".
29
30 ;;; Code:
31
32 (eval-when-compile
33 (require 'cl)
34 (defvar message-posting-charset)
35 (unless (fboundp 'with-syntax-table) ; not in Emacs 20
36 (defmacro with-syntax-table (table &rest body)
37 "Evaluate BODY with syntax table of current buffer set to TABLE.
38 The syntax table of the current buffer is saved, BODY is evaluated, and the
39 saved table is restored, even in case of an abnormal exit.
40 Value is what BODY returns."
41 (let ((old-table (make-symbol "table"))
42 (old-buffer (make-symbol "buffer")))
43 `(let ((,old-table (syntax-table))
44 (,old-buffer (current-buffer)))
45 (unwind-protect
46 (progn
47 (set-syntax-table ,table)
48 ,@body)
49 (save-current-buffer
50 (set-buffer ,old-buffer)
51 (set-syntax-table ,old-table))))))))
52
53 (require 'qp)
54 (require 'mm-util)
55 (require 'ietf-drums)
56 ;; Fixme: Avoid this (used for mail-parse-charset) mm dependence on gnus.
57 (require 'mail-prsvr)
58 (require 'base64)
59 (autoload 'mm-body-7-or-8 "mm-bodies")
60
61 (eval-and-compile
62 ;; Avoid gnus-util for mm- code.
63 (defalias 'rfc2047-point-at-bol
64 (if (fboundp 'point-at-bol)
65 'point-at-bol
66 'line-beginning-position))
67
68 (defalias 'rfc2047-point-at-eol
69 (if (fboundp 'point-at-eol)
70 'point-at-eol
71 'line-end-position)))
72
73 (defvar rfc2047-header-encoding-alist
74 '(("Newsgroups" . nil)
75 ("Followup-To" . nil)
76 ("Message-ID" . nil)
77 ("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|\\(In-\\)?Reply-To\\|Sender\
78 \\|Mail-Followup-To\\|Mail-Copies-To\\|Approved\\)" . address-mime)
79 (t . mime))
80 "*Header/encoding method alist.
81 The list is traversed sequentially. The keys can either be
82 header regexps or t.
83
84 The values can be:
85
86 1) nil, in which case no encoding is done;
87 2) `mime', in which case the header will be encoded according to RFC2047;
88 3) `address-mime', like `mime', but takes account of the rules for address
89 fields (where quoted strings and comments must be treated separately);
90 4) a charset, in which case it will be encoded as that charset;
91 5) `default', in which case the field will be encoded as the rest
92 of the article.")
93
94 (defvar rfc2047-charset-encoding-alist
95 '((us-ascii . nil)
96 (iso-8859-1 . Q)
97 (iso-8859-2 . Q)
98 (iso-8859-3 . Q)
99 (iso-8859-4 . Q)
100 (iso-8859-5 . B)
101 (koi8-r . B)
102 (iso-8859-7 . B)
103 (iso-8859-8 . B)
104 (iso-8859-9 . Q)
105 (iso-8859-14 . Q)
106 (iso-8859-15 . Q)
107 (iso-2022-jp . B)
108 (iso-2022-kr . B)
109 (gb2312 . B)
110 (gbk . B)
111 (gb18030 . B)
112 (big5 . B)
113 (cn-big5 . B)
114 (cn-gb . B)
115 (cn-gb-2312 . B)
116 (euc-kr . B)
117 (iso-2022-jp-2 . B)
118 (iso-2022-int-1 . B)
119 (viscii . Q))
120 "Alist of MIME charsets to RFC2047 encodings.
121 Valid encodings are nil, `Q' and `B'. These indicate binary (no) encoding,
122 quoted-printable and base64 respectively.")
123
124 (defvar rfc2047-encode-function-alist
125 '((Q . rfc2047-q-encode-string)
126 (B . rfc2047-b-encode-string)
127 (nil . identity))
128 "Alist of RFC2047 encodings to encoding functions.")
129
130 (defvar rfc2047-encode-encoded-words t
131 "Whether encoded words should be encoded again.")
132
133 ;;;
134 ;;; Functions for encoding RFC2047 messages
135 ;;;
136
137 (defun rfc2047-qp-or-base64 ()
138 "Return the type with which to encode the buffer.
139 This is either `base64' or `quoted-printable'."
140 (save-excursion
141 (let ((limit (min (point-max) (+ 2000 (point-min))))
142 (n8bit 0))
143 (goto-char (point-min))
144 (skip-chars-forward "\x20-\x7f\r\n\t" limit)
145 (while (< (point) limit)
146 (incf n8bit)
147 (forward-char 1)
148 (skip-chars-forward "\x20-\x7f\r\n\t" limit))
149 (if (or (< (* 6 n8bit) (- limit (point-min)))
150 ;; Don't base64, say, a short line with a single
151 ;; non-ASCII char when splitting parts by charset.
152 (= n8bit 1))
153 'quoted-printable
154 'base64))))
155
156 (defun rfc2047-narrow-to-field ()
157 "Narrow the buffer to the header on the current line."
158 (beginning-of-line)
159 (narrow-to-region
160 (point)
161 (progn
162 (forward-line 1)
163 (if (re-search-forward "^[^ \n\t]" nil t)
164 (rfc2047-point-at-bol)
165 (point-max))))
166 (goto-char (point-min)))
167
168 (defun rfc2047-field-value ()
169 "Return the value of the field at point."
170 (save-excursion
171 (save-restriction
172 (rfc2047-narrow-to-field)
173 (re-search-forward ":[ \t\n]*" nil t)
174 (buffer-substring-no-properties (point) (point-max)))))
175
176 (defun rfc2047-quote-special-characters-in-quoted-strings (&optional
177 encodable-regexp)
178 "Quote special characters with `\\'s in quoted strings.
179 Quoting will not be done in a quoted string if it contains characters
180 matching ENCODABLE-REGEXP."
181 (goto-char (point-min))
182 (let ((tspecials (concat "[" ietf-drums-tspecials "]"))
183 beg end)
184 (with-syntax-table (standard-syntax-table)
185 (while (search-forward "\"" nil t)
186 (setq beg (match-beginning 0))
187 (unless (eq (char-before beg) ?\\)
188 (goto-char beg)
189 (setq beg (1+ beg))
190 (condition-case nil
191 (progn
192 (forward-sexp)
193 (setq end (1- (point)))
194 (goto-char beg)
195 (if (and encodable-regexp
196 (re-search-forward encodable-regexp end t))
197 (goto-char (1+ end))
198 (save-restriction
199 (narrow-to-region beg end)
200 (while (re-search-forward tspecials nil 'move)
201 (if (eq (char-before) ?\\)
202 (if (looking-at tspecials) ;; Already quoted.
203 (forward-char)
204 (insert "\\"))
205 (goto-char (match-beginning 0))
206 (insert "\\")
207 (forward-char))))
208 (forward-char)))
209 (error
210 (goto-char beg))))))))
211
212 (defvar rfc2047-encoding-type 'address-mime
213 "The type of encoding done by `rfc2047-encode-region'.
214 This should be dynamically bound around calls to
215 `rfc2047-encode-region' to either `mime' or `address-mime'. See
216 `rfc2047-header-encoding-alist', for definitions.")
217
218 (defun rfc2047-encode-message-header ()
219 "Encode the message header according to `rfc2047-header-encoding-alist'.
220 Should be called narrowed to the head of the message."
221 (interactive "*")
222 (save-excursion
223 (goto-char (point-min))
224 (let (alist elem method)
225 (while (not (eobp))
226 (save-restriction
227 (rfc2047-narrow-to-field)
228 (setq method nil
229 alist rfc2047-header-encoding-alist)
230 (while (setq elem (pop alist))
231 (when (or (and (stringp (car elem))
232 (looking-at (car elem)))
233 (eq (car elem) t))
234 (setq alist nil
235 method (cdr elem))))
236 (if (not (rfc2047-encodable-p))
237 (prog2
238 (when (eq method 'address-mime)
239 (rfc2047-quote-special-characters-in-quoted-strings))
240 (if (and (eq (mm-body-7-or-8) '8bit)
241 (mm-multibyte-p)
242 (mm-coding-system-p
243 (car message-posting-charset)))
244 ;; 8 bit must be decoded.
245 (mm-encode-coding-region
246 (point-min) (point-max)
247 (mm-charset-to-coding-system
248 (car message-posting-charset))))
249 ;; No encoding necessary, but folding is nice
250 (when nil
251 (rfc2047-fold-region
252 (save-excursion
253 (goto-char (point-min))
254 (skip-chars-forward "^:")
255 (when (looking-at ": ")
256 (forward-char 2))
257 (point))
258 (point-max))))
259 ;; We found something that may perhaps be encoded.
260 (re-search-forward "^[^:]+: *" nil t)
261 (cond
262 ((eq method 'address-mime)
263 (rfc2047-encode-region (point) (point-max)))
264 ((eq method 'mime)
265 (let ((rfc2047-encoding-type 'mime))
266 (rfc2047-encode-region (point) (point-max))))
267 ((eq method 'default)
268 (if (and (featurep 'mule)
269 (if (boundp 'default-enable-multibyte-characters)
270 default-enable-multibyte-characters)
271 mail-parse-charset)
272 (mm-encode-coding-region (point) (point-max)
273 mail-parse-charset)))
274 ;; We get this when CC'ing messsages to newsgroups with
275 ;; 8-bit names. The group name mail copy just got
276 ;; unconditionally encoded. Previously, it would ask
277 ;; whether to encode, which was quite confusing for the
278 ;; user. If the new behaviour is wrong, tell me. I have
279 ;; left the old code commented out below.
280 ;; -- Per Abrahamsen <abraham@dina.kvl.dk> Date: 2001-10-07.
281 ;; Modified by Dave Love, with the commented-out code changed
282 ;; in accordance with changes elsewhere.
283 ((null method)
284 (rfc2047-encode-region (point) (point-max)))
285 ;;; ((null method)
286 ;;; (if (or (message-options-get
287 ;;; 'rfc2047-encode-message-header-encode-any)
288 ;;; (message-options-set
289 ;;; 'rfc2047-encode-message-header-encode-any
290 ;;; (y-or-n-p
291 ;;; "Some texts are not encoded. Encode anyway?")))
292 ;;; (rfc2047-encode-region (point-min) (point-max))
293 ;;; (error "Cannot send unencoded text")))
294 ((mm-coding-system-p method)
295 (if (and (featurep 'mule)
296 (if (boundp 'default-enable-multibyte-characters)
297 default-enable-multibyte-characters))
298 (mm-encode-coding-region (point) (point-max) method)))
299 ;; Hm.
300 (t)))
301 (goto-char (point-max)))))))
302
303 ;; Fixme: This, and the require below may not be the Right Thing, but
304 ;; should be safe just before release. -- fx 2001-02-08
305 (eval-when-compile (defvar message-posting-charset))
306
307 (defun rfc2047-encodable-p ()
308 "Return non-nil if any characters in current buffer need encoding in headers.
309 The buffer may be narrowed."
310 (require 'message) ; for message-posting-charset
311 (let ((charsets
312 (mm-find-mime-charset-region (point-min) (point-max))))
313 (goto-char (point-min))
314 (or (and rfc2047-encode-encoded-words
315 (prog1
316 (search-forward "=?" nil t)
317 (goto-char (point-min))))
318 (and charsets
319 (not (equal charsets (list (car message-posting-charset))))))))
320
321 ;; Use this syntax table when parsing into regions that may need
322 ;; encoding. Double quotes are string delimiters, backslash is
323 ;; character quoting, and all other RFC 2822 special characters are
324 ;; treated as punctuation so we can use forward-sexp/forward-word to
325 ;; skip to the end of regions appropriately. Nb. ietf-drums does
326 ;; things differently.
327 (defconst rfc2047-syntax-table
328 ;; (make-char-table 'syntax-table '(2)) only works in Emacs.
329 (let ((table (make-syntax-table)))
330 ;; The following is done to work for setting all elements of the table
331 ;; in Emacs 21-23 and XEmacs; it appears to be the cleanest way.
332 ;; Play safe and don't assume the form of the word syntax entry --
333 ;; copy it from ?a.
334 (if (fboundp 'set-char-table-range) ; Emacs
335 (funcall (intern "set-char-table-range")
336 table t (aref (standard-syntax-table) ?a))
337 (if (fboundp 'put-char-table)
338 (if (fboundp 'get-char-table) ; warning avoidance
339 (put-char-table t (get-char-table ?a (standard-syntax-table))
340 table))))
341 (modify-syntax-entry ?\\ "\\" table)
342 (modify-syntax-entry ?\" "\"" table)
343 (modify-syntax-entry ?\( "(" table)
344 (modify-syntax-entry ?\) ")" table)
345 (modify-syntax-entry ?\< "." table)
346 (modify-syntax-entry ?\> "." table)
347 (modify-syntax-entry ?\[ "." table)
348 (modify-syntax-entry ?\] "." table)
349 (modify-syntax-entry ?: "." table)
350 (modify-syntax-entry ?\; "." table)
351 (modify-syntax-entry ?, "." table)
352 (modify-syntax-entry ?@ "." table)
353 table))
354
355 (defun rfc2047-encode-region (b e)
356 "Encode words in region B to E that need encoding.
357 By default, the region is treated as containing RFC2822 addresses.
358 Dynamically bind `rfc2047-encoding-type' to change that."
359 (save-restriction
360 (narrow-to-region b e)
361 (let ((encodable-regexp (if rfc2047-encode-encoded-words
362 "[^\000-\177]+\\|=\\?"
363 "[^\000-\177]+"))
364 start ; start of current token
365 end begin csyntax
366 ;; Whether there's an encoded word before the current token,
367 ;; either immediately or separated by space.
368 last-encoded
369 (orig-text (buffer-substring-no-properties b e)))
370 (if (eq 'mime rfc2047-encoding-type)
371 ;; Simple case. Continuous words in which all those contain
372 ;; non-ASCII characters are encoded collectively. Encoding
373 ;; ASCII words, including `Re:' used in Subject headers, is
374 ;; avoided for interoperability with non-MIME clients and
375 ;; for making it easy to find keywords.
376 (progn
377 (goto-char (point-min))
378 (while (progn (skip-chars-forward " \t\n")
379 (not (eobp)))
380 (setq start (point))
381 (while (and (looking-at "[ \t\n]*\\([^ \t\n]+\\)")
382 (progn
383 (setq end (match-end 0))
384 (re-search-forward encodable-regexp end t)))
385 (goto-char end))
386 (if (> (point) start)
387 (rfc2047-encode start (point))
388 (goto-char end))))
389 ;; `address-mime' case -- take care of quoted words, comments.
390 (rfc2047-quote-special-characters-in-quoted-strings encodable-regexp)
391 (with-syntax-table rfc2047-syntax-table
392 (goto-char (point-min))
393 (condition-case err ; in case of unbalanced quotes
394 ;; Look for rfc2822-style: sequences of atoms, quoted
395 ;; strings, specials, whitespace. (Specials mustn't be
396 ;; encoded.)
397 (while (not (eobp))
398 ;; Skip whitespace.
399 (skip-chars-forward " \t\n")
400 (setq start (point))
401 (cond
402 ((not (char-after))) ; eob
403 ;; else token start
404 ((eq ?\" (setq csyntax (char-syntax (char-after))))
405 ;; Quoted word.
406 (forward-sexp)
407 (setq end (point))
408 ;; Does it need encoding?
409 (goto-char start)
410 (if (re-search-forward encodable-regexp end 'move)
411 ;; It needs encoding. Strip the quotes first,
412 ;; since encoded words can't occur in quotes.
413 (progn
414 (goto-char end)
415 (delete-backward-char 1)
416 (goto-char start)
417 (delete-char 1)
418 (when last-encoded
419 ;; There was a preceding quoted word. We need
420 ;; to include any separating whitespace in this
421 ;; word to avoid it getting lost.
422 (skip-chars-backward " \t")
423 ;; A space is needed between the encoded words.
424 (insert ? )
425 (setq start (point)
426 end (1+ end)))
427 ;; Adjust the end position for the deleted quotes.
428 (rfc2047-encode start (- end 2))
429 (setq last-encoded t)) ; record that it was encoded
430 (setq last-encoded nil)))
431 ((eq ?. csyntax)
432 ;; Skip other delimiters, but record that they've
433 ;; potentially separated quoted words.
434 (forward-char)
435 (setq last-encoded nil))
436 ((eq ?\) csyntax)
437 (error "Unbalanced parentheses"))
438 ((eq ?\( csyntax)
439 ;; Look for the end of parentheses.
440 (forward-list)
441 ;; Encode text as an unstructured field.
442 (let ((rfc2047-encoding-type 'mime))
443 (rfc2047-encode-region (1+ start) (1- (point))))
444 (skip-chars-forward ")"))
445 (t ; normal token/whitespace sequence
446 ;; Find the end.
447 ;; Skip one ASCII word, or encode continuous words
448 ;; in which all those contain non-ASCII characters.
449 (setq end nil)
450 (while (not (or end (eobp)))
451 (when (looking-at "[\000-\177]+")
452 (setq begin (point)
453 end (match-end 0))
454 (when (progn
455 (while (and (or (re-search-forward
456 "[ \t\n]\\|\\Sw" end 'move)
457 (setq end nil))
458 (eq ?\\ (char-syntax (char-before))))
459 ;; Skip backslash-quoted characters.
460 (forward-char))
461 end)
462 (setq end (match-beginning 0))
463 (if rfc2047-encode-encoded-words
464 (progn
465 (goto-char begin)
466 (when (search-forward "=?" end 'move)
467 (goto-char (match-beginning 0))
468 (setq end nil)))
469 (goto-char end))))
470 ;; Where the value nil of `end' means there may be
471 ;; text to have to be encoded following the point.
472 ;; Otherwise, the point reached to the end of ASCII
473 ;; words separated by whitespace or a special char.
474 (unless end
475 (when (looking-at encodable-regexp)
476 (goto-char (setq begin (match-end 0)))
477 (while (and (looking-at "[ \t\n]+\\([^ \t\n]+\\)")
478 (setq end (match-end 0))
479 (progn
480 (while (re-search-forward
481 encodable-regexp end t))
482 (< begin (point)))
483 (goto-char begin)
484 (or (not (re-search-forward "\\Sw" end t))
485 (progn
486 (goto-char (match-beginning 0))
487 nil)))
488 (goto-char end))
489 (when (looking-at "[^ \t\n]+")
490 (setq end (match-end 0))
491 (if (re-search-forward "\\Sw+" end t)
492 ;; There are special characters better
493 ;; to be encoded so that MTAs may parse
494 ;; them safely.
495 (cond ((= end (point)))
496 ((looking-at (concat "\\sw*\\("
497 encodable-regexp
498 "\\)"))
499 (setq end nil))
500 (t
501 (goto-char (1- (match-end 0)))
502 (unless (= (point) (match-beginning 0))
503 ;; Separate encodable text and
504 ;; delimiter.
505 (insert " "))))
506 (goto-char end)
507 (skip-chars-forward " \t\n")
508 (if (and (looking-at "[^ \t\n]+")
509 (string-match encodable-regexp
510 (match-string 0)))
511 (setq end nil)
512 (goto-char end)))))))
513 (skip-chars-backward " \t\n")
514 (setq end (point))
515 (goto-char start)
516 (if (re-search-forward encodable-regexp end 'move)
517 (progn
518 (unless (memq (char-before start) '(nil ?\t ? ))
519 (if (progn
520 (goto-char start)
521 (skip-chars-backward "^ \t\n")
522 (and (looking-at "\\Sw+")
523 (= (match-end 0) start)))
524 ;; Also encode bogus delimiters.
525 (setq start (point))
526 ;; Separate encodable text and delimiter.
527 (goto-char start)
528 (insert " ")
529 (setq start (1+ start)
530 end (1+ end))))
531 (rfc2047-encode start end)
532 (setq last-encoded t))
533 (setq last-encoded nil)))))
534 (error
535 (if (or debug-on-quit debug-on-error)
536 (signal (car err) (cdr err))
537 (error "Invalid data for rfc2047 encoding: %s"
538 (mm-replace-in-string orig-text "[ \t\n]+" " "))))))))
539 (rfc2047-fold-region b (point))
540 (goto-char (point-max))))
541
542 (defun rfc2047-encode-string (string)
543 "Encode words in STRING.
544 By default, the string is treated as containing addresses (see
545 `rfc2047-encoding-type')."
546 (mm-with-multibyte-buffer
547 (insert string)
548 (rfc2047-encode-region (point-min) (point-max))
549 (buffer-string)))
550
551 (defvar rfc2047-encode-max-chars 76
552 "Maximum characters of each header line that contain encoded-words.
553 If it is nil, encoded-words will not be folded. Too small value may
554 cause an error. Don't change this for no particular reason.")
555
556 (defun rfc2047-encode-1 (column string cs encoder start crest tail
557 &optional eword)
558 "Subroutine used by `rfc2047-encode'."
559 (cond ((string-equal string "")
560 (or eword ""))
561 ((not rfc2047-encode-max-chars)
562 (concat start
563 (funcall encoder (if cs
564 (mm-encode-coding-string string cs)
565 string))
566 "?="))
567 ((>= column rfc2047-encode-max-chars)
568 (when eword
569 (cond ((string-match "\n[ \t]+\\'" eword)
570 ;; Reomove a superfluous empty line.
571 (setq eword (substring eword 0 (match-beginning 0))))
572 ((string-match "(+\\'" eword)
573 ;; Break the line before the open parenthesis.
574 (setq crest (concat crest (match-string 0 eword))
575 eword (substring eword 0 (match-beginning 0))))))
576 (rfc2047-encode-1 (length crest) string cs encoder start " " tail
577 (concat eword "\n" crest)))
578 (t
579 (let ((index 0)
580 (limit (1- (length string)))
581 (prev "")
582 next len)
583 (while (and prev
584 (<= index limit))
585 (setq next (concat start
586 (funcall encoder
587 (if cs
588 (mm-encode-coding-string
589 (substring string 0 (1+ index))
590 cs)
591 (substring string 0 (1+ index))))
592 "?=")
593 len (+ column (length next)))
594 (if (> len rfc2047-encode-max-chars)
595 (setq next prev
596 prev nil)
597 (if (or (< index limit)
598 (<= (+ len (or (string-match "\n" tail)
599 (length tail)))
600 rfc2047-encode-max-chars))
601 (setq prev next
602 index (1+ index))
603 (if (string-match "\\`)+" tail)
604 ;; Break the line after the close parenthesis.
605 (setq tail (concat (substring tail 0 (match-end 0))
606 "\n "
607 (substring tail (match-end 0)))
608 prev next
609 index (1+ index))
610 (setq next prev
611 prev nil)))))
612 (if (> index limit)
613 (concat eword next tail)
614 (if (= 0 index)
615 (if (and eword
616 (string-match "(+\\'" eword))
617 (setq crest (concat crest (match-string 0 eword))
618 eword (substring eword 0 (match-beginning 0)))
619 (setq eword (concat eword next)))
620 (setq crest " "
621 eword (concat eword next)))
622 (when (string-match "\n[ \t]+\\'" eword)
623 ;; Reomove a superfluous empty line.
624 (setq eword (substring eword 0 (match-beginning 0))))
625 (rfc2047-encode-1 (length crest) (substring string index)
626 cs encoder start " " tail
627 (concat eword "\n" crest)))))))
628
629 (defun rfc2047-encode (b e)
630 "Encode the word(s) in the region B to E.
631 Point moves to the end of the region."
632 (let ((mime-charset (or (mm-find-mime-charset-region b e) (list 'us-ascii)))
633 cs encoding tail crest eword)
634 (cond ((> (length mime-charset) 1)
635 (error "Can't rfc2047-encode `%s'"
636 (buffer-substring-no-properties b e)))
637 ((= (length mime-charset) 1)
638 (setq mime-charset (car mime-charset)
639 cs (mm-charset-to-coding-system mime-charset))
640 (unless (and (mm-multibyte-p)
641 (mm-coding-system-p cs))
642 (setq cs nil))
643 (save-restriction
644 (narrow-to-region b e)
645 (setq encoding
646 (or (cdr (assq mime-charset
647 rfc2047-charset-encoding-alist))
648 ;; For the charsets that don't have a preferred
649 ;; encoding, choose the one that's shorter.
650 (if (eq (rfc2047-qp-or-base64) 'base64)
651 'B
652 'Q)))
653 (widen)
654 (goto-char e)
655 (skip-chars-forward "^ \t\n")
656 ;; `tail' may contain a close parenthesis.
657 (setq tail (buffer-substring-no-properties e (point)))
658 (goto-char b)
659 (setq b (point-marker)
660 e (set-marker (make-marker) e))
661 (rfc2047-fold-region (rfc2047-point-at-bol) b)
662 (goto-char b)
663 (skip-chars-backward "^ \t\n")
664 (unless (= 0 (skip-chars-backward " \t"))
665 ;; `crest' may contain whitespace and an open parenthesis.
666 (setq crest (buffer-substring-no-properties (point) b)))
667 (setq eword (rfc2047-encode-1
668 (- b (rfc2047-point-at-bol))
669 (mm-replace-in-string
670 (buffer-substring-no-properties b e)
671 "\n\\([ \t]?\\)" "\\1")
672 cs
673 (or (cdr (assq encoding
674 rfc2047-encode-function-alist))
675 'identity)
676 (concat "=?" (downcase (symbol-name mime-charset))
677 "?" (upcase (symbol-name encoding)) "?")
678 (or crest " ")
679 tail))
680 (delete-region (if (eq (aref eword 0) ?\n)
681 (if (bolp)
682 ;; The line was folded before encoding.
683 (1- (point))
684 (point))
685 (goto-char b))
686 (+ e (length tail)))
687 ;; `eword' contains `crest' and `tail'.
688 (insert eword)
689 (set-marker b nil)
690 (set-marker e nil)
691 (unless (or (/= 0 (length tail))
692 (eobp)
693 (looking-at "[ \t\n)]"))
694 (insert " "))))
695 (t
696 (goto-char e)))))
697
698 (defun rfc2047-fold-field ()
699 "Fold the current header field."
700 (save-excursion
701 (save-restriction
702 (rfc2047-narrow-to-field)
703 (rfc2047-fold-region (point-min) (point-max)))))
704
705 (defun rfc2047-fold-region (b e)
706 "Fold long lines in region B to E."
707 (save-restriction
708 (narrow-to-region b e)
709 (goto-char (point-min))
710 (let ((break nil)
711 (qword-break nil)
712 (first t)
713 (bol (save-restriction
714 (widen)
715 (rfc2047-point-at-bol))))
716 (while (not (eobp))
717 (when (and (or break qword-break)
718 (> (- (point) bol) 76))
719 (goto-char (or break qword-break))
720 (setq break nil
721 qword-break nil)
722 (skip-chars-backward " \t")
723 (if (looking-at "[ \t]")
724 (insert ?\n)
725 (insert "\n "))
726 (setq bol (1- (point)))
727 ;; Don't break before the first non-LWSP characters.
728 (skip-chars-forward " \t")
729 (unless (eobp)
730 (forward-char 1)))
731 (cond
732 ((eq (char-after) ?\n)
733 (forward-char 1)
734 (setq bol (point)
735 break nil
736 qword-break nil)
737 (skip-chars-forward " \t")
738 (unless (or (eobp) (eq (char-after) ?\n))
739 (forward-char 1)))
740 ((eq (char-after) ?\r)
741 (forward-char 1))
742 ((memq (char-after) '(? ?\t))
743 (skip-chars-forward " \t")
744 (unless first ;; Don't break just after the header name.
745 (setq break (point))))
746 ((not break)
747 (if (not (looking-at "=\\?[^=]"))
748 (if (eq (char-after) ?=)
749 (forward-char 1)
750 (skip-chars-forward "^ \t\n\r="))
751 ;; Don't break at the start of the field.
752 (unless (= (point) b)
753 (setq qword-break (point)))
754 (skip-chars-forward "^ \t\n\r")))
755 (t
756 (skip-chars-forward "^ \t\n\r")))
757 (setq first nil))
758 (when (and (or break qword-break)
759 (> (- (point) bol) 76))
760 (goto-char (or break qword-break))
761 (setq break nil
762 qword-break nil)
763 (if (or (> 0 (skip-chars-backward " \t"))
764 (looking-at "[ \t]"))
765 (insert ?\n)
766 (insert "\n "))
767 (setq bol (1- (point)))
768 ;; Don't break before the first non-LWSP characters.
769 (skip-chars-forward " \t")
770 (unless (eobp)
771 (forward-char 1))))))
772
773 (defun rfc2047-unfold-field ()
774 "Fold the current line."
775 (save-excursion
776 (save-restriction
777 (rfc2047-narrow-to-field)
778 (rfc2047-unfold-region (point-min) (point-max)))))
779
780 (defun rfc2047-unfold-region (b e)
781 "Unfold lines in region B to E."
782 (save-restriction
783 (narrow-to-region b e)
784 (goto-char (point-min))
785 (let ((bol (save-restriction
786 (widen)
787 (rfc2047-point-at-bol)))
788 (eol (rfc2047-point-at-eol)))
789 (forward-line 1)
790 (while (not (eobp))
791 (if (and (looking-at "[ \t]")
792 (< (- (rfc2047-point-at-eol) bol) 76))
793 (delete-region eol (progn
794 (goto-char eol)
795 (skip-chars-forward "\r\n")
796 (point)))
797 (setq bol (rfc2047-point-at-bol)))
798 (setq eol (rfc2047-point-at-eol))
799 (forward-line 1)))))
800
801 (defun rfc2047-b-encode-string (string)
802 "Base64-encode the header contained in STRING."
803 (base64-encode-string string t))
804
805 (defun rfc2047-q-encode-string (string)
806 "Quoted-printable-encode the header in STRING."
807 (mm-with-unibyte-buffer
808 (insert string)
809 (quoted-printable-encode-region
810 (point-min) (point-max) nil
811 ;; = (\075), _ (\137), ? (\077) are used in the encoded word.
812 ;; Avoid using 8bit characters.
813 ;; This list excludes `especials' (see the RFC2047 syntax),
814 ;; meaning that some characters in non-structured fields will
815 ;; get encoded when they con't need to be. The following is
816 ;; what it used to be.
817 ;;; ;; Equivalent to "^\000-\007\011\013\015-\037\200-\377=_?"
818 ;;; "\010\012\014\040-\074\076\100-\136\140-\177")
819 "-\b\n\f !#-'*+0-9A-Z\\^`-~\d")
820 (subst-char-in-region (point-min) (point-max) ? ?_)
821 (buffer-string)))
822
823 (defun rfc2047-encode-parameter (param value)
824 "Return and PARAM=VALUE string encoded in the RFC2047-like style.
825 This is a replacement for the `rfc2231-encode-string' function.
826
827 When attaching files as MIME parts, we should use the RFC2231 encoding
828 to specify the file names containing non-ASCII characters. However,
829 many mail softwares don't support it in practice and recipients won't
830 be able to extract files with correct names. Instead, the RFC2047-like
831 encoding is acceptable generally. This function provides the very
832 RFC2047-like encoding, resigning to such a regrettable trend. To use
833 it, put the following line in your ~/.gnus.el file:
834
835 \(defalias 'mail-header-encode-parameter 'rfc2047-encode-parameter)
836 "
837 (let* ((rfc2047-encoding-type 'mime)
838 (rfc2047-encode-max-chars nil)
839 (string (rfc2047-encode-string value)))
840 (if (string-match (concat "[" ietf-drums-tspecials "]") string)
841 (format "%s=%S" param string)
842 (concat param "=" string))))
843
844 ;;;
845 ;;; Functions for decoding RFC2047 messages
846 ;;;
847
848 (eval-and-compile
849 (defconst rfc2047-encoded-word-regexp
850 "=\\?\\([^][\000-\040()<>@,\;:*\\\"/?.=]+\\)\\(\\*[^?]+\\)?\
851 \\?\\(B\\|Q\\)\\?\\([!->@-~ ]*\\)\\?="))
852
853 (defvar rfc2047-quote-decoded-words-containing-tspecials nil
854 "If non-nil, quote decoded words containing special characters.")
855
856 (defvar rfc2047-allow-incomplete-encoded-text t
857 "*Non-nil means allow incomplete encoded-text in successive encoded-words.
858 Dividing of encoded-text in the place other than character boundaries
859 violates RFC2047 section 5, while we have a capability to decode it.
860 If it is non-nil, the decoder will decode B- or Q-encoding in each
861 encoded-word, concatenate them, and decode it by charset. Otherwise,
862 the decoder will fully decode each encoded-word before concatenating
863 them.")
864
865 (defun rfc2047-strip-backslashes-in-quoted-strings ()
866 "Strip backslashes in quoted strings. `\\\"' remains."
867 (goto-char (point-min))
868 (let (beg)
869 (with-syntax-table (standard-syntax-table)
870 (while (search-forward "\"" nil t)
871 (unless (eq (char-before) ?\\)
872 (setq beg (match-end 0))
873 (goto-char (match-beginning 0))
874 (condition-case nil
875 (progn
876 (forward-sexp)
877 (save-restriction
878 (narrow-to-region beg (1- (point)))
879 (goto-char beg)
880 (while (search-forward "\\" nil 'move)
881 (unless (memq (char-after) '(?\"))
882 (delete-backward-char 1))
883 (forward-char)))
884 (forward-char))
885 (error
886 (goto-char beg))))))))
887
888 (defun rfc2047-charset-to-coding-system (charset)
889 "Return coding-system corresponding to MIME CHARSET.
890 If your Emacs implementation can't decode CHARSET, return nil."
891 (when (stringp charset)
892 (setq charset (intern (downcase charset))))
893 (when (or (not charset)
894 (eq 'gnus-all mail-parse-ignored-charsets)
895 (memq 'gnus-all mail-parse-ignored-charsets)
896 (memq charset mail-parse-ignored-charsets))
897 (setq charset mail-parse-charset))
898 (let ((cs (mm-charset-to-coding-system charset)))
899 (cond ((eq cs 'ascii)
900 (setq cs (or (mm-charset-to-coding-system mail-parse-charset)
901 'raw-text)))
902 ((mm-coding-system-p cs))
903 ((and charset
904 (listp mail-parse-ignored-charsets)
905 (memq 'gnus-unknown mail-parse-ignored-charsets))
906 (setq cs (mm-charset-to-coding-system mail-parse-charset))))
907 (if (eq cs 'ascii)
908 'raw-text
909 cs)))
910
911 (defun rfc2047-decode-encoded-words (words)
912 "Decode successive encoded-words in WORDS and return a decoded string.
913 Each element of WORDS looks like (CHARSET ENCODING ENCODED-TEXT
914 ENCODED-WORD)."
915 (let (word charset cs encoding text rest)
916 (while words
917 (setq word (pop words))
918 (if (and (setq cs (rfc2047-charset-to-coding-system
919 (setq charset (car word))))
920 (condition-case code
921 (cond ((char-equal ?B (nth 1 word))
922 (setq text (base64-decode-string
923 (rfc2047-pad-base64 (nth 2 word)))))
924 ((char-equal ?Q (nth 1 word))
925 (setq text (quoted-printable-decode-string
926 (mm-subst-char-in-string
927 ?_ ? (nth 2 word) t)))))
928 (error
929 (message "%s" (error-message-string code))
930 nil)))
931 (if (and rfc2047-allow-incomplete-encoded-text
932 (eq cs (caar rest)))
933 ;; Concatenate text of which the charset is the same.
934 (setcdr (car rest) (concat (cdar rest) text))
935 (push (cons cs text) rest))
936 ;; Don't decode encoded-word.
937 (push (cons nil (nth 3 word)) rest)))
938 (while rest
939 (setq words (concat
940 (or (and (setq cs (caar rest))
941 (condition-case code
942 (mm-decode-coding-string (cdar rest) cs)
943 (error
944 (message "%s" (error-message-string code))
945 nil)))
946 (concat (when (cdr rest) " ")
947 (cdar rest)
948 (when (and words
949 (not (eq (string-to-char words) ? )))
950 " ")))
951 words)
952 rest (cdr rest)))
953 words))
954
955 ;; Fixme: This should decode in place, not cons intermediate strings.
956 ;; Also check whether it needs to worry about delimiting fields like
957 ;; encoding.
958
959 ;; In fact it's reported that (invalid) encoding of mailboxes in
960 ;; addr-specs is in use, so delimiting fields might help. Probably
961 ;; not decoding a word which isn't properly delimited is good enough
962 ;; and worthwhile (is it more correct or not?), e.g. something like
963 ;; `=?iso-8859-1?q?foo?=@'.
964
965 (defun rfc2047-decode-region (start end &optional address-mime)
966 "Decode MIME-encoded words in region between START and END.
967 If ADDRESS-MIME is non-nil, strip backslashes which precede characters
968 other than `\"' and `\\' in quoted strings."
969 (interactive "r")
970 (let ((case-fold-search t)
971 (eword-regexp (eval-when-compile
972 ;; Ignore whitespace between encoded-words.
973 (concat "[\n\t ]*\\(" rfc2047-encoded-word-regexp
974 "\\)")))
975 b e match words)
976 (save-excursion
977 (save-restriction
978 (narrow-to-region start end)
979 (when address-mime
980 (rfc2047-strip-backslashes-in-quoted-strings))
981 (goto-char (setq b start))
982 ;; Look for the encoded-words.
983 (while (setq match (re-search-forward eword-regexp nil t))
984 (setq e (match-beginning 1)
985 end (match-end 0)
986 words nil)
987 (while match
988 (push (list (match-string 2) ;; charset
989 (char-after (match-beginning 4)) ;; encoding
990 (match-string 5) ;; encoded-text
991 (match-string 1)) ;; encoded-word
992 words)
993 ;; Look for the subsequent encoded-words.
994 (when (setq match (looking-at eword-regexp))
995 (goto-char (setq end (match-end 0)))))
996 ;; Replace the encoded-words with the decoded one.
997 (delete-region e end)
998 (insert (rfc2047-decode-encoded-words (nreverse words)))
999 (save-restriction
1000 (narrow-to-region e (point))
1001 (goto-char e)
1002 ;; Remove newlines between decoded words, though such
1003 ;; things essentially must not be there.
1004 (while (re-search-forward "[\n\r]+" nil t)
1005 (replace-match " "))
1006 ;; Quote decoded words if there are special characters
1007 ;; which might violate RFC2822.
1008 (when (and rfc2047-quote-decoded-words-containing-tspecials
1009 (let ((regexp (car (rassq
1010 'address-mime
1011 rfc2047-header-encoding-alist))))
1012 (when regexp
1013 (save-restriction
1014 (widen)
1015 (beginning-of-line)
1016 (while (and (memq (char-after) '(? ?\t))
1017 (zerop (forward-line -1))))
1018 (looking-at regexp)))))
1019 (let (quoted)
1020 (goto-char e)
1021 (skip-chars-forward " \t")
1022 (setq start (point))
1023 (setq quoted (eq (char-after) ?\"))
1024 (goto-char (point-max))
1025 (skip-chars-backward " \t")
1026 (if (setq quoted (and quoted
1027 (> (point) (1+ start))
1028 (eq (char-before) ?\")))
1029 (progn
1030 (backward-char)
1031 (setq start (1+ start)
1032 end (point-marker)))
1033 (setq end (point-marker)))
1034 (goto-char start)
1035 (while (search-forward "\"" end t)
1036 (when (prog2
1037 (backward-char)
1038 (zerop (% (skip-chars-backward "\\\\") 2))
1039 (goto-char (match-beginning 0)))
1040 (insert "\\"))
1041 (forward-char))
1042 (when (and (not quoted)
1043 (progn
1044 (goto-char start)
1045 (re-search-forward
1046 (concat "[" ietf-drums-tspecials "]")
1047 end t)))
1048 (goto-char start)
1049 (insert "\"")
1050 (goto-char end)
1051 (insert "\""))
1052 (set-marker end nil)))
1053 (goto-char (point-max)))
1054 (when (and (mm-multibyte-p)
1055 mail-parse-charset
1056 (not (eq mail-parse-charset 'us-ascii))
1057 (not (eq mail-parse-charset 'gnus-decoded)))
1058 (mm-decode-coding-region b e mail-parse-charset))
1059 (setq b (point)))
1060 (when (and (mm-multibyte-p)
1061 mail-parse-charset
1062 (not (eq mail-parse-charset 'us-ascii))
1063 (not (eq mail-parse-charset 'gnus-decoded)))
1064 (mm-decode-coding-region b (point-max) mail-parse-charset))))))
1065
1066 (defun rfc2047-decode-address-region (start end)
1067 "Decode MIME-encoded words in region between START and END.
1068 Backslashes which precede characters other than `\"' and `\\' in quoted
1069 strings are stripped."
1070 (rfc2047-decode-region start end t))
1071
1072 (defun rfc2047-decode-string (string &optional address-mime)
1073 "Decode MIME-encoded STRING and return the result.
1074 If ADDRESS-MIME is non-nil, strip backslashes which precede characters
1075 other than `\"' and `\\' in quoted strings."
1076 (let ((m (mm-multibyte-p)))
1077 (if (string-match "=\\?" string)
1078 (with-temp-buffer
1079 ;; Fixme: This logic is wrong, but seems to be required by
1080 ;; Gnus summary buffer generation. The value of `m' depends
1081 ;; on the current buffer, not global multibyteness or that
1082 ;; of the string. Also the string returned should always be
1083 ;; multibyte in a multibyte session, i.e. the buffer should
1084 ;; be multibyte before `buffer-string' is called.
1085 (when m
1086 (mm-enable-multibyte))
1087 (insert string)
1088 (inline
1089 (rfc2047-decode-region (point-min) (point-max) address-mime))
1090 (buffer-string))
1091 (when address-mime
1092 (setq string
1093 (with-temp-buffer
1094 (when (mm-multibyte-string-p string)
1095 (mm-enable-multibyte))
1096 (insert string)
1097 (rfc2047-strip-backslashes-in-quoted-strings)
1098 (buffer-string))))
1099 ;; Fixme: As above, `m' here is inappropriate.
1100 (if (and m
1101 mail-parse-charset
1102 (not (eq mail-parse-charset 'us-ascii))
1103 (not (eq mail-parse-charset 'gnus-decoded)))
1104 ;; `decode-coding-string' in Emacs offers a third optional
1105 ;; arg NOCOPY to avoid consing a new string if the decoding
1106 ;; is "trivial". Unfortunately it currently doesn't
1107 ;; consider anything else than a `nil' coding system
1108 ;; trivial.
1109 ;; `rfc2047-decode-string' is called multiple times for each
1110 ;; article during summary buffer generation, and we really
1111 ;; want to avoid unnecessary consing. So we bypass
1112 ;; `decode-coding-string' if the string is purely ASCII.
1113 (if (and (fboundp 'detect-coding-string)
1114 ;; string is purely ASCII
1115 (eq (detect-coding-string string t) 'undecided))
1116 string
1117 (mm-decode-coding-string string mail-parse-charset))
1118 (mm-string-as-multibyte string)))))
1119
1120 (defun rfc2047-decode-address-string (string)
1121 "Decode MIME-encoded STRING and return the result.
1122 Backslashes which precede characters other than `\"' and `\\' in quoted
1123 strings are stripped."
1124 (rfc2047-decode-string string t))
1125
1126 (defun rfc2047-pad-base64 (string)
1127 "Pad STRING to quartets."
1128 ;; Be more liberal to accept buggy base64 strings. If
1129 ;; base64-decode-string accepts buggy strings, this function could
1130 ;; be aliased to identity.
1131 (if (= 0 (mod (length string) 4))
1132 string
1133 (when (string-match "=+$" string)
1134 (setq string (substring string 0 (match-beginning 0))))
1135 (case (mod (length string) 4)
1136 (0 string)
1137 (1 string) ;; Error, don't pad it.
1138 (2 (concat string "=="))
1139 (3 (concat string "=")))))
1140
1141 (provide 'rfc2047)
1142
1143 ;;; arch-tag: a07fe3d4-22b5-4c4a-bd89-b1f82d5d36f6
1144 ;;; rfc2047.el ends here