/[MIME-EncWords]/tags/MIME-EncWords-1.005/EncWords.pm
ViewVC logotype

Contents of /tags/MIME-EncWords-1.005/EncWords.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 35 - (show annotations) (download)
Sun Mar 16 14:28:49 2008 UTC (2 years, 5 months ago) by hatukanezumi
File size: 28101 byte(s)
Release 1.005
1
2 package MIME::EncWords;
3 use 5.005;
4
5 =head1 NAME
6
7 MIME::EncWords - deal with RFC 2047 encoded words (improved)
8
9 =head1 SYNOPSIS
10
11 I<L<MIME::EncWords> is aimed to be another implimentation
12 of L<MIME::Words> so that it will achive more exact conformance with
13 RFC 2047 (former RFC 1522) specifications. Additionally, it contains
14 some improvements.
15 Following synopsis and descriptions are inherited from its inspirer,
16 then added descriptions on improvements (B<**>) or changes and
17 clarifications (B<*>).>
18
19 Before reading further, you should see L<MIME::Tools> to make sure that
20 you understand where this module fits into the grand scheme of things.
21 Go on, do it now. I'll wait.
22
23 Ready? Ok...
24
25 use MIME::EncWords qw(:all);
26
27 ### Decode the string into another string, forgetting the charsets:
28 $decoded = decode_mimewords(
29 'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>',
30 );
31
32 ### Split string into array of decoded [DATA,CHARSET] pairs:
33 @decoded = decode_mimewords(
34 'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>',
35 );
36
37 ### Encode a single unsafe word:
38 $encoded = encode_mimeword("\xABFran\xE7ois\xBB");
39
40 ### Encode a string, trying to find the unsafe words inside it:
41 $encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB in town");
42
43 =head1 DESCRIPTION
44
45 Fellow Americans, you probably won't know what the hell this module
46 is for. Europeans, Russians, et al, you probably do. C<:-)>.
47
48 For example, here's a valid MIME header you might get:
49
50 From: =?US-ASCII?Q?Keith_Moore?= <moore@cs.utk.edu>
51 To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>
52 CC: =?ISO-8859-1?Q?Andr=E9_?= Pirard <PIRARD@vm1.ulg.ac.be>
53 Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
54 =?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=
55 =?US-ASCII?Q?.._cool!?=
56
57 The fields basically decode to (sorry, I can only approximate the
58 Latin characters with 7 bit sequences /o and 'e):
59
60 From: Keith Moore <moore@cs.utk.edu>
61 To: Keld J/orn Simonsen <keld@dkuug.dk>
62 CC: Andr'e Pirard <PIRARD@vm1.ulg.ac.be>
63 Subject: If you can read this you understand the example... cool!
64
65 B<Supplement>: Fellow Americans, Europeans, you probably won't know
66 what the hell this module is for. East Asians, et al, you probably do.
67 C<:-)>.
68
69 For example, here's a valid MIME header you might get:
70
71 Subject: =?EUC-KR?B?sNTAuLinKGxhemluZXNzKSwgwvzB9ri7seIoaW1w?=
72 =?EUC-KR?B?YXRpZW5jZSksILGzuLgoaHVicmlzKQ==?=
73
74 The fields basically decode to (sorry, I cannot approximate the
75 non-Latin multibyte characters with any 7 bit sequences):
76
77 Subject: ???(laziness), ????(impatience), ??(hubris)
78
79 =head1 PUBLIC INTERFACE
80
81 =over 4
82
83 =cut
84
85 ### Pragmas:
86 use strict;
87 use vars qw($VERSION @EXPORT_OK %EXPORT_TAGS @ISA);
88
89 ### Exporting:
90 use Exporter;
91
92 %EXPORT_TAGS = (all => [qw(decode_mimewords
93 encode_mimeword
94 encode_mimewords)]);
95 Exporter::export_ok_tags(qw(all));
96
97 ### Inheritance:
98 @ISA = qw(Exporter);
99
100 ### Other modules:
101 use Carp qw(croak);
102 use MIME::Base64;
103 use MIME::Charset qw(:trans);
104
105 my @ENCODE_SUBS = qw(FB_CROAK decode encode is_utf8 resolve_alias);
106 if (MIME::Charset::USE_ENCODE) {
107 eval "use ".MIME::Charset::USE_ENCODE." \@ENCODE_SUBS;";
108 } else {
109 require MIME::Charset::_Compat;
110 for my $sub (@ENCODE_SUBS) {
111 no strict "refs";
112 *{$sub} = \&{"MIME::Charset::_Compat::$sub"};
113 }
114 }
115
116 #------------------------------
117 #
118 # Globals...
119 #
120 #------------------------------
121
122 ### The package version, both in 1.23 style *and* usable by MakeMaker:
123 $VERSION = '1.005';
124
125 ### Public Configuration Attributes
126 our $Config = {
127 %{$MIME::Charset::Config}, # Detect7bit, Replacement, Mapping
128 Charset => 'ISO-8859-1',
129 Encoding => 'A',
130 Field => undef,
131 MaxLineLen => 76,
132 Minimal => 'YES',
133 };
134 eval { require MIME::EncWords::Defaults; };
135
136 ### Private Constants
137
138 ### Nonprintables (controls + x7F + 8bit):
139 #my $NONPRINT = "\\x00-\\x1F\\x7F-\\xFF";
140 my $PRINTABLE = "\\x21-\\x7E";
141 my $NONPRINT = qr{[^$PRINTABLE]}; # Improvement: Unicode support.
142 my $UNSAFE = qr{[^\x01-\x20$PRINTABLE]};
143 my $WIDECHAR = qr{[^\x00-\xFF]};
144
145 #------------------------------
146
147 # _decode_B STRING
148 # Private: used by _decode_header() to decode "B" encoding.
149 # Improvement by this module: sanity check on encoded sequence.
150 sub _decode_B {
151 my $str = shift;
152 unless ((length($str) % 4 == 0) and
153 $str =~ m|^[A-Za-z0-9+/]+={0,2}$|) {
154 return undef;
155 }
156 return decode_base64($str);
157 }
158
159 # _decode_Q STRING
160 # Private: used by _decode_header() to decode "Q" encoding, which is
161 # almost, but not exactly, quoted-printable. :-P
162 sub _decode_Q {
163 my $str = shift;
164 $str =~ s/_/\x20/g; # RFC-1522, Q rule 2
165 $str =~ s/=([\da-fA-F]{2})/pack("C", hex($1))/ge; # RFC-1522, Q rule 1
166 $str;
167 }
168
169 # _encode_B STRING
170 # Private: used by encode_mimeword() to encode "B" encoding.
171 sub _encode_B {
172 my $str = shift;
173 encode_base64($str, '');
174 }
175
176 # _encode_Q STRING
177 # Private: used by encode_mimeword() to encode "Q" encoding, which is
178 # almost, but not exactly, quoted-printable. :-P
179 # Improvement by this module: Spaces are escaped by ``_''.
180 sub _encode_Q {
181 my $str = shift;
182 # $str =~ s{([_\?\=$NONPRINT])}{sprintf("=%02X", ord($1))}eog;
183 $str =~ s{(\x20)|([_?=]|$NONPRINT)}{
184 defined $1? "_": sprintf("=%02X", ord($2))
185 }eog;
186 $str;
187 }
188
189 #------------------------------
190
191 =item decode_mimewords ENCODED, [OPTS...]
192
193 I<Function.>
194 Go through the string looking for RFC-1522-style "Q"
195 (quoted-printable, sort of) or "B" (base64) encoding, and decode them.
196
197 B<In an array context,> splits the ENCODED string into a list of decoded
198 C<[DATA, CHARSET]> pairs, and returns that list. Unencoded
199 data are returned in a 1-element array C<[DATA]>, giving an effective
200 CHARSET of C<undef>.
201
202 $enc = '=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>';
203 foreach (decode_mimewords($enc)) {
204 print "", ($_[1] || 'US-ASCII'), ": ", $_[0], "\n";
205 }
206
207 B<**>
208 However, adjacent encoded-words with same charset will be concatenated
209 to handle multibyte sequences safely.
210
211 B<*>
212 Whitespaces surrounding unencoded data will not be stripped so that
213 compatibility with L<MIME::Words> will be ensured.
214
215 B<In a scalar context,> joins the "data" elements of the above
216 list together, and returns that. I<Warning: this is information-lossy,>
217 and probably I<not> what you want, but if you know that all charsets
218 in the ENCODED string are identical, it might be useful to you.
219 (Before you use this, please see L<MIME::WordDecoder/unmime>,
220 which is probably what you want.)
221 B<**>
222 See also "Charset" option below.
223
224 In the event of a syntax error, $@ will be set to a description
225 of the error, but parsing will continue as best as possible (so as to
226 get I<something> back when decoding headers).
227 $@ will be false if no error was detected.
228
229 B<*>
230 Malformed base64 encoded-words will be kept encoded.
231 In this case $@ will be set.
232
233 Any arguments past the ENCODED string are taken to define a hash of options.
234 B<**>
235 When Unicode/multibyte support is disabled
236 (see L<MIME::Charset/USE_ENCODE>),
237 these options will not have any effects.
238
239 =over 4
240
241 =item Charset
242 B<**>
243
244 Name of character set by which data elements in scalar context
245 will be converted.
246 If this option is specified as special value C<"_UNICODE_">,
247 returned value will be Unicode string.
248
249 B<Note>:
250 This feature is still information-lossy, I<except> when C<"_UNICODE_"> is
251 specified.
252
253 =item Detect7bit
254 B<**>
255
256 Try to detect 7-bit charset on unencoded portions.
257 Default is C<"YES">.
258
259 =cut
260
261 #=item Field
262 #
263 #Name of the mail field this string came from. I<Currently ignored.>
264
265 =item Mapping
266 B<**>
267
268 In scalar context, specify mappings actually used for charset names.
269 C<"EXTENDED"> uses extended mappings.
270 C<"STANDARD"> uses standardized strict mappings.
271 Default is C<"EXTENDED">.
272
273 =back
274
275 =cut
276
277 sub decode_mimewords {
278 my $encstr = shift;
279 my %params = @_;
280 my $cset = $params{"Charset"} || $Config->{Charset};
281 my $detect7bit = uc($params{'Detect7bit'} || $Config->{Detect7bit});
282 my $mapping = uc($params{'Mapping'} || $Config->{Mapping});
283 $cset = MIME::Charset->new($cset, Mapping => $mapping);
284 my @tokens;
285 $@ = ''; ### error-return
286
287 ### Decode:
288 my ($word, $charset, $language, $encoding, $enc, $dec);
289 my $spc = '';
290 pos($encstr) = 0;
291 while (1) {
292 last if (pos($encstr) >= length($encstr));
293 my $pos = pos($encstr); ### save it
294
295 ### Case 1: are we looking at "=?..?..?="?
296 if ($encstr =~ m{\G # from where we left off..
297 =\?([^?]*) # "=?" + charset +
298 \?([bq]) # "?" + encoding +
299 \?([^?]+) # "?" + data maybe with spcs +
300 \?= # "?="
301 ([\r\n\t ]*)
302 }xgi) {
303 ($word, $charset, $encoding, $enc) = ($&, $1, lc($2), $3);
304 my $tspc = $4;
305 # unfold
306 $tspc =~ s/(?:\r?\n|\r)([\t ])/$1/g;
307 $tspc =~ s/\r?\n|\r/ /g;
308
309 # RFC 2231 section 5 extension
310 if ($charset =~ s/^([^\*]*)\*(.*)/$1/) {
311 $language = $2 || undef;
312 $charset ||= undef;
313 } else {
314 $language = undef;
315 }
316
317 if ($encoding eq 'q') {
318 $dec = _decode_Q($enc);
319 } else {
320 $dec = _decode_B($enc);
321 }
322 unless (defined $dec) {
323 $@ .= qq|Illegal sequence in "$word" (pos $pos)\n|;
324 push @tokens, [$spc.$word];
325 $spc = '';
326 next;
327 }
328
329 if (scalar(@tokens) and
330 lc($charset) eq lc($tokens[-1]->[1]) and
331 resolve_alias($charset) and
332 (!${tokens[-1]}[2] and !$language or
333 lc(${tokens[-1]}[2]) eq lc($language))) { # Concat words if possible.
334 $tokens[-1]->[0] .= $dec;
335 } elsif ($language) {
336 push @tokens, [$dec, $charset, $language];
337 } elsif ($charset) {
338 push @tokens, [$dec, $charset];
339 } else {
340 push @tokens, [$dec];
341 }
342 $spc = $tspc;
343 next;
344 }
345
346 ### Case 2: are we looking at a bad "=?..." prefix?
347 ### We need this to detect problems for case 3, which stops at "=?":
348 pos($encstr) = $pos; # reset the pointer.
349 if ($encstr =~ m{\G=\?}xg) {
350 $@ .= qq|unterminated "=?..?..?=" in "$encstr" (pos $pos)\n|;
351 push @tokens, [$spc.'=?'];
352 $spc = '';
353 next;
354 }
355
356 ### Case 3: are we looking at ordinary text?
357 pos($encstr) = $pos; # reset the pointer.
358 if ($encstr =~ m{\G # from where we left off...
359 (.*? # shortest possible string,
360 \n*) # followed by 0 or more NLs,
361 (?=(\Z|=\?)) # terminated by "=?" or EOS
362 }xgs) {
363 length($1) or croak "MIME::EncWords: internal logic err: empty token\n";
364 push @tokens, [$spc.$1];
365 $spc = '';
366 next;
367 }
368
369 ### Case 4: bug!
370 croak "MIME::EncWords: unexpected case:\n($encstr) pos $pos\n\t".
371 "Please alert developer.\n";
372 }
373 push @tokens, [$spc] if $spc;
374
375 # Detect 7-bit charset
376 if ($detect7bit ne "NO") {
377 foreach my $t (@tokens) {
378 unless ($t->[1]) {
379 my $charset = &MIME::Charset::_detect_7bit_charset($t->[0]);
380 if ($charset and $charset ne &MIME::Charset::default()) {
381 $t->[1] = $charset;
382 }
383 }
384 }
385 }
386
387 return (wantarray ? @tokens : join('',map {
388 &_convert($_->[0], $_->[1], $cset, $mapping)
389 } @tokens));
390 }
391
392 #------------------------------
393
394 # _convert RAW, FROMCHARSET, TOCHARSET, MAPPING
395 # Private: used by decode_mimewords() to convert string by other charset
396 # or to decode to Unicode.
397 # When source charset is unknown and Unicode string is requested, at first
398 # try well-formed UTF-8 then fallback to ISO-8859-1 so that almost all
399 # non-ASCII bytes will be preserved.
400 sub _convert($$$$) {
401 my $s = shift;
402 my $charset = shift;
403 my $cset = shift;
404 my $mapping = shift;
405 return $s unless MIME::Charset::USE_ENCODE;
406 return $s unless $cset->as_string;
407
408 my $preserveerr = $@;
409
410 $charset = MIME::Charset->new($charset, Mapping => $mapping);
411 if ($charset->as_string and $charset->as_string eq $cset->as_string) {
412 $@ = $preserveerr;
413 return $s;
414 }
415 # build charset object to transform string from $charset to $cset.
416 $charset->{OutputCharset} = $cset->as_string;
417 $charset->{Encoder} = $cset->decoder;
418
419 my $converted = $s;
420 if (is_utf8($s) or $s =~ $WIDECHAR) {
421 if ($charset->output_charset ne "_UNICODE_") {
422 $converted = $charset->encode($s);
423 }
424 } elsif ($charset->output_charset eq "_UNICODE_") {
425 if (!$charset->decoder) {
426 if ($s =~ $UNSAFE) {
427 $@ = '';
428 eval {
429 $converted = decode("UTF-8", $converted, FB_CROAK());
430 };
431 if ($@) {
432 $converted = $s;
433 $converted = decode("ISO-8859-1", $converted);
434 }
435 }
436 } else {
437 $converted = $charset->decode($s);
438 }
439 } elsif ($charset->decoder) {
440 $converted = $charset->encode($s);
441 }
442
443 $@ = $preserveerr;
444 return $converted;
445 }
446
447 #------------------------------
448
449 =item encode_mimeword RAW, [ENCODING], [CHARSET]
450
451 I<Function.>
452 Encode a single RAW "word" that has unsafe characters.
453 The "word" will be encoded in its entirety.
454
455 ### Encode "<<Franc,ois>>":
456 $encoded = encode_mimeword("\xABFran\xE7ois\xBB");
457
458 You may specify the ENCODING (C<"Q"> or C<"B">), which defaults to C<"Q">.
459 B<**>
460 You may also specify it as ``special'' value: C<"S"> to choose shorter
461 one of either C<"Q"> or C<"B">.
462
463 You may specify the CHARSET, which defaults to C<iso-8859-1>.
464
465 B<*>
466 Spaces will be escaped with ``_'' by C<"Q"> encoding.
467
468 =cut
469
470 sub encode_mimeword {
471 my $word = shift;
472 my $encoding = uc(shift || 'Q'); # not overridden.
473 my $charset = uc(shift || 'ISO-8859-1'); # ditto.
474
475 my $encstr;
476 if ($encoding eq 'Q') {
477 $encstr = &_encode_Q($word);
478 } elsif ($encoding eq "S") {
479 my ($B, $Q) = (&_encode_B($word), &_encode_Q($word));
480 if (length($B) < length($Q)) {
481 $encoding = "B";
482 $encstr = $B;
483 } else {
484 $encoding = "Q";
485 $encstr = $Q;
486 }
487 } else { # "B"
488 $encoding = "B";
489 $encstr = &_encode_B($word);
490 }
491
492 "=?$charset?$encoding?$encstr?=";
493 }
494
495 #------------------------------
496
497 =item encode_mimewords RAW, [OPTS]
498
499 I<Function.>
500 Given a RAW string, try to find and encode all "unsafe" sequences
501 of characters:
502
503 ### Encode a string with some unsafe "words":
504 $encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB");
505
506 Returns the encoded string.
507
508 B<**>
509 RAW may be a Unicode string when Unicode/multibyte support is enabled
510 (see L<MIME::Charset/USE_ENCODE>).
511 Furthermore, RAW may be a reference to that returned
512 by L<"decode_mimewords"> on array context. In latter case "Charset"
513 option (see below) will be overridden (see also a note below).
514
515 B<Note>:
516 B<*>
517 When RAW is an arrayref,
518 adjacent encoded-words (i.e. elements having non-ASCII charset element)
519 are concatenated. Then they are splitted taking
520 care of character boundaries of multibyte sequences when Unicode/multibyte
521 support is enabled.
522 Portions for unencoded data should include surrounding whitespace(s), or
523 they will be merged into adjoining encoded-word(s).
524
525 Any arguments past the RAW string are taken to define a hash of options:
526
527 =over 4
528
529 =item Charset
530
531 Encode all unsafe stuff with this charset. Default is 'ISO-8859-1',
532 a.k.a. "Latin-1".
533
534 =item Detect7bit
535 B<**>
536
537 When "Encoding" option (see below) is specified as C<"a"> and "Charset"
538 option is unknown, try to detect 7-bit charset on given RAW string.
539 Default is C<"YES">.
540 When Unicode/multibyte support is disabled,
541 this option will not have any effects
542 (see L<MIME::Charset/USE_ENCODE>).
543
544 =item Encoding
545
546 The encoding to use, C<"q"> or C<"b">.
547 B<**>
548 You may also specify ``special'' values: C<"a"> will automatically choose
549 recommended encoding to use (with charset conversion if alternative
550 charset is recommended: see L<MIME::Charset>);
551 C<"s"> will choose shorter one of either C<"q"> or C<"b">.
552 B<NOTE>
553 As of release 1.005, The default was changed from C<"q">
554 (the default on MIME::Words) to C<"a">.
555
556 =item Field
557
558 Name of the mail field this string will be used in.
559 B<**>
560 Length of mail field name will be considered in the first line of
561 encoded header.
562
563 =item Mapping
564 B<**>
565
566 Specify mappings actually used for charset names.
567 C<"EXTENDED"> uses extended mappings.
568 C<"STANDARD"> uses standardized strict mappings.
569 Default is C<"EXTENDED">.
570
571 =item MaxLineLen
572 B<**>
573
574 Maximum line length excluding newline.
575 The default is 76.
576
577 =item Minimal
578 B<**>
579
580 Takes care of natural word separators (i.e. whitespaces)
581 in the text to be encoded.
582 If C<"NO"> is specified, this module will encode whole text
583 (if encoding needed) not regarding whitespaces;
584 encoded-words exceeding line length will be splitted based only on their
585 lengths.
586 Default is C<"YES">.
587
588 B<Note>:
589 As of release 0.040, default has been changed to C<"YES"> to ensure
590 compatibility with MIME::Words.
591 On earlier releases, this option was fixed to be C<"NO">.
592
593 =item Replacement
594 B<**>
595
596 See L<MIME::Charset/Error Handling>.
597
598 =back
599
600 =cut
601
602 sub encode_mimewords {
603 my $words = shift;
604 my %params = @_;
605 my $charset = uc($params{'Charset'}); # apply default later.
606 my $detect7bit = uc($params{'Detect7bit'} || $Config->{Detect7bit});
607 my $encoding = uc($params{'Encoding'} || $Config->{Encoding});
608 croak "MIME::EncWords: Unsupported encoding: $encoding"
609 unless $encoding =~ /^[ABQS]$/i;
610 my $field = $params{'Field'} || $Config->{Field} || "";
611 my $mapping = uc($params{'Mapping'} || $Config->{Mapping});
612 my $maxlinelen = $params{'MaxLineLen'};
613 $maxlinelen = $Config->{MaxLineLen} unless defined $maxlinelen;
614 my $minimal = uc($params{'Minimal'} || $Config->{Minimal});
615 my $replacement = uc($params{'Replacement'} || $Config->{Replacement});
616
617 my $charsetobj = MIME::Charset->new($charset, Mapping => $mapping);
618 my $firstlinelen = $maxlinelen - ($field? length("$field: "): 0);
619
620 unless (ref($words) eq "ARRAY") {
621 # unfold
622 $words =~ s/(?:\r?\n|\r)([\t ])/$1/g;
623 $words =~ s/\r?\n|\r/ /g;
624 # split if required
625 if ($minimal eq "YES") {
626 my @words = map {[$_, $charset]} split(/((?:\A|[\t ])[\t \x21-\x7E]+(?:[\t ]|\Z))/, $words);
627 $words = \@words;
628 } else {
629 $words = [[$words, $charset]];
630 }
631 }
632
633 # Translate / concatenate words.
634 my @triplets;
635 foreach (@$words) {
636 my ($s, $cset) = @$_;
637 my $csetobj = MIME::Charset->new($cset, Mapping => $mapping);
638 my $enc;
639
640 next unless length($s);
641
642 # Unicode string should be encoded by given charset.
643 # Unsupported charset will be fallbacked to UTF-8.
644 if (is_utf8($s) or $s =~ $WIDECHAR) {
645 unless ($csetobj->decoder) {
646 if ($s !~ $UNSAFE) {
647 $cset = "US-ASCII";
648 } elsif ($replacement =~ /^(CROAK|STRICT)$/) {
649 croak "MIME::EncWords: unsupported charset: $cset\n";
650 } else {
651 $cset = "UTF-8";
652 }
653 }
654 $csetobj = MIME::Charset->new($cset, Mapping => $mapping);
655 if ($replacement =~ /^(CROAK|STRICT)$/) {
656 $s = $csetobj->decoder->encode($s, FB_CROAK());
657 } else {
658 $s = $csetobj->decoder->encode($s, 0);
659 }
660 }
661
662 # Determine charset and encoding.
663 if ($encoding eq "A") {
664 my $obj;
665 if ($cset) {
666 $obj = $csetobj;
667 } else {
668 $obj = $charsetobj;
669 }
670 ($s, $cset, $enc) =
671 $obj->header_encode($s,
672 Detect7bit => $detect7bit,
673 Replacement => $replacement);
674 $csetobj = MIME::Charset->new($cset, Mapping => $mapping);
675 } elsif ($cset ne "US-ASCII") {
676 $cset ||= uc($charset || $Config->{Charset});
677 $csetobj = MIME::Charset->new($cset, Mapping => $mapping);
678 my $u = $s;
679 eval {
680 $u = $cset->decode($u, 0);
681 };
682 if ($@ or $u =~ $UNSAFE) {
683 $enc = $encoding;
684 } else {
685 ($cset, $enc) = ("US-ASCII", undef);
686 $csetobj = MIME::Charset->new($cset, Mapping => $mapping);
687 }
688 }
689
690 # Now no charset transformations are needed, although specified
691 # charset name might be reserved.
692 $csetobj->{InputCharset} = $cset;
693 $csetobj->{Encoder} = $csetobj->decoder;
694 $csetobj->{OutputCharset} = $csetobj->as_string;
695
696 # Concatenate adjacent ``words'' so that multibyte sequences will
697 # be handled safely.
698 # Note: Encoded-word and unencoded text must not adjoin without
699 # separating whitespace(s).
700 if (scalar(@triplets)) {
701 my ($last, $lastenc, $lastcsetobj) = @{$triplets[-1]};
702 if (uc($lastcsetobj->as_string) eq uc($csetobj->as_string) and
703 uc($lastenc) eq uc($enc) and
704 $csetobj->decoder) {
705 $triplets[-1]->[0] .= $s;
706 next;
707 } elsif (!$lastenc and $enc and $last !~ /[\t ]$/) {
708 if ($last =~ /^(.*)[\t ]([$PRINTABLE]+)$/s) {
709 $triplets[-1]->[0] = $1." ";
710 $s = $2.$s;
711 } elsif (uc($lastcsetobj->as_string) eq "US-ASCII") {
712 $triplets[-1]->[0] .= $s;
713 $triplets[-1]->[1] = $enc;
714 $triplets[-1]->[2] = $csetobj;
715 next;
716 }
717 } elsif ($lastenc and !$enc and $s !~ /^[\t ]/) {
718 if ($s =~ /^([$PRINTABLE]+)[\t ](.*)$/s) {
719 $triplets[-1]->[0] .= $1;
720 $s = " ".$2;
721 } elsif (uc($csetobj->as_string) eq "US-ASCII") {
722 $triplets[-1]->[0] .= $s;
723 next;
724 }
725 }
726 }
727 push @triplets, [$s, $enc, $csetobj];
728 }
729
730 # Split long ``words''.
731 my @splitted;
732 my $restlen = $firstlinelen;
733 my $lastlen = 0;
734 foreach (@triplets) {
735 my ($s, $enc, $csetobj) = @$_;
736
737 my $restlen = $restlen - $lastlen - 1;
738 if ($restlen < ($enc? $csetobj->encoded_header_len('', $enc): 1)) {
739 $restlen = $maxlinelen - 1;
740 }
741
742 push @splitted, &_split($s, $enc, $csetobj, $restlen, $maxlinelen);
743 my ($last, $lastenc, $lastcsetobj) = @{$splitted[-1]};
744 if ($lastenc) {
745 $lastlen = $lastcsetobj->encoded_header_len($last, $lastenc);
746 } else {
747 $lastlen = length($last);
748 }
749 }
750
751 # Do encoding.
752 my @lines;
753 my $linelen = $firstlinelen;
754 foreach (@splitted) {
755 my ($str, $encoding, $charsetobj) = @$_;
756 next unless length($str);
757
758 my $s;
759 if (!$encoding) {
760 $s = $str;
761 } else {
762 $s = &encode_mimeword($str, $encoding, $charsetobj->as_string);
763 }
764
765 my $spc = (scalar(@lines) and $lines[-1] =~ /[\t ]$/)? '': ' ';
766 if (!scalar(@lines)) {
767 $s =~ s/^[\r\n\t ]+//;
768 push @lines, $s;
769 } elsif (length($lines[-1]) + length($s) <= $linelen) {
770 $lines[-1] .= $spc.$s;
771 } else {
772 $s =~ s/^[\r\n\t ]+//;
773 push @lines, $s;
774 $linelen = $maxlinelen - 1;
775 }
776 }
777
778 join("\n ", @lines);
779 }
780
781 #------------------------------
782
783 # _split RAW, ENCODING, CHARSET_OBJECT, ROOM_OF_FIRST_LINE, MAXLINELEN
784 # Private: used by encode_mimewords() to split a string into
785 # (encoded or non-encoded) words.
786 # Returns an array of arrayrefs [SUBSTRING, ENCODING, CHARSET].
787 sub _split {
788 my $str = shift;
789 my $encoding = shift;
790 my $charset = shift;
791 my $restlen = shift;
792 my $maxlinelen = shift;
793
794 if (!$charset->as_string or $charset->as_string eq '8BIT') {# Undecodable.
795 $str =~ s/[\r\n]+[\t ]*|\x00/ /g; # Eliminate hostile characters.
796 return ([$str, undef, $charset]);
797 }
798 unless ($charset->decoder) { # Unsupported charset.
799 return ([$str, $encoding, $charset]);
800 }
801 if (!$encoding and $charset->as_string eq 'US-ASCII') {
802 return &_split_ascii($str, $restlen, $maxlinelen);
803 }
804
805 my (@splitted, $ustr, $first);
806 while (length($str)) {
807 if ($charset->encoded_header_len($str, $encoding) <= $restlen) {
808 push @splitted, [$str, $encoding, $charset];
809 last;
810 }
811 $ustr = $str;
812 $ustr = $charset->decode($ustr);
813 ($first, $str) = &_clip_unsafe($ustr, $encoding, $charset,
814 $restlen);
815 push @splitted, [$first, $encoding, $charset];
816 $restlen = $maxlinelen - 1;
817 }
818 return @splitted;
819 }
820
821 # _split_ascii RAW, ROOM_OF_FIRST_LINE, MAXLINELEN
822 # Private: used by encode_mimewords() to split an US-ASCII string into
823 # (encoded or non-encoded) words.
824 # Returns an array of arrayrefs [SUBSTRING, ENCODING, "US-ASCII"],
825 # where ENCODING is either undef or (if any unsafe sequences are
826 # included) "Q".
827 sub _split_ascii {
828 my $s = shift;
829 my $restlen = shift;
830 my $maxlinelen = shift;
831 $restlen ||= $maxlinelen - 1;
832
833 my @splitted;
834 my $obj = MIME::Charset->new("US-ASCII", Mapping => 'STANDARD');
835 foreach my $line (split(/[\r\n]+/, $s)) {
836 $line =~ s/^[\t ]+//;
837
838 if (length($line) < $restlen and $line !~ /=\?|$UNSAFE/) {
839 push @splitted, [$line, undef, $obj];
840 $restlen = $maxlinelen - 1;
841 next;
842 }
843
844 my ($spc, $enc);
845 foreach my $word (split(/([\t ]+)/, $line)) {
846 if ($word =~ /[\t ]/) {
847 $spc = $word;
848 next;
849 }
850
851 $enc = ($word =~ /=\?|$UNSAFE/)? "Q": undef;
852 if (scalar(@splitted)) {
853 my ($last, $lastenc, $lastcsetobj) =
854 @{$splitted[-1]};
855 my ($elen, $cont, $appe);
856
857 # Concatenate adjacent words so that encoded-word and
858 # unencoded text will adjoin with separating whitespace.
859 if (!$lastenc and !$enc) {
860 $elen = length($spc.$word);
861 ($cont, $appe) = ($spc.$word, "");
862 } elsif (!$lastenc and $enc) {
863 $elen = length($spc) +
864 $obj->encoded_header_len($word, "Q");
865 ($cont, $appe) = ($spc, $word);
866 } elsif ($lastenc and !$enc) {
867 $elen = length($spc.$word);
868 ($cont, $appe) = ("", $spc.$word);
869 } else {
870 $elen = $obj->encoded_header_len($spc.$word, "Q") - 15;
871 ($cont, $appe) = ($spc.$word, "");
872 }
873
874 if ($elen <= $restlen) {
875 $splitted[-1]->[0] .= $cont if length($cont);
876 push @splitted, [$appe, $enc, $obj]
877 if length($appe);
878 $restlen -= $elen;
879 next;
880 }
881 $restlen = $maxlinelen - 1;
882 }
883 push @splitted, [$word, $enc, $obj];
884 $restlen -= ($enc?
885 $obj->encoded_header_len($word, "Q"):
886 length($word));
887 }
888 }
889 return @splitted;
890 }
891
892 # _clip_unsafe UNICODE, ENCODING, CHARSET_OBJECT, ROOM_OF_FIRST_LINE
893 # Private: used by encode_mimewords() to bite off one encodable
894 # ``word'' from a Unicode string.
895 sub _clip_unsafe {
896 my $ustr = shift;
897 my $encoding = shift;
898 my $charset = shift;
899 my $restlen = shift;
900 return ("", "") unless length($ustr);
901
902 # Seek maximal division point.
903 my ($shorter, $longer) = (0, length($ustr));
904 while ($shorter < $longer) {
905 my $cur = int(($shorter + $longer + 1) / 2);
906 my $enc = $charset->encode(substr($ustr, 0, $cur));
907 my $elen = $charset->encoded_header_len($enc, $encoding);
908 if ($elen <= $restlen) {
909 $shorter = $cur;
910 } else {
911 $longer = $cur - 1;
912 }
913 }
914
915 # Make sure that combined characters won't be divided.
916 my ($fenc, $renc);
917 my $max = length($ustr);
918 while (1) {
919 $@ = '';
920 eval {
921 ($fenc, $renc) =
922 (substr($ustr, 0, $shorter), substr($ustr, $shorter));
923 $fenc = $charset->encode($fenc, FB_CROAK());
924 $renc = $charset->encode($renc, FB_CROAK());
925 };
926 last unless ($@);
927
928 $shorter++;
929 unless ($shorter < $max) { # Unencodable characters are included.
930 return ($charset->encode($ustr), "");
931 }
932 }
933
934 if (length($fenc)) {
935 return ($fenc, $renc);
936 } else {
937 return ($renc, "");
938 }
939 }
940
941 #------------------------------
942
943 =back
944
945 =head2 Configuration Files
946 B<**>
947
948 Built-in defaults of option parameters for L<"decode_mimewords"> and
949 L<"encode_mimewords"> can be overridden by configuration files:
950 F<MIME/Charset/Defaults.pm> and F<MIME/EncWords/Defaults.pm>.
951 For more details read F<MIME/EncWords/Defaults.pm.sample>.
952
953 =head1 VERSION
954
955 Consult $VERSION variable.
956
957 Development versions of this module may be found at
958 L<http://hatuka.nezumi.nu/repos/MIME-EncWords/>.
959
960 =head1 SEE ALSO
961
962 L<MIME::Charset>,
963 L<MIME::Tools>
964
965 =head1 AUTHORS
966
967 The original version of function decode_mimewords() is derived from
968 L<MIME::Words> module that was written by:
969 Eryq (F<eryq@zeegee.com>), ZeeGee Software Inc (F<http://www.zeegee.com>).
970 David F. Skoll (dfs@roaringpenguin.com) http://www.roaringpenguin.com
971
972 Other stuff are rewritten or added by:
973 Hatuka*nezumi - IKEDA Soji <hatuka(at)nezumi.nu>.
974
975 All rights reserved. This program is free software; you can redistribute
976 it and/or modify it under the same terms as Perl itself.
977
978 =cut
979
980 1;

  ViewVC Help
Powered by ViewVC 1.1.5