/[MIME-EncWords]/tags/MIME-EncWords-1.001/EncWords.pm
ViewVC logotype

Contents of /tags/MIME-EncWords-1.001/EncWords.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 24 - (show annotations) (download)
Sun Mar 9 10:27:43 2008 UTC (2 years, 5 months ago) by hatukanezumi
File size: 25073 byte(s)
Fix
1
2 package MIME::EncWords;
3 use 5.005;
4
5 =head1 NAME
6
7 MIME::EncWords - deal with RFC 2047 encoded words (improved)
8
9 =head1 SYNOPSIS
10
11 I<L<MIME::EncWords> is aimed to be another implimentation
12 of L<MIME::Words> so that it will achive more exact conformance with
13 RFC 2047 (former RFC 1522) specifications. Additionally, it contains
14 some improvements.
15 Following synopsis and descriptions are inherited from its inspirer,
16 with description of improvements and clarifications added.>
17
18 Before reading further, you should see L<MIME::Tools> to make sure that
19 you understand where this module fits into the grand scheme of things.
20 Go on, do it now. I'll wait.
21
22 Ready? Ok...
23
24 use MIME::EncWords qw(:all);
25
26 ### Decode the string into another string, forgetting the charsets:
27 $decoded = decode_mimewords(
28 'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>',
29 );
30
31 ### Split string into array of decoded [DATA,CHARSET] pairs:
32 @decoded = decode_mimewords(
33 'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>',
34 );
35
36 ### Encode a single unsafe word:
37 $encoded = encode_mimeword("\xABFran\xE7ois\xBB");
38
39 ### Encode a string, trying to find the unsafe words inside it:
40 $encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB in town");
41
42 =head1 DESCRIPTION
43
44 Fellow Americans, you probably won't know what the hell this module
45 is for. Europeans, Russians, et al, you probably do. C<:-)>.
46
47 For example, here's a valid MIME header you might get:
48
49 From: =?US-ASCII?Q?Keith_Moore?= <moore@cs.utk.edu>
50 To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>
51 CC: =?ISO-8859-1?Q?Andr=E9_?= Pirard <PIRARD@vm1.ulg.ac.be>
52 Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
53 =?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=
54 =?US-ASCII?Q?.._cool!?=
55
56 The fields basically decode to (sorry, I can only approximate the
57 Latin characters with 7 bit sequences /o and 'e):
58
59 From: Keith Moore <moore@cs.utk.edu>
60 To: Keld J/orn Simonsen <keld@dkuug.dk>
61 CC: Andr'e Pirard <PIRARD@vm1.ulg.ac.be>
62 Subject: If you can read this you understand the example... cool!
63
64 B<Supplement>: Fellow Americans, Europeans, you probably won't know
65 what the hell this module is for. East Asians, et al, you probably do.
66 C<:-)>.
67
68 For example, here's a valid MIME header you might get:
69
70 Subject: =?EUC-KR?B?sNTAuLinKGxhemluZXNzKSwgwvzB9ri7seIoaW1w?=
71 =?EUC-KR?B?YXRpZW5jZSksILGzuLgoaHVicmlzKQ==?=
72
73 The fields basically decode to (sorry, I cannot approximate the
74 non-Latin multibyte characters with any 7 bit sequences):
75
76 Subject: ???(laziness), ????(impatience), ??(hubris)
77
78 =head1 PUBLIC INTERFACE
79
80 =over 4
81
82 =cut
83
84 ### Pragmas:
85 use strict;
86 use vars qw($VERSION @EXPORT_OK %EXPORT_TAGS @ISA);
87
88 ### Exporting:
89 use Exporter;
90
91 %EXPORT_TAGS = (all => [qw(decode_mimewords
92 encode_mimeword
93 encode_mimewords)]);
94 Exporter::export_ok_tags(qw(all));
95
96 ### Inheritance:
97 @ISA = qw(Exporter);
98
99 ### Other modules:
100 use Carp;
101 use MIME::Base64;
102 use MIME::Charset qw(:trans);
103
104 my @ENCODE_SUBS = qw(FB_CROAK decode encode from_to is_utf8 resolve_alias);
105 if (MIME::Charset::USE_ENCODE) {
106 eval "use ".MIME::Charset::USE_ENCODE." \@ENCODE_SUBS;";
107 } else {
108 require MIME::Charset::_Compat;
109 for my $sub (@ENCODE_SUBS) {
110 no strict "refs";
111 *{$sub} = \&{"MIME::Charset::_Compat::$sub"};
112 }
113 }
114
115 #------------------------------
116 #
117 # Globals...
118 #
119 #------------------------------
120
121 ### The package version, both in 1.23 style *and* usable by MakeMaker:
122 $VERSION = '1.001';
123
124 ### Nonprintables (controls + x7F + 8bit):
125 #my $NONPRINT = "\\x00-\\x1F\\x7F-\\xFF";
126 my $PRINTABLE = "\\x21-\\x7E";
127 my $NONPRINT = qr{[^$PRINTABLE]}; # Improvement: Unicode support.
128 my $UNSAFE = qr{[^\x01-\x20$PRINTABLE]};
129 my $WIDECHAR = qr{[^\x00-\xFF]};
130
131 ### Max line length:
132 my $MAXLINELEN = 76;
133
134 #------------------------------
135
136 # _decode_B STRING
137 # Private: used by _decode_header() to decode "B" encoding.
138 # Improvement by this module: sanity check on encoded sequence.
139 sub _decode_B {
140 my $str = shift;
141 unless ((length($str) % 4 == 0) and
142 $str =~ m|^[A-Za-z0-9+/]+={0,2}$|) {
143 return undef;
144 }
145 return decode_base64($str);
146 }
147
148 # _decode_Q STRING
149 # Private: used by _decode_header() to decode "Q" encoding, which is
150 # almost, but not exactly, quoted-printable. :-P
151 sub _decode_Q {
152 my $str = shift;
153 $str =~ s/_/\x20/g; # RFC-1522, Q rule 2
154 $str =~ s/=([\da-fA-F]{2})/pack("C", hex($1))/ge; # RFC-1522, Q rule 1
155 $str;
156 }
157
158 # _encode_B STRING
159 # Private: used by encode_mimeword() to encode "B" encoding.
160 sub _encode_B {
161 my $str = shift;
162 encode_base64($str, '');
163 }
164
165 # _encode_Q STRING
166 # Private: used by encode_mimeword() to encode "Q" encoding, which is
167 # almost, but not exactly, quoted-printable. :-P
168 # Improvement by this module: Spaces are escaped by ``_''.
169 sub _encode_Q {
170 my $str = shift;
171 # $str =~ s{([_\?\=$NONPRINT])}{sprintf("=%02X", ord($1))}eog;
172 $str =~ s{(\x20)|([_?=]|$NONPRINT)}{
173 defined $1? "_": sprintf("=%02X", ord($2))
174 }eog;
175 $str;
176 }
177
178 #------------------------------
179
180 =item decode_mimewords ENCODED, [OPTS...]
181
182 I<Function.>
183 Go through the string looking for RFC-1522-style "Q"
184 (quoted-printable, sort of) or "B" (base64) encoding, and decode them.
185
186 B<In an array context,> splits the ENCODED string into a list of decoded
187 C<[DATA, CHARSET]> pairs, and returns that list. Unencoded
188 data are returned in a 1-element array C<[DATA]>, giving an effective
189 CHARSET of C<undef>.
190
191 $enc = '=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>';
192 foreach (decode_mimewords($enc)) {
193 print "", ($_[1] || 'US-ASCII'), ": ", $_[0], "\n";
194 }
195
196 B<In a scalar context,> joins the "data" elements of the above
197 list together, and returns that. I<Warning: this is information-lossy,>
198 and probably I<not> what you want, but if you know that all charsets
199 in the ENCODED string are identical, it might be useful to you.
200 (Before you use this, please see L<MIME::WordDecoder/unmime>,
201 which is probably what you want.)
202 B<Note>: See also "Charset" option below.
203
204 In the event of a syntax error, $@ will be set to a description
205 of the error, but parsing will continue as best as possible (so as to
206 get I<something> back when decoding headers).
207 $@ will be false if no error was detected.
208
209 Any arguments past the ENCODED string are taken to define a hash of options:
210
211 =over 4
212
213 =item Charset
214
215 B<Improvement by this module>:
216 Name of character set by which data elements in scalar context
217 will be converted.
218 If this option is specified as special value C<"_UNICODE_">,
219 returned value will be Unicode string.
220
221 When Unicode/multibyte support is disabled
222 (see L<MIME::Charset/USE_ENCODE>),
223 this option will not have any effects.
224
225 B<Note>:
226 This feature is still information-lossy, I<except> when C<"_UNICODE_"> is
227 specified.
228
229 =item Detect7bit
230
231 B<Improvement by this modlue>:
232 Try to detect 7-bit charset on unencoded portions.
233 Default is C<"YES">.
234 When Unicode/multibyte support is disabled,
235 this option will not have any effects
236 (see L<MIME::Charset/USE_ENCODE>).
237 B<Note>:
238 This feature was introduced at release 1.000.
239
240 =item Field
241
242 Name of the mail field this string came from. I<Currently ignored.>
243
244 =back
245
246 B<Improvement by this module>:
247 Adjacent encoded-words with same charset will be concatenated
248 to handle multibyte sequences safely.
249
250 B<Change by this module>:
251 Malformed base64 encoded-words will be kept encoded.
252 In this case $@ will be set.
253
254 B<Compatibility with MIME::Words>:
255 Whitespaces surrounding unencoded data will not be stripped.
256
257 =cut
258
259 sub decode_mimewords {
260 my $encstr = shift;
261 my %params = @_;
262 my $cset = $params{"Charset"};
263 my $detect7bit = uc($params{'Detect7bit'} || "YES");
264 my @tokens;
265 $@ = ''; ### error-return
266
267 ### Decode:
268 my ($word, $charset, $encoding, $enc, $dec);
269 my $spc = '';
270 pos($encstr) = 0;
271 while (1) {
272 last if (pos($encstr) >= length($encstr));
273 my $pos = pos($encstr); ### save it
274
275 ### Case 1: are we looking at "=?..?..?="?
276 if ($encstr =~ m{\G # from where we left off..
277 =\?([^?]*) # "=?" + charset +
278 \?([bq]) # "?" + encoding +
279 \?([^?]+) # "?" + data maybe with spcs +
280 \?= # "?="
281 ([\r\n\t ]*)
282 }xgi) {
283 ($word, $charset, $encoding, $enc) = ($&, $1, lc($2), $3);
284 my $tspc = $4;
285 if ($encoding eq 'q') {
286 $dec = _decode_Q($enc);
287 } else {
288 $dec = _decode_B($enc);
289 }
290 unless (defined $dec) {
291 $@ .= qq|Illegal sequence in "$word" (pos $pos)\n|;
292 push @tokens, [$spc.$word];
293 $spc = '';
294 next;
295 }
296
297 if (scalar(@tokens) and
298 lc($charset) eq lc(${$tokens[-1]}[1]) and
299 resolve_alias($charset)) { # Concat words if possible.
300 ${$tokens[-1]}[0] .= $dec;
301 } else {
302 push @tokens, [$dec, $charset];
303 }
304 $spc = $tspc;
305 next;
306 }
307
308 ### Case 2: are we looking at a bad "=?..." prefix?
309 ### We need this to detect problems for case 3, which stops at "=?":
310 pos($encstr) = $pos; # reset the pointer.
311 if ($encstr =~ m{\G=\?}xg) {
312 $@ .= qq|unterminated "=?..?..?=" in "$encstr" (pos $pos)\n|;
313 push @tokens, [$spc.'=?'];
314 $spc = '';
315 next;
316 }
317
318 ### Case 3: are we looking at ordinary text?
319 pos($encstr) = $pos; # reset the pointer.
320 if ($encstr =~ m{\G # from where we left off...
321 (.*? # shortest possible string,
322 \n*) # followed by 0 or more NLs,
323 (?=(\Z|=\?)) # terminated by "=?" or EOS
324 }xgs) {
325 length($1) or croak "MIME::EncWords: internal logic err: empty token\n";
326 push @tokens, [$spc.$1];
327 $spc = '';
328 next;
329 }
330
331 ### Case 4: bug!
332 croak "MIME::EncWords: unexpected case:\n($encstr) pos $pos\n\t".
333 "Please alert developer.\n";
334 }
335 push @tokens, [$spc] if $spc;
336
337 # Detect 7-bit charset
338 if ($detect7bit ne "NO") {
339 foreach my $t (@tokens) {
340 unless ($t->[1]) {
341 my $charset = &MIME::Charset::_detect_7bit_charset($t->[0]);
342 if ($charset and $charset ne &MIME::Charset::default()) {
343 $t->[0] =~ s/[\r\n\t ]+/ /g;
344 $t->[1] = $charset;
345 }
346 }
347 }
348 }
349
350 return (wantarray ? @tokens : join('',map {
351 &_convert($_->[0], $_->[1], $cset)
352 } @tokens));
353 }
354
355 #------------------------------
356
357 # _convert RAW, FROMCHARSET, TOCHARSET
358 # Private: used by encode_mimewords() to convert string by other charset
359 # or to decode to Unicode.
360 # When source charset is unknown and Unicode string is requested, at first
361 # try well-formed UTF-8 then fallback to ISO-8859-1 so that almost all
362 # non-ASCII bytes will be preserved.
363 sub _convert($$$) {
364 my $s = shift;
365 my $charset = shift || "";
366 my $cset = shift;
367 return $s unless MIME::Charset::USE_ENCODE;
368 return $s unless $cset;
369 return $s if uc($charset) eq uc($cset);
370
371 my $preserveerr = $@;
372
373 my $converted = $s;
374 if (is_utf8($s) or $s =~ $WIDECHAR) {
375 if ($cset ne "_UNICODE_") {
376 $converted = encode($cset, $converted);
377 }
378 } elsif ($cset eq "_UNICODE_") {
379 if (!resolve_alias($charset)) {
380 if ($s =~ $UNSAFE) {
381 $@ = '';
382 eval {
383 $converted = decode("UTF-8", $converted, FB_CROAK());
384 };
385 if ($@) {
386 $converted = $s;
387 $converted = decode("ISO-8859-1", $converted);
388 }
389 }
390 } else {
391 $converted = decode($charset, $converted);
392 }
393 } elsif (resolve_alias($charset)) {
394 from_to($converted, $charset, $cset);
395 }
396
397 $@ = $preserveerr;
398 return $converted;
399 }
400
401 #------------------------------
402
403 =item encode_mimeword RAW, [ENCODING], [CHARSET]
404
405 I<Function.>
406 Encode a single RAW "word" that has unsafe characters.
407 The "word" will be encoded in its entirety.
408
409 ### Encode "<<Franc,ois>>":
410 $encoded = encode_mimeword("\xABFran\xE7ois\xBB");
411
412 You may specify the ENCODING (C<"Q"> or C<"B">), which defaults to C<"Q">.
413 B<Improvement by this module>:
414 You may also specify it as ``special'' value: C<"S"> to choose shorter
415 one of either C<"Q"> or C<"B">.
416
417 You may specify the CHARSET, which defaults to C<iso-8859-1>.
418
419 B<Change by this module>:
420 Spaces will be escaped with ``_'' by C<"Q"> encoding.
421
422 =cut
423
424 sub encode_mimeword {
425 my $word = shift;
426 my $encoding = uc(shift || 'Q');
427 my $charset = uc(shift || 'ISO-8859-1');
428
429 my $encstr;
430 if ($encoding eq 'Q') {
431 $encstr = &_encode_Q($word);
432 } elsif ($encoding eq "S") {
433 my ($B, $Q) = (&_encode_B($word), &_encode_Q($word));
434 if (length($B) < length($Q)) {
435 $encoding = "B";
436 $encstr = $B;
437 } else {
438 $encoding = "Q";
439 $encstr = $Q;
440 }
441 } else { # "B"
442 $encoding = "B";
443 $encstr = &_encode_B($word);
444 }
445
446 "=?$charset?$encoding?$encstr?=";
447 }
448
449 #------------------------------
450
451 =item encode_mimewords RAW, [OPTS]
452
453 I<Function.>
454 Given a RAW string, try to find and encode all "unsafe" sequences
455 of characters:
456
457 ### Encode a string with some unsafe "words":
458 $encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB");
459
460 Returns the encoded string.
461
462 B<Improvement by this module>:
463 RAW may be a Unicode string when Unicode/multibyte support is enabled
464 (see L<MIME::Charset/USE_ENCODE>).
465 Furthermore, RAW may be a reference to that returned
466 by L<"decode_mimewords"> on array context. In latter case "Charset"
467 option (see below) will be overridden (see also notes below).
468
469 Any arguments past the RAW string are taken to define a hash of options:
470
471 =over 4
472
473 =item Charset
474
475 Encode all unsafe stuff with this charset. Default is 'ISO-8859-1',
476 a.k.a. "Latin-1".
477
478 =item Detect7bit
479
480 B<Improvement by this modlue>:
481 When "Encoding" option (see below) is specified as C<"a"> and "Charset"
482 option is unknown, try to detect 7-bit charset on given RAW string.
483 Default is C<"YES">.
484 When Unicode/multibyte support is disabled,
485 this option will not have any effects
486 (see L<MIME::Charset/USE_ENCODE>).
487
488 =item Encoding
489
490 The encoding to use, C<"q"> or C<"b">. The default is C<"q">.
491 B<Improvement by this module>:
492 You may also specify ``special'' values: C<"a"> will automatically choose
493 recommended encoding to use (with charset conversion if alternative
494 charset is recommended: see L<MIME::Charset>);
495 C<"s"> will choose shorter one of either C<"q"> or C<"b">.
496
497 =item Field
498
499 Name of the mail field this string will be used in.
500 B<Improvement by this module>:
501 Length of mail field name will be considered in the first line of
502 encoded header.
503
504 =item Minimal
505
506 B<Improvement by this module>:
507 Takes care of natural word separators (i.e. whitespaces)
508 in the text to be encoded.
509 If C<"NO"> is specified, this module will encode whole text
510 (if encoding needed) not regarding whitespaces;
511 encoded-words exceeding line length will be splitted based only on their
512 lengths.
513 Default is C<"YES">.
514
515 B<Note>:
516 As of release 0.040, default has been changed to C<"YES"> to ensure
517 compatibility with MIME::Words.
518 On earlier releases, this option was fixed to be C<"NO">.
519
520 =item Replacement
521
522 B<Improvement by this module>:
523 See L<MIME::Charset/ERROR HANDLING>.
524 B<Note>:
525 This feature was introduced at release 1.000.
526
527 =back
528
529 B<Notes on improvement by this module>:
530 When RAW is an arrayref,
531 adjacent encoded-words are concatenated. Then they are splitted taking
532 care of character boundaries of multibyte sequences, when Unicode/multibyte
533 support is enabled.
534 Portions for unencoded data should include surrounding whitespace(s), or
535 they will be merged into adjoining encoded word(s).
536
537 =cut
538
539 sub encode_mimewords {
540 my $words = shift;
541 my %params = @_;
542 my $charset = uc($params{'Charset'});
543 my $detect7bit = uc($params{'Detect7bit'} || "YES");
544 my $encoding = uc($params{'Encoding'});
545 my $header_name = $params{'Field'};
546 my $minimal = uc($params{'Minimal'} || "YES");
547 my $replacement = uc($params{'Replacement'} || 'DEFAULT');
548 my $firstlinelen = $MAXLINELEN;
549 if ($header_name) {
550 $firstlinelen -= length($header_name.': ');
551 }
552
553 unless (ref($words) eq "ARRAY") {
554 if ($minimal eq "YES") {
555 my @words = map {[$_, $charset]} split(/((?:\A|[\t ])[\t \x21-\x7E]+(?:[\t ]|\Z))/, $words);
556 $words = \@words;
557 } else {
558 $words = [[$words, $charset]];
559 }
560 }
561
562 # Translate / concatenate words.
563 my @triplets;
564 foreach (@$words) {
565 my ($s, $cset) = @$_;
566 my $enc;
567
568 next unless length($s);
569
570 # Unicode string should be encoded by given charset.
571 # Unsupported charset will be fallbacked to UTF-8.
572 if (is_utf8($s) or $s =~ $WIDECHAR) {
573 unless (resolve_alias($cset)) {
574 if ($s !~ $UNSAFE) {
575 $cset = "US-ASCII";
576 } elsif ($replacement =~ '^(CROAK|STRICT)$') {
577 croak "MIME::EncWords: unsupported charset: $cset\n";
578 } else {
579 $cset = "UTF-8";
580 }
581 }
582 if ($replacement =~ /^(CROAK|STRICT)$/) {
583 $s = encode($cset, $s, FB_CROAK());
584 } else {
585 $s = encode($cset, $s);
586 }
587 }
588
589 # Determine charset and encoding.
590 if ($encoding eq "A") {
591 ($s, $cset, $enc) =
592 header_encode($s, $cset || $charset,
593 Detect7bit => $detect7bit,
594 Replacement => $replacement);
595 } else {
596 $cset ||= ($charset || ($s !~ $UNSAFE)? "US-ASCII": "ISO-8859-1");
597 my @spec =
598 header_encode($s, $cset,
599 Detect7bit => $detect7bit,
600 Replacement => $replacement);
601 if ($spec[0] eq $s and $spec[1] eq "US-ASCII" and !$spec[2]) {
602 ($cset, $enc) = ("US-ASCII", undef);
603 } else {
604 $enc = $encoding || "Q";
605 }
606 }
607
608 # Concatenate adjacent ``words'' so that multibyte sequences will
609 # be handled safely.
610 # Note: Encoded-word and unencoded text must not adjoin without
611 # separating whitespace(s).
612 if (scalar(@triplets)) {
613 my ($last, $lastenc, $lastcset) = @{$triplets[-1]};
614 if (uc($lastcset) eq uc($cset) and uc($lastenc) eq uc($enc) and
615 resolve_alias($cset)) {
616 ${$triplets[-1]}[0] .= $s;
617 next;
618 } elsif (!$lastenc and $enc and $last !~ /[\t ]$/) {
619 if ($last =~ /^(.*)[\t ]([$PRINTABLE]+)$/s) {
620 ${$triplets[-1]}[0] = $1." ";
621 $s = $2.$s;
622 } elsif (uc($lastcset) eq "US-ASCII") {
623 ${$triplets[-1]}[0] .= $s;
624 ${$triplets[-1]}[1] = $enc;
625 ${$triplets[-1]}[2] = $cset;
626 next;
627 }
628 } elsif ($lastenc and !$enc and $s !~ /^[\t ]/) {
629 if ($s =~ /^([$PRINTABLE]+)[\t ](.*)$/s) {
630 ${$triplets[-1]}[0] .= $1;
631 $s = " ".$2;
632 } elsif (uc($cset) eq "US-ASCII") {
633 ${$triplets[-1]}[0] .= $s;
634 next;
635 }
636 }
637 }
638 push @triplets, [$s, $enc, $cset];
639 }
640
641 # Split long ``words''.
642 my @splitted;
643 my $restlen = $firstlinelen;
644 my $lastlen = 0;
645 foreach (@triplets) {
646 my ($s, $enc, $cset) = @$_;
647
648 my $restlen = $restlen - $lastlen - 1;
649 if ($restlen < ($enc? encoded_header_len('', $enc, $cset): 1)) {
650 $restlen = $MAXLINELEN - 1;
651 }
652
653 push @splitted, &_split($s, $enc, $cset, $restlen);
654 my ($last, $lastenc, $lastcset) = @{$splitted[-1]};
655 if ($lastenc) {
656 $lastlen = encoded_header_len($last, $lastenc, $lastcset);
657 } else {
658 $lastlen = length($last);
659 }
660 }
661
662 # Do encoding.
663 my @lines;
664 my $linelen = $firstlinelen;
665 foreach (@splitted) {
666 my ($str, $encoding, $charset) = @$_;
667 next unless length($str);
668
669 my $s;
670 if (!$encoding) {
671 $s = $str;
672 } else {
673 $s = &encode_mimeword($str, $encoding, $charset);
674 }
675
676 my $spc = (scalar(@lines) and $lines[-1] =~ /[\t ]$/)? '': ' ';
677 if (!scalar(@lines)) {
678 $s =~ s/^[\r\n\t ]+//;
679 push @lines, $s;
680 } elsif (length($lines[-1]) + length($s) <= $linelen) {
681 $lines[-1] .= $spc.$s;
682 } else {
683 $s =~ s/^[\r\n\t ]+//;
684 push @lines, $s;
685 $linelen = $MAXLINELEN - 1;
686 }
687 }
688
689 join("\n ", @lines);
690 }
691
692 #------------------------------
693
694 # _split RAW, ENCODING, CHARSET, ROOM_OF_FIRST_LINE
695 # Private: used by encode_mimewords() to split a string into
696 # (encoded or non-encoded) words.
697 # Returns an array of arrayrefs [SUBSTRING, ENCODING, CHARSET].
698 sub _split {
699 my $str = shift;
700 my $encoding = shift;
701 my $charset = shift;
702 my $restlen = shift;
703
704 if (!$charset or $charset eq '8BIT') { # Undecodable.
705 $str =~ s/[\r\n]+[\t ]*|\x00/ /g; # Eliminate hostile characters.
706 return ([$str, undef, $charset]);
707 }
708 unless (resolve_alias($charset)) { # Unsupported charset.
709 return ([$str, $encoding, $charset]);
710 }
711 if (!$encoding and $charset eq 'US-ASCII') {
712 return &_split_ascii($str, $restlen);
713 }
714
715 my (@splitted, $ustr, $first);
716 while (length($str)) {
717 if (encoded_header_len($str, $encoding, $charset) <= $restlen) {
718 push @splitted, [$str, $encoding, $charset];
719 last;
720 }
721 $ustr = $str;
722 $ustr = decode($charset, $ustr);
723 ($first, $str) = &_clip_unsafe($ustr, $encoding, $charset, $restlen);
724 push @splitted, [$first, $encoding, $charset];
725 $restlen = $MAXLINELEN - 1;
726 }
727 return @splitted;
728 }
729
730 # _split_ascii RAW, ROOM_OF_FIRST_LINE
731 # Private: used by encode_mimewords() to split an US-ASCII string into
732 # (encoded or non-encoded) words.
733 # Returns an array of arrayrefs [SUBSTRING, ENCODING, "US-ASCII"],
734 # where ENCODING is either undef or (if any unsafe sequences are
735 # included) "Q".
736 sub _split_ascii {
737 my $s = shift;
738 my $restlen = shift || $MAXLINELEN - 1;
739
740 my @splitted;
741 foreach my $line (split(/[\r\n]+/, $s)) {
742 $line =~ s/^[\t ]+//;
743
744 if (length($line) < $restlen and $line !~ /=\?|$UNSAFE/) {
745 push @splitted, [$line, undef, "US-ASCII"];
746 $restlen = $MAXLINELEN - 1;
747 next;
748 }
749
750 my ($spc, $enc);
751 foreach my $word (split(/([\t ]+)/, $line)) {
752 if ($word =~ /[\t ]/) {
753 $spc = $word;
754 next;
755 }
756
757 $enc = ($word =~ /=\?|$UNSAFE/)? "Q": undef;
758 if (scalar(@splitted)) {
759 my ($last, $lastenc, $lastcset) = @{$splitted[-1]};
760 my ($elen, $cont, $appe);
761
762 # Concatenate adjacent words so that encoded-word and
763 # unencoded text will adjoin with separating whitespace.
764 if (!$lastenc and !$enc) {
765 $elen = length($spc.$word);
766 ($cont, $appe) = ($spc.$word, "");
767 } elsif (!$lastenc and $enc) {
768 $elen = length($spc) +
769 encoded_header_len($word, "Q", "US-ASCII");
770 ($cont, $appe) = ($spc, $word);
771 } elsif ($lastenc and !$enc) {
772 $elen = length($spc.$word);
773 ($cont, $appe) = ("", $spc.$word);
774 } else {
775 $elen = encoded_header_len($spc.$word, "Q",
776 "US-ASCII") - 15;
777 ($cont, $appe) = ($spc.$word, "");
778 }
779
780 if ($elen <= $restlen) {
781 ${$splitted[-1]}[0] .= $cont if length($cont);
782 push @splitted, [$appe, $enc, "US-ASCII"] if length($appe);
783 $restlen -= $elen;
784 next;
785 }
786 $restlen = $MAXLINELEN - 1;
787 }
788 push @splitted, [$word, $enc, "US-ASCII"];
789 $restlen -= ($enc?
790 encoded_header_len($word, "Q", "US-ASCII"):
791 length($word));
792 }
793 }
794 return @splitted;
795 }
796
797 # _clip_unsafe UNICODE, ENCODING, CHARSET, ROOM_OF_FIRST_LINE
798 # Private: used by encode_mimewords() to bite off one encodable
799 # ``word'' from a Unicode string.
800 sub _clip_unsafe {
801 my $ustr = shift;
802 my $encoding = shift;
803 my $charset = shift;
804 my $restlen = shift;
805 return ("", "") unless length($ustr);
806
807 # Seek maximal division point.
808 my ($shorter, $longer) = (0, length($ustr));
809 while ($shorter < $longer) {
810 my $cur = int(($shorter + $longer + 1) / 2);
811 my $enc = encode($charset, substr($ustr, 0, $cur));
812 my $elen = encoded_header_len($enc, $encoding, $charset);
813 if ($elen <= $restlen) {
814 $shorter = $cur;
815 } else {
816 $longer = $cur - 1;
817 }
818 }
819
820 # Make sure that combined characters won't be divided.
821 my ($fenc, $renc);
822 my $max = length($ustr);
823 while (1) {
824 $@ = '';
825 eval {
826 ($fenc, $renc) =
827 (substr($ustr, 0, $shorter), substr($ustr, $shorter));
828 $fenc = encode($charset, $fenc, FB_CROAK());
829 $renc = encode($charset, $renc, FB_CROAK());
830 };
831 last unless ($@);
832
833 $shorter++;
834 unless ($shorter < $max) { # Unencodable characters are included.
835 return (encode($charset, $ustr), "");
836 }
837 }
838
839 if (length($fenc)) {
840 return ($fenc, $renc);
841 } else {
842 return ($renc, "");
843 }
844 }
845
846 #------------------------------
847
848 =back
849
850 =head1 VERSION
851
852 Consult $VERSION variable.
853
854 Development versions of this module may be found at
855 L<http://hatuka.nezumi.nu/repos/MIME-EncWords/>.
856
857 =head1 SEE ALSO
858
859 L<MIME::Charset>,
860 L<MIME::Tools>
861
862 =head1 AUTHORS
863
864 The original version of function decode_mimewords() is derived from
865 L<MIME::Words> module that was written by:
866 Eryq (F<eryq@zeegee.com>), ZeeGee Software Inc (F<http://www.zeegee.com>).
867 David F. Skoll (dfs@roaringpenguin.com) http://www.roaringpenguin.com
868
869 Other stuff are rewritten or added by:
870 Hatuka*nezumi - IKEDA Soji <hatuka(at)nezumi.nu>.
871
872 All rights reserved. This program is free software; you can redistribute
873 it and/or modify it under the same terms as Perl itself.
874
875 =cut
876
877 1;

  ViewVC Help
Powered by ViewVC 1.1.5