| 1 |
|
| 2 |
package MIME::EncWords;
|
| 3 |
use 5.005;
|
| 4 |
|
| 5 |
=head1 NAME
|
| 6 |
|
| 7 |
MIME::EncWords - deal with RFC 2047 encoded words (improved)
|
| 8 |
|
| 9 |
=head1 SYNOPSIS
|
| 10 |
|
| 11 |
I<L<MIME::EncWords> is aimed to be another implimentation
|
| 12 |
of L<MIME::Words> so that it will achive more exact conformance with
|
| 13 |
RFC 2047 (former RFC 1522) specifications. Additionally, it contains
|
| 14 |
some improvements.
|
| 15 |
Following synopsis and descriptions are inherited from its inspirer,
|
| 16 |
with description of improvements and clarifications added.>
|
| 17 |
|
| 18 |
Before reading further, you should see L<MIME::Tools> to make sure that
|
| 19 |
you understand where this module fits into the grand scheme of things.
|
| 20 |
Go on, do it now. I'll wait.
|
| 21 |
|
| 22 |
Ready? Ok...
|
| 23 |
|
| 24 |
use MIME::EncWords qw(:all);
|
| 25 |
|
| 26 |
### Decode the string into another string, forgetting the charsets:
|
| 27 |
$decoded = decode_mimewords(
|
| 28 |
'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>',
|
| 29 |
);
|
| 30 |
|
| 31 |
### Split string into array of decoded [DATA,CHARSET] pairs:
|
| 32 |
@decoded = decode_mimewords(
|
| 33 |
'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>',
|
| 34 |
);
|
| 35 |
|
| 36 |
### Encode a single unsafe word:
|
| 37 |
$encoded = encode_mimeword("\xABFran\xE7ois\xBB");
|
| 38 |
|
| 39 |
### Encode a string, trying to find the unsafe words inside it:
|
| 40 |
$encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB in town");
|
| 41 |
|
| 42 |
=head1 DESCRIPTION
|
| 43 |
|
| 44 |
Fellow Americans, you probably won't know what the hell this module
|
| 45 |
is for. Europeans, Russians, et al, you probably do. C<:-)>.
|
| 46 |
|
| 47 |
For example, here's a valid MIME header you might get:
|
| 48 |
|
| 49 |
From: =?US-ASCII?Q?Keith_Moore?= <moore@cs.utk.edu>
|
| 50 |
To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>
|
| 51 |
CC: =?ISO-8859-1?Q?Andr=E9_?= Pirard <PIRARD@vm1.ulg.ac.be>
|
| 52 |
Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
|
| 53 |
=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=
|
| 54 |
=?US-ASCII?Q?.._cool!?=
|
| 55 |
|
| 56 |
The fields basically decode to (sorry, I can only approximate the
|
| 57 |
Latin characters with 7 bit sequences /o and 'e):
|
| 58 |
|
| 59 |
From: Keith Moore <moore@cs.utk.edu>
|
| 60 |
To: Keld J/orn Simonsen <keld@dkuug.dk>
|
| 61 |
CC: Andr'e Pirard <PIRARD@vm1.ulg.ac.be>
|
| 62 |
Subject: If you can read this you understand the example... cool!
|
| 63 |
|
| 64 |
B<Supplement>: Fellow Americans, Europeans, you probably won't know
|
| 65 |
what the hell this module is for. East Asians, et al, you probably do.
|
| 66 |
C<:-)>.
|
| 67 |
|
| 68 |
For example, here's a valid MIME header you might get:
|
| 69 |
|
| 70 |
Subject: =?EUC-KR?B?sNTAuLinKGxhemluZXNzKSwgwvzB9ri7seIoaW1w?=
|
| 71 |
=?EUC-KR?B?YXRpZW5jZSksILGzuLgoaHVicmlzKQ==?=
|
| 72 |
|
| 73 |
The fields basically decode to (sorry, I cannot approximate the
|
| 74 |
non-Latin multibyte characters with any 7 bit sequences):
|
| 75 |
|
| 76 |
Subject: ???(laziness), ????(impatience), ??(hubris)
|
| 77 |
|
| 78 |
=head1 PUBLIC INTERFACE
|
| 79 |
|
| 80 |
=over 4
|
| 81 |
|
| 82 |
=cut
|
| 83 |
|
| 84 |
### Pragmas:
|
| 85 |
use strict;
|
| 86 |
use vars qw($VERSION @EXPORT_OK %EXPORT_TAGS @ISA);
|
| 87 |
|
| 88 |
### Exporting:
|
| 89 |
use Exporter;
|
| 90 |
|
| 91 |
%EXPORT_TAGS = (all => [qw(decode_mimewords
|
| 92 |
encode_mimeword
|
| 93 |
encode_mimewords)]);
|
| 94 |
Exporter::export_ok_tags(qw(all));
|
| 95 |
|
| 96 |
### Inheritance:
|
| 97 |
@ISA = qw(Exporter);
|
| 98 |
|
| 99 |
### Other modules:
|
| 100 |
use Carp;
|
| 101 |
use MIME::Base64;
|
| 102 |
use MIME::Charset qw(:trans);
|
| 103 |
|
| 104 |
my @ENCODE_SUBS = qw(FB_CROAK decode encode from_to is_utf8 resolve_alias);
|
| 105 |
if (MIME::Charset::USE_ENCODE) {
|
| 106 |
eval "use ".MIME::Charset::USE_ENCODE." \@ENCODE_SUBS;";
|
| 107 |
} else {
|
| 108 |
require MIME::Charset::_Compat;
|
| 109 |
for my $sub (@ENCODE_SUBS) {
|
| 110 |
no strict "refs";
|
| 111 |
*{$sub} = \&{"MIME::Charset::_Compat::$sub"};
|
| 112 |
}
|
| 113 |
}
|
| 114 |
|
| 115 |
#------------------------------
|
| 116 |
#
|
| 117 |
# Globals...
|
| 118 |
#
|
| 119 |
#------------------------------
|
| 120 |
|
| 121 |
### The package version, both in 1.23 style *and* usable by MakeMaker:
|
| 122 |
$VERSION = '1.001';
|
| 123 |
|
| 124 |
### Nonprintables (controls + x7F + 8bit):
|
| 125 |
#my $NONPRINT = "\\x00-\\x1F\\x7F-\\xFF";
|
| 126 |
my $PRINTABLE = "\\x21-\\x7E";
|
| 127 |
my $NONPRINT = qr{[^$PRINTABLE]}; # Improvement: Unicode support.
|
| 128 |
my $UNSAFE = qr{[^\x01-\x20$PRINTABLE]};
|
| 129 |
my $WIDECHAR = qr{[^\x00-\xFF]};
|
| 130 |
|
| 131 |
### Max line length:
|
| 132 |
my $MAXLINELEN = 76;
|
| 133 |
|
| 134 |
#------------------------------
|
| 135 |
|
| 136 |
# _decode_B STRING
|
| 137 |
# Private: used by _decode_header() to decode "B" encoding.
|
| 138 |
# Improvement by this module: sanity check on encoded sequence.
|
| 139 |
sub _decode_B {
|
| 140 |
my $str = shift;
|
| 141 |
unless ((length($str) % 4 == 0) and
|
| 142 |
$str =~ m|^[A-Za-z0-9+/]+={0,2}$|) {
|
| 143 |
return undef;
|
| 144 |
}
|
| 145 |
return decode_base64($str);
|
| 146 |
}
|
| 147 |
|
| 148 |
# _decode_Q STRING
|
| 149 |
# Private: used by _decode_header() to decode "Q" encoding, which is
|
| 150 |
# almost, but not exactly, quoted-printable. :-P
|
| 151 |
sub _decode_Q {
|
| 152 |
my $str = shift;
|
| 153 |
$str =~ s/_/\x20/g; # RFC-1522, Q rule 2
|
| 154 |
$str =~ s/=([\da-fA-F]{2})/pack("C", hex($1))/ge; # RFC-1522, Q rule 1
|
| 155 |
$str;
|
| 156 |
}
|
| 157 |
|
| 158 |
# _encode_B STRING
|
| 159 |
# Private: used by encode_mimeword() to encode "B" encoding.
|
| 160 |
sub _encode_B {
|
| 161 |
my $str = shift;
|
| 162 |
encode_base64($str, '');
|
| 163 |
}
|
| 164 |
|
| 165 |
# _encode_Q STRING
|
| 166 |
# Private: used by encode_mimeword() to encode "Q" encoding, which is
|
| 167 |
# almost, but not exactly, quoted-printable. :-P
|
| 168 |
# Improvement by this module: Spaces are escaped by ``_''.
|
| 169 |
sub _encode_Q {
|
| 170 |
my $str = shift;
|
| 171 |
# $str =~ s{([_\?\=$NONPRINT])}{sprintf("=%02X", ord($1))}eog;
|
| 172 |
$str =~ s{(\x20)|([_?=]|$NONPRINT)}{
|
| 173 |
defined $1? "_": sprintf("=%02X", ord($2))
|
| 174 |
}eog;
|
| 175 |
$str;
|
| 176 |
}
|
| 177 |
|
| 178 |
#------------------------------
|
| 179 |
|
| 180 |
=item decode_mimewords ENCODED, [OPTS...]
|
| 181 |
|
| 182 |
I<Function.>
|
| 183 |
Go through the string looking for RFC-1522-style "Q"
|
| 184 |
(quoted-printable, sort of) or "B" (base64) encoding, and decode them.
|
| 185 |
|
| 186 |
B<In an array context,> splits the ENCODED string into a list of decoded
|
| 187 |
C<[DATA, CHARSET]> pairs, and returns that list. Unencoded
|
| 188 |
data are returned in a 1-element array C<[DATA]>, giving an effective
|
| 189 |
CHARSET of C<undef>.
|
| 190 |
|
| 191 |
$enc = '=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>';
|
| 192 |
foreach (decode_mimewords($enc)) {
|
| 193 |
print "", ($_[1] || 'US-ASCII'), ": ", $_[0], "\n";
|
| 194 |
}
|
| 195 |
|
| 196 |
B<In a scalar context,> joins the "data" elements of the above
|
| 197 |
list together, and returns that. I<Warning: this is information-lossy,>
|
| 198 |
and probably I<not> what you want, but if you know that all charsets
|
| 199 |
in the ENCODED string are identical, it might be useful to you.
|
| 200 |
(Before you use this, please see L<MIME::WordDecoder/unmime>,
|
| 201 |
which is probably what you want.)
|
| 202 |
B<Note>: See also "Charset" option below.
|
| 203 |
|
| 204 |
In the event of a syntax error, $@ will be set to a description
|
| 205 |
of the error, but parsing will continue as best as possible (so as to
|
| 206 |
get I<something> back when decoding headers).
|
| 207 |
$@ will be false if no error was detected.
|
| 208 |
|
| 209 |
Any arguments past the ENCODED string are taken to define a hash of options:
|
| 210 |
|
| 211 |
=over 4
|
| 212 |
|
| 213 |
=item Charset
|
| 214 |
|
| 215 |
B<Improvement by this module>:
|
| 216 |
Name of character set by which data elements in scalar context
|
| 217 |
will be converted.
|
| 218 |
If this option is specified as special value C<"_UNICODE_">,
|
| 219 |
returned value will be Unicode string.
|
| 220 |
|
| 221 |
When Unicode/multibyte support is disabled
|
| 222 |
(see L<MIME::Charset/USE_ENCODE>),
|
| 223 |
this option will not have any effects.
|
| 224 |
|
| 225 |
B<Note>:
|
| 226 |
This feature is still information-lossy, I<except> when C<"_UNICODE_"> is
|
| 227 |
specified.
|
| 228 |
|
| 229 |
=item Detect7bit
|
| 230 |
|
| 231 |
B<Improvement by this modlue>:
|
| 232 |
Try to detect 7-bit charset on unencoded portions.
|
| 233 |
Default is C<"YES">.
|
| 234 |
When Unicode/multibyte support is disabled,
|
| 235 |
this option will not have any effects
|
| 236 |
(see L<MIME::Charset/USE_ENCODE>).
|
| 237 |
B<Note>:
|
| 238 |
This feature was introduced at release 1.000.
|
| 239 |
|
| 240 |
=item Field
|
| 241 |
|
| 242 |
Name of the mail field this string came from. I<Currently ignored.>
|
| 243 |
|
| 244 |
=back
|
| 245 |
|
| 246 |
B<Improvement by this module>:
|
| 247 |
Adjacent encoded-words with same charset will be concatenated
|
| 248 |
to handle multibyte sequences safely.
|
| 249 |
|
| 250 |
B<Change by this module>:
|
| 251 |
Malformed base64 encoded-words will be kept encoded.
|
| 252 |
In this case $@ will be set.
|
| 253 |
|
| 254 |
B<Compatibility with MIME::Words>:
|
| 255 |
Whitespaces surrounding unencoded data will not be stripped.
|
| 256 |
|
| 257 |
=cut
|
| 258 |
|
| 259 |
sub decode_mimewords {
|
| 260 |
my $encstr = shift;
|
| 261 |
my %params = @_;
|
| 262 |
my $cset = $params{"Charset"};
|
| 263 |
my $detect7bit = uc($params{'Detect7bit'} || "YES");
|
| 264 |
my @tokens;
|
| 265 |
$@ = ''; ### error-return
|
| 266 |
|
| 267 |
### Decode:
|
| 268 |
my ($word, $charset, $encoding, $enc, $dec);
|
| 269 |
my $spc = '';
|
| 270 |
pos($encstr) = 0;
|
| 271 |
while (1) {
|
| 272 |
last if (pos($encstr) >= length($encstr));
|
| 273 |
my $pos = pos($encstr); ### save it
|
| 274 |
|
| 275 |
### Case 1: are we looking at "=?..?..?="?
|
| 276 |
if ($encstr =~ m{\G # from where we left off..
|
| 277 |
=\?([^?]*) # "=?" + charset +
|
| 278 |
\?([bq]) # "?" + encoding +
|
| 279 |
\?([^?]+) # "?" + data maybe with spcs +
|
| 280 |
\?= # "?="
|
| 281 |
([\r\n\t ]*)
|
| 282 |
}xgi) {
|
| 283 |
($word, $charset, $encoding, $enc) = ($&, $1, lc($2), $3);
|
| 284 |
my $tspc = $4;
|
| 285 |
if ($encoding eq 'q') {
|
| 286 |
$dec = _decode_Q($enc);
|
| 287 |
} else {
|
| 288 |
$dec = _decode_B($enc);
|
| 289 |
}
|
| 290 |
unless (defined $dec) {
|
| 291 |
$@ .= qq|Illegal sequence in "$word" (pos $pos)\n|;
|
| 292 |
push @tokens, [$spc.$word];
|
| 293 |
$spc = '';
|
| 294 |
next;
|
| 295 |
}
|
| 296 |
|
| 297 |
if (scalar(@tokens) and
|
| 298 |
lc($charset) eq lc(${$tokens[-1]}[1]) and
|
| 299 |
resolve_alias($charset)) { # Concat words if possible.
|
| 300 |
${$tokens[-1]}[0] .= $dec;
|
| 301 |
} else {
|
| 302 |
push @tokens, [$dec, $charset];
|
| 303 |
}
|
| 304 |
$spc = $tspc;
|
| 305 |
next;
|
| 306 |
}
|
| 307 |
|
| 308 |
### Case 2: are we looking at a bad "=?..." prefix?
|
| 309 |
### We need this to detect problems for case 3, which stops at "=?":
|
| 310 |
pos($encstr) = $pos; # reset the pointer.
|
| 311 |
if ($encstr =~ m{\G=\?}xg) {
|
| 312 |
$@ .= qq|unterminated "=?..?..?=" in "$encstr" (pos $pos)\n|;
|
| 313 |
push @tokens, [$spc.'=?'];
|
| 314 |
$spc = '';
|
| 315 |
next;
|
| 316 |
}
|
| 317 |
|
| 318 |
### Case 3: are we looking at ordinary text?
|
| 319 |
pos($encstr) = $pos; # reset the pointer.
|
| 320 |
if ($encstr =~ m{\G # from where we left off...
|
| 321 |
(.*? # shortest possible string,
|
| 322 |
\n*) # followed by 0 or more NLs,
|
| 323 |
(?=(\Z|=\?)) # terminated by "=?" or EOS
|
| 324 |
}xgs) {
|
| 325 |
length($1) or croak "MIME::EncWords: internal logic err: empty token\n";
|
| 326 |
push @tokens, [$spc.$1];
|
| 327 |
$spc = '';
|
| 328 |
next;
|
| 329 |
}
|
| 330 |
|
| 331 |
### Case 4: bug!
|
| 332 |
croak "MIME::EncWords: unexpected case:\n($encstr) pos $pos\n\t".
|
| 333 |
"Please alert developer.\n";
|
| 334 |
}
|
| 335 |
push @tokens, [$spc] if $spc;
|
| 336 |
|
| 337 |
# Detect 7-bit charset
|
| 338 |
if ($detect7bit ne "NO") {
|
| 339 |
foreach my $t (@tokens) {
|
| 340 |
unless ($t->[1]) {
|
| 341 |
my $charset = &MIME::Charset::_detect_7bit_charset($t->[0]);
|
| 342 |
if ($charset and $charset ne &MIME::Charset::default()) {
|
| 343 |
$t->[0] =~ s/[\r\n\t ]+/ /g;
|
| 344 |
$t->[1] = $charset;
|
| 345 |
}
|
| 346 |
}
|
| 347 |
}
|
| 348 |
}
|
| 349 |
|
| 350 |
return (wantarray ? @tokens : join('',map {
|
| 351 |
&_convert($_->[0], $_->[1], $cset)
|
| 352 |
} @tokens));
|
| 353 |
}
|
| 354 |
|
| 355 |
#------------------------------
|
| 356 |
|
| 357 |
# _convert RAW, FROMCHARSET, TOCHARSET
|
| 358 |
# Private: used by encode_mimewords() to convert string by other charset
|
| 359 |
# or to decode to Unicode.
|
| 360 |
# When source charset is unknown and Unicode string is requested, at first
|
| 361 |
# try well-formed UTF-8 then fallback to ISO-8859-1 so that almost all
|
| 362 |
# non-ASCII bytes will be preserved.
|
| 363 |
sub _convert($$$) {
|
| 364 |
my $s = shift;
|
| 365 |
my $charset = shift || "";
|
| 366 |
my $cset = shift;
|
| 367 |
return $s unless MIME::Charset::USE_ENCODE;
|
| 368 |
return $s unless $cset;
|
| 369 |
return $s if uc($charset) eq uc($cset);
|
| 370 |
|
| 371 |
my $preserveerr = $@;
|
| 372 |
|
| 373 |
my $converted = $s;
|
| 374 |
if (is_utf8($s) or $s =~ $WIDECHAR) {
|
| 375 |
if ($cset ne "_UNICODE_") {
|
| 376 |
$converted = encode($cset, $converted);
|
| 377 |
}
|
| 378 |
} elsif ($cset eq "_UNICODE_") {
|
| 379 |
if (!resolve_alias($charset)) {
|
| 380 |
if ($s =~ $UNSAFE) {
|
| 381 |
$@ = '';
|
| 382 |
eval {
|
| 383 |
$converted = decode("UTF-8", $converted, FB_CROAK());
|
| 384 |
};
|
| 385 |
if ($@) {
|
| 386 |
$converted = $s;
|
| 387 |
$converted = decode("ISO-8859-1", $converted);
|
| 388 |
}
|
| 389 |
}
|
| 390 |
} else {
|
| 391 |
$converted = decode($charset, $converted);
|
| 392 |
}
|
| 393 |
} elsif (resolve_alias($charset)) {
|
| 394 |
from_to($converted, $charset, $cset);
|
| 395 |
}
|
| 396 |
|
| 397 |
$@ = $preserveerr;
|
| 398 |
return $converted;
|
| 399 |
}
|
| 400 |
|
| 401 |
#------------------------------
|
| 402 |
|
| 403 |
=item encode_mimeword RAW, [ENCODING], [CHARSET]
|
| 404 |
|
| 405 |
I<Function.>
|
| 406 |
Encode a single RAW "word" that has unsafe characters.
|
| 407 |
The "word" will be encoded in its entirety.
|
| 408 |
|
| 409 |
### Encode "<<Franc,ois>>":
|
| 410 |
$encoded = encode_mimeword("\xABFran\xE7ois\xBB");
|
| 411 |
|
| 412 |
You may specify the ENCODING (C<"Q"> or C<"B">), which defaults to C<"Q">.
|
| 413 |
B<Improvement by this module>:
|
| 414 |
You may also specify it as ``special'' value: C<"S"> to choose shorter
|
| 415 |
one of either C<"Q"> or C<"B">.
|
| 416 |
|
| 417 |
You may specify the CHARSET, which defaults to C<iso-8859-1>.
|
| 418 |
|
| 419 |
B<Change by this module>:
|
| 420 |
Spaces will be escaped with ``_'' by C<"Q"> encoding.
|
| 421 |
|
| 422 |
=cut
|
| 423 |
|
| 424 |
sub encode_mimeword {
|
| 425 |
my $word = shift;
|
| 426 |
my $encoding = uc(shift || 'Q');
|
| 427 |
my $charset = uc(shift || 'ISO-8859-1');
|
| 428 |
|
| 429 |
my $encstr;
|
| 430 |
if ($encoding eq 'Q') {
|
| 431 |
$encstr = &_encode_Q($word);
|
| 432 |
} elsif ($encoding eq "S") {
|
| 433 |
my ($B, $Q) = (&_encode_B($word), &_encode_Q($word));
|
| 434 |
if (length($B) < length($Q)) {
|
| 435 |
$encoding = "B";
|
| 436 |
$encstr = $B;
|
| 437 |
} else {
|
| 438 |
$encoding = "Q";
|
| 439 |
$encstr = $Q;
|
| 440 |
}
|
| 441 |
} else { # "B"
|
| 442 |
$encoding = "B";
|
| 443 |
$encstr = &_encode_B($word);
|
| 444 |
}
|
| 445 |
|
| 446 |
"=?$charset?$encoding?$encstr?=";
|
| 447 |
}
|
| 448 |
|
| 449 |
#------------------------------
|
| 450 |
|
| 451 |
=item encode_mimewords RAW, [OPTS]
|
| 452 |
|
| 453 |
I<Function.>
|
| 454 |
Given a RAW string, try to find and encode all "unsafe" sequences
|
| 455 |
of characters:
|
| 456 |
|
| 457 |
### Encode a string with some unsafe "words":
|
| 458 |
$encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB");
|
| 459 |
|
| 460 |
Returns the encoded string.
|
| 461 |
|
| 462 |
B<Improvement by this module>:
|
| 463 |
RAW may be a Unicode string when Unicode/multibyte support is enabled
|
| 464 |
(see L<MIME::Charset/USE_ENCODE>).
|
| 465 |
Furthermore, RAW may be a reference to that returned
|
| 466 |
by L<"decode_mimewords"> on array context. In latter case "Charset"
|
| 467 |
option (see below) will be overridden (see also notes below).
|
| 468 |
|
| 469 |
Any arguments past the RAW string are taken to define a hash of options:
|
| 470 |
|
| 471 |
=over 4
|
| 472 |
|
| 473 |
=item Charset
|
| 474 |
|
| 475 |
Encode all unsafe stuff with this charset. Default is 'ISO-8859-1',
|
| 476 |
a.k.a. "Latin-1".
|
| 477 |
|
| 478 |
=item Detect7bit
|
| 479 |
|
| 480 |
B<Improvement by this modlue>:
|
| 481 |
When "Encoding" option (see below) is specified as C<"a"> and "Charset"
|
| 482 |
option is unknown, try to detect 7-bit charset on given RAW string.
|
| 483 |
Default is C<"YES">.
|
| 484 |
When Unicode/multibyte support is disabled,
|
| 485 |
this option will not have any effects
|
| 486 |
(see L<MIME::Charset/USE_ENCODE>).
|
| 487 |
|
| 488 |
=item Encoding
|
| 489 |
|
| 490 |
The encoding to use, C<"q"> or C<"b">. The default is C<"q">.
|
| 491 |
B<Improvement by this module>:
|
| 492 |
You may also specify ``special'' values: C<"a"> will automatically choose
|
| 493 |
recommended encoding to use (with charset conversion if alternative
|
| 494 |
charset is recommended: see L<MIME::Charset>);
|
| 495 |
C<"s"> will choose shorter one of either C<"q"> or C<"b">.
|
| 496 |
|
| 497 |
=item Field
|
| 498 |
|
| 499 |
Name of the mail field this string will be used in.
|
| 500 |
B<Improvement by this module>:
|
| 501 |
Length of mail field name will be considered in the first line of
|
| 502 |
encoded header.
|
| 503 |
|
| 504 |
=item Minimal
|
| 505 |
|
| 506 |
B<Improvement by this module>:
|
| 507 |
Takes care of natural word separators (i.e. whitespaces)
|
| 508 |
in the text to be encoded.
|
| 509 |
If C<"NO"> is specified, this module will encode whole text
|
| 510 |
(if encoding needed) not regarding whitespaces;
|
| 511 |
encoded-words exceeding line length will be splitted based only on their
|
| 512 |
lengths.
|
| 513 |
Default is C<"YES">.
|
| 514 |
|
| 515 |
B<Note>:
|
| 516 |
As of release 0.040, default has been changed to C<"YES"> to ensure
|
| 517 |
compatibility with MIME::Words.
|
| 518 |
On earlier releases, this option was fixed to be C<"NO">.
|
| 519 |
|
| 520 |
=item Replacement
|
| 521 |
|
| 522 |
B<Improvement by this module>:
|
| 523 |
See L<MIME::Charset/ERROR HANDLING>.
|
| 524 |
B<Note>:
|
| 525 |
This feature was introduced at release 1.000.
|
| 526 |
|
| 527 |
=back
|
| 528 |
|
| 529 |
B<Notes on improvement by this module>:
|
| 530 |
When RAW is an arrayref,
|
| 531 |
adjacent encoded-words are concatenated. Then they are splitted taking
|
| 532 |
care of character boundaries of multibyte sequences, when Unicode/multibyte
|
| 533 |
support is enabled.
|
| 534 |
Portions for unencoded data should include surrounding whitespace(s), or
|
| 535 |
they will be merged into adjoining encoded word(s).
|
| 536 |
|
| 537 |
=cut
|
| 538 |
|
| 539 |
sub encode_mimewords {
|
| 540 |
my $words = shift;
|
| 541 |
my %params = @_;
|
| 542 |
my $charset = uc($params{'Charset'});
|
| 543 |
my $detect7bit = uc($params{'Detect7bit'} || "YES");
|
| 544 |
my $encoding = uc($params{'Encoding'});
|
| 545 |
my $header_name = $params{'Field'};
|
| 546 |
my $minimal = uc($params{'Minimal'} || "YES");
|
| 547 |
my $replacement = uc($params{'Replacement'} || 'DEFAULT');
|
| 548 |
my $firstlinelen = $MAXLINELEN;
|
| 549 |
if ($header_name) {
|
| 550 |
$firstlinelen -= length($header_name.': ');
|
| 551 |
}
|
| 552 |
|
| 553 |
unless (ref($words) eq "ARRAY") {
|
| 554 |
if ($minimal eq "YES") {
|
| 555 |
my @words = map {[$_, $charset]} split(/((?:\A|[\t ])[\t \x21-\x7E]+(?:[\t ]|\Z))/, $words);
|
| 556 |
$words = \@words;
|
| 557 |
} else {
|
| 558 |
$words = [[$words, $charset]];
|
| 559 |
}
|
| 560 |
}
|
| 561 |
|
| 562 |
# Translate / concatenate words.
|
| 563 |
my @triplets;
|
| 564 |
foreach (@$words) {
|
| 565 |
my ($s, $cset) = @$_;
|
| 566 |
my $enc;
|
| 567 |
|
| 568 |
next unless length($s);
|
| 569 |
|
| 570 |
# Unicode string should be encoded by given charset.
|
| 571 |
# Unsupported charset will be fallbacked to UTF-8.
|
| 572 |
if (is_utf8($s) or $s =~ $WIDECHAR) {
|
| 573 |
unless (resolve_alias($cset)) {
|
| 574 |
if ($s !~ $UNSAFE) {
|
| 575 |
$cset = "US-ASCII";
|
| 576 |
} elsif ($replacement =~ '^(CROAK|STRICT)$') {
|
| 577 |
croak "MIME::EncWords: unsupported charset: $cset\n";
|
| 578 |
} else {
|
| 579 |
$cset = "UTF-8";
|
| 580 |
}
|
| 581 |
}
|
| 582 |
if ($replacement =~ /^(CROAK|STRICT)$/) {
|
| 583 |
$s = encode($cset, $s, FB_CROAK());
|
| 584 |
} else {
|
| 585 |
$s = encode($cset, $s);
|
| 586 |
}
|
| 587 |
}
|
| 588 |
|
| 589 |
# Determine charset and encoding.
|
| 590 |
if ($encoding eq "A") {
|
| 591 |
($s, $cset, $enc) =
|
| 592 |
header_encode($s, $cset || $charset,
|
| 593 |
Detect7bit => $detect7bit,
|
| 594 |
Replacement => $replacement);
|
| 595 |
} else {
|
| 596 |
$cset ||= ($charset || ($s !~ $UNSAFE)? "US-ASCII": "ISO-8859-1");
|
| 597 |
my @spec =
|
| 598 |
header_encode($s, $cset,
|
| 599 |
Detect7bit => $detect7bit,
|
| 600 |
Replacement => $replacement);
|
| 601 |
if ($spec[0] eq $s and $spec[1] eq "US-ASCII" and !$spec[2]) {
|
| 602 |
($cset, $enc) = ("US-ASCII", undef);
|
| 603 |
} else {
|
| 604 |
$enc = $encoding || "Q";
|
| 605 |
}
|
| 606 |
}
|
| 607 |
|
| 608 |
# Concatenate adjacent ``words'' so that multibyte sequences will
|
| 609 |
# be handled safely.
|
| 610 |
# Note: Encoded-word and unencoded text must not adjoin without
|
| 611 |
# separating whitespace(s).
|
| 612 |
if (scalar(@triplets)) {
|
| 613 |
my ($last, $lastenc, $lastcset) = @{$triplets[-1]};
|
| 614 |
if (uc($lastcset) eq uc($cset) and uc($lastenc) eq uc($enc) and
|
| 615 |
resolve_alias($cset)) {
|
| 616 |
${$triplets[-1]}[0] .= $s;
|
| 617 |
next;
|
| 618 |
} elsif (!$lastenc and $enc and $last !~ /[\t ]$/) {
|
| 619 |
if ($last =~ /^(.*)[\t ]([$PRINTABLE]+)$/s) {
|
| 620 |
${$triplets[-1]}[0] = $1." ";
|
| 621 |
$s = $2.$s;
|
| 622 |
} elsif (uc($lastcset) eq "US-ASCII") {
|
| 623 |
${$triplets[-1]}[0] .= $s;
|
| 624 |
${$triplets[-1]}[1] = $enc;
|
| 625 |
${$triplets[-1]}[2] = $cset;
|
| 626 |
next;
|
| 627 |
}
|
| 628 |
} elsif ($lastenc and !$enc and $s !~ /^[\t ]/) {
|
| 629 |
if ($s =~ /^([$PRINTABLE]+)[\t ](.*)$/s) {
|
| 630 |
${$triplets[-1]}[0] .= $1;
|
| 631 |
$s = " ".$2;
|
| 632 |
} elsif (uc($cset) eq "US-ASCII") {
|
| 633 |
${$triplets[-1]}[0] .= $s;
|
| 634 |
next;
|
| 635 |
}
|
| 636 |
}
|
| 637 |
}
|
| 638 |
push @triplets, [$s, $enc, $cset];
|
| 639 |
}
|
| 640 |
|
| 641 |
# Split long ``words''.
|
| 642 |
my @splitted;
|
| 643 |
my $restlen = $firstlinelen;
|
| 644 |
my $lastlen = 0;
|
| 645 |
foreach (@triplets) {
|
| 646 |
my ($s, $enc, $cset) = @$_;
|
| 647 |
|
| 648 |
my $restlen = $restlen - $lastlen - 1;
|
| 649 |
if ($restlen < ($enc? encoded_header_len('', $enc, $cset): 1)) {
|
| 650 |
$restlen = $MAXLINELEN - 1;
|
| 651 |
}
|
| 652 |
|
| 653 |
push @splitted, &_split($s, $enc, $cset, $restlen);
|
| 654 |
my ($last, $lastenc, $lastcset) = @{$splitted[-1]};
|
| 655 |
if ($lastenc) {
|
| 656 |
$lastlen = encoded_header_len($last, $lastenc, $lastcset);
|
| 657 |
} else {
|
| 658 |
$lastlen = length($last);
|
| 659 |
}
|
| 660 |
}
|
| 661 |
|
| 662 |
# Do encoding.
|
| 663 |
my @lines;
|
| 664 |
my $linelen = $firstlinelen;
|
| 665 |
foreach (@splitted) {
|
| 666 |
my ($str, $encoding, $charset) = @$_;
|
| 667 |
next unless length($str);
|
| 668 |
|
| 669 |
my $s;
|
| 670 |
if (!$encoding) {
|
| 671 |
$s = $str;
|
| 672 |
} else {
|
| 673 |
$s = &encode_mimeword($str, $encoding, $charset);
|
| 674 |
}
|
| 675 |
|
| 676 |
my $spc = (scalar(@lines) and $lines[-1] =~ /[\t ]$/)? '': ' ';
|
| 677 |
if (!scalar(@lines)) {
|
| 678 |
$s =~ s/^[\r\n\t ]+//;
|
| 679 |
push @lines, $s;
|
| 680 |
} elsif (length($lines[-1]) + length($s) <= $linelen) {
|
| 681 |
$lines[-1] .= $spc.$s;
|
| 682 |
} else {
|
| 683 |
$s =~ s/^[\r\n\t ]+//;
|
| 684 |
push @lines, $s;
|
| 685 |
$linelen = $MAXLINELEN - 1;
|
| 686 |
}
|
| 687 |
}
|
| 688 |
|
| 689 |
join("\n ", @lines);
|
| 690 |
}
|
| 691 |
|
| 692 |
#------------------------------
|
| 693 |
|
| 694 |
# _split RAW, ENCODING, CHARSET, ROOM_OF_FIRST_LINE
|
| 695 |
# Private: used by encode_mimewords() to split a string into
|
| 696 |
# (encoded or non-encoded) words.
|
| 697 |
# Returns an array of arrayrefs [SUBSTRING, ENCODING, CHARSET].
|
| 698 |
sub _split {
|
| 699 |
my $str = shift;
|
| 700 |
my $encoding = shift;
|
| 701 |
my $charset = shift;
|
| 702 |
my $restlen = shift;
|
| 703 |
|
| 704 |
if (!$charset or $charset eq '8BIT') { # Undecodable.
|
| 705 |
$str =~ s/[\r\n]+[\t ]*|\x00/ /g; # Eliminate hostile characters.
|
| 706 |
return ([$str, undef, $charset]);
|
| 707 |
}
|
| 708 |
unless (resolve_alias($charset)) { # Unsupported charset.
|
| 709 |
return ([$str, $encoding, $charset]);
|
| 710 |
}
|
| 711 |
if (!$encoding and $charset eq 'US-ASCII') {
|
| 712 |
return &_split_ascii($str, $restlen);
|
| 713 |
}
|
| 714 |
|
| 715 |
my (@splitted, $ustr, $first);
|
| 716 |
while (length($str)) {
|
| 717 |
if (encoded_header_len($str, $encoding, $charset) <= $restlen) {
|
| 718 |
push @splitted, [$str, $encoding, $charset];
|
| 719 |
last;
|
| 720 |
}
|
| 721 |
$ustr = $str;
|
| 722 |
$ustr = decode($charset, $ustr);
|
| 723 |
($first, $str) = &_clip_unsafe($ustr, $encoding, $charset, $restlen);
|
| 724 |
push @splitted, [$first, $encoding, $charset];
|
| 725 |
$restlen = $MAXLINELEN - 1;
|
| 726 |
}
|
| 727 |
return @splitted;
|
| 728 |
}
|
| 729 |
|
| 730 |
# _split_ascii RAW, ROOM_OF_FIRST_LINE
|
| 731 |
# Private: used by encode_mimewords() to split an US-ASCII string into
|
| 732 |
# (encoded or non-encoded) words.
|
| 733 |
# Returns an array of arrayrefs [SUBSTRING, ENCODING, "US-ASCII"],
|
| 734 |
# where ENCODING is either undef or (if any unsafe sequences are
|
| 735 |
# included) "Q".
|
| 736 |
sub _split_ascii {
|
| 737 |
my $s = shift;
|
| 738 |
my $restlen = shift || $MAXLINELEN - 1;
|
| 739 |
|
| 740 |
my @splitted;
|
| 741 |
foreach my $line (split(/[\r\n]+/, $s)) {
|
| 742 |
$line =~ s/^[\t ]+//;
|
| 743 |
|
| 744 |
if (length($line) < $restlen and $line !~ /=\?|$UNSAFE/) {
|
| 745 |
push @splitted, [$line, undef, "US-ASCII"];
|
| 746 |
$restlen = $MAXLINELEN - 1;
|
| 747 |
next;
|
| 748 |
}
|
| 749 |
|
| 750 |
my ($spc, $enc);
|
| 751 |
foreach my $word (split(/([\t ]+)/, $line)) {
|
| 752 |
if ($word =~ /[\t ]/) {
|
| 753 |
$spc = $word;
|
| 754 |
next;
|
| 755 |
}
|
| 756 |
|
| 757 |
$enc = ($word =~ /=\?|$UNSAFE/)? "Q": undef;
|
| 758 |
if (scalar(@splitted)) {
|
| 759 |
my ($last, $lastenc, $lastcset) = @{$splitted[-1]};
|
| 760 |
my ($elen, $cont, $appe);
|
| 761 |
|
| 762 |
# Concatenate adjacent words so that encoded-word and
|
| 763 |
# unencoded text will adjoin with separating whitespace.
|
| 764 |
if (!$lastenc and !$enc) {
|
| 765 |
$elen = length($spc.$word);
|
| 766 |
($cont, $appe) = ($spc.$word, "");
|
| 767 |
} elsif (!$lastenc and $enc) {
|
| 768 |
$elen = length($spc) +
|
| 769 |
encoded_header_len($word, "Q", "US-ASCII");
|
| 770 |
($cont, $appe) = ($spc, $word);
|
| 771 |
} elsif ($lastenc and !$enc) {
|
| 772 |
$elen = length($spc.$word);
|
| 773 |
($cont, $appe) = ("", $spc.$word);
|
| 774 |
} else {
|
| 775 |
$elen = encoded_header_len($spc.$word, "Q",
|
| 776 |
"US-ASCII") - 15;
|
| 777 |
($cont, $appe) = ($spc.$word, "");
|
| 778 |
}
|
| 779 |
|
| 780 |
if ($elen <= $restlen) {
|
| 781 |
${$splitted[-1]}[0] .= $cont if length($cont);
|
| 782 |
push @splitted, [$appe, $enc, "US-ASCII"] if length($appe);
|
| 783 |
$restlen -= $elen;
|
| 784 |
next;
|
| 785 |
}
|
| 786 |
$restlen = $MAXLINELEN - 1;
|
| 787 |
}
|
| 788 |
push @splitted, [$word, $enc, "US-ASCII"];
|
| 789 |
$restlen -= ($enc?
|
| 790 |
encoded_header_len($word, "Q", "US-ASCII"):
|
| 791 |
length($word));
|
| 792 |
}
|
| 793 |
}
|
| 794 |
return @splitted;
|
| 795 |
}
|
| 796 |
|
| 797 |
# _clip_unsafe UNICODE, ENCODING, CHARSET, ROOM_OF_FIRST_LINE
|
| 798 |
# Private: used by encode_mimewords() to bite off one encodable
|
| 799 |
# ``word'' from a Unicode string.
|
| 800 |
sub _clip_unsafe {
|
| 801 |
my $ustr = shift;
|
| 802 |
my $encoding = shift;
|
| 803 |
my $charset = shift;
|
| 804 |
my $restlen = shift;
|
| 805 |
return ("", "") unless length($ustr);
|
| 806 |
|
| 807 |
# Seek maximal division point.
|
| 808 |
my ($shorter, $longer) = (0, length($ustr));
|
| 809 |
while ($shorter < $longer) {
|
| 810 |
my $cur = int(($shorter + $longer + 1) / 2);
|
| 811 |
my $enc = encode($charset, substr($ustr, 0, $cur));
|
| 812 |
my $elen = encoded_header_len($enc, $encoding, $charset);
|
| 813 |
if ($elen <= $restlen) {
|
| 814 |
$shorter = $cur;
|
| 815 |
} else {
|
| 816 |
$longer = $cur - 1;
|
| 817 |
}
|
| 818 |
}
|
| 819 |
|
| 820 |
# Make sure that combined characters won't be divided.
|
| 821 |
my ($fenc, $renc);
|
| 822 |
my $max = length($ustr);
|
| 823 |
while (1) {
|
| 824 |
$@ = '';
|
| 825 |
eval {
|
| 826 |
($fenc, $renc) =
|
| 827 |
(substr($ustr, 0, $shorter), substr($ustr, $shorter));
|
| 828 |
$fenc = encode($charset, $fenc, FB_CROAK());
|
| 829 |
$renc = encode($charset, $renc, FB_CROAK());
|
| 830 |
};
|
| 831 |
last unless ($@);
|
| 832 |
|
| 833 |
$shorter++;
|
| 834 |
unless ($shorter < $max) { # Unencodable characters are included.
|
| 835 |
return (encode($charset, $ustr), "");
|
| 836 |
}
|
| 837 |
}
|
| 838 |
|
| 839 |
if (length($fenc)) {
|
| 840 |
return ($fenc, $renc);
|
| 841 |
} else {
|
| 842 |
return ($renc, "");
|
| 843 |
}
|
| 844 |
}
|
| 845 |
|
| 846 |
#------------------------------
|
| 847 |
|
| 848 |
=back
|
| 849 |
|
| 850 |
=head1 VERSION
|
| 851 |
|
| 852 |
Consult $VERSION variable.
|
| 853 |
|
| 854 |
Development versions of this module may be found at
|
| 855 |
L<http://hatuka.nezumi.nu/repos/MIME-EncWords/>.
|
| 856 |
|
| 857 |
=head1 SEE ALSO
|
| 858 |
|
| 859 |
L<MIME::Charset>,
|
| 860 |
L<MIME::Tools>
|
| 861 |
|
| 862 |
=head1 AUTHORS
|
| 863 |
|
| 864 |
The original version of function decode_mimewords() is derived from
|
| 865 |
L<MIME::Words> module that was written by:
|
| 866 |
Eryq (F<eryq@zeegee.com>), ZeeGee Software Inc (F<http://www.zeegee.com>).
|
| 867 |
David F. Skoll (dfs@roaringpenguin.com) http://www.roaringpenguin.com
|
| 868 |
|
| 869 |
Other stuff are rewritten or added by:
|
| 870 |
Hatuka*nezumi - IKEDA Soji <hatuka(at)nezumi.nu>.
|
| 871 |
|
| 872 |
All rights reserved. This program is free software; you can redistribute
|
| 873 |
it and/or modify it under the same terms as Perl itself.
|
| 874 |
|
| 875 |
=cut
|
| 876 |
|
| 877 |
1;
|