| 1 |
|
| 2 |
package MIME::Charset;
|
| 3 |
use 5.005;
|
| 4 |
|
| 5 |
=head1 NAME
|
| 6 |
|
| 7 |
MIME::Charset - Charset Informations for MIME
|
| 8 |
|
| 9 |
=head1 SYNOPSIS
|
| 10 |
|
| 11 |
Getting charset informations:
|
| 12 |
|
| 13 |
use MIME::Charset qw(:info);
|
| 14 |
|
| 15 |
$benc = body_encoding("iso-8859-2"); # "Q"
|
| 16 |
$cset = canonical_charset("ANSI X3.4-1968"); # "US-ASCII"
|
| 17 |
$henc = header_encoding("utf-8"); # "S"
|
| 18 |
$cset = output_charset("shift_jis"); # "ISO-2022-JP"
|
| 19 |
|
| 20 |
Translating text data:
|
| 21 |
|
| 22 |
use MIME::Charset qw(:trans);
|
| 23 |
|
| 24 |
($text, $charset, $encoding) =
|
| 25 |
header_encode(
|
| 26 |
"\xc9\xc2\xc5\xaa\xc0\xde\xc3\xef\xc5\xaa".
|
| 27 |
"\xc7\xd1\xca\xaa\xbd\xd0\xce\xcf\xb4\xef",
|
| 28 |
"euc-jp");
|
| 29 |
# ...returns (<converted>, "ISO-2022-JP", "B");
|
| 30 |
|
| 31 |
($text, $charset, $encoding) =
|
| 32 |
body_encode(
|
| 33 |
"Collectioneur path\xe9tiquement ".
|
| 34 |
"\xe9clectique de d\xe9chets",
|
| 35 |
"latin1");
|
| 36 |
# ...returns (<original>, "ISO-8859-1", "QUOTED-PRINTABLE");
|
| 37 |
|
| 38 |
$len = encoded_header_len(
|
| 39 |
"Perl\xe8\xa8\x80\xe8\xaa\x9e", "b", "utf-8"); # 28
|
| 40 |
|
| 41 |
Manipulating module defaults:
|
| 42 |
|
| 43 |
use MIME::Charset;
|
| 44 |
|
| 45 |
MIME::Charset::alias("csEUCKR", "euc-kr");
|
| 46 |
MIME::Charset::default("iso-8859-1");
|
| 47 |
MIME::Charset::fallback("us-ascii");
|
| 48 |
|
| 49 |
=head1 DESCRIPTION
|
| 50 |
|
| 51 |
MIME::Charset provides informations about character sets used for
|
| 52 |
MIME messages on Internet.
|
| 53 |
|
| 54 |
=head2 DEFINITIONS
|
| 55 |
|
| 56 |
The B<charset> is ``character set'' used in MIME to refer to a
|
| 57 |
method of converting a sequence of octets into a sequence of characters.
|
| 58 |
It includes both concepts of ``coded character set'' (CCS) and
|
| 59 |
``character encoding scheme'' (CES) of ISO/IEC.
|
| 60 |
|
| 61 |
The B<encoding> is that used in MIME to refer to a method of representing
|
| 62 |
a body part or a header body as sequence(s) of printable US-ASCII
|
| 63 |
characters.
|
| 64 |
|
| 65 |
=over 4
|
| 66 |
|
| 67 |
=cut
|
| 68 |
|
| 69 |
use strict;
|
| 70 |
use vars qw(@ISA $VERSION @EXPORT @EXPORT_OK %EXPORT_TAGS);
|
| 71 |
use Exporter;
|
| 72 |
@ISA = qw(Exporter);
|
| 73 |
@EXPORT = qw(body_encoding canonical_charset header_encoding output_charset
|
| 74 |
body_encode encoded_header_len header_encode);
|
| 75 |
@EXPORT_OK = qw(alias default fallback recommended);
|
| 76 |
%EXPORT_TAGS = (
|
| 77 |
"info" => [qw(body_encoding header_encoding
|
| 78 |
canonical_charset output_charset)],
|
| 79 |
"trans" =>[ qw(body_encode encoded_header_len
|
| 80 |
header_encode)],
|
| 81 |
);
|
| 82 |
use Carp qw(croak);
|
| 83 |
|
| 84 |
use constant USE_ENCODE => ($] >= 5.008001)? 'Encode': '';
|
| 85 |
|
| 86 |
my @ENCODE_SUBS = qw(FB_CROAK FB_PERLQQ FB_HTMLCREF FB_XMLCREF
|
| 87 |
decode encode from_to is_utf8 resolve_alias);
|
| 88 |
if (USE_ENCODE) {
|
| 89 |
eval "use ".USE_ENCODE." \@ENCODE_SUBS;";
|
| 90 |
} else {
|
| 91 |
require MIME::Charset::_Compat;
|
| 92 |
for my $sub (@ENCODE_SUBS) {
|
| 93 |
no strict "refs";
|
| 94 |
*{$sub} = \&{"MIME::Charset::_Compat::$sub"};
|
| 95 |
}
|
| 96 |
}
|
| 97 |
|
| 98 |
$VERSION = '0.04.1';
|
| 99 |
|
| 100 |
######## Private Attributes ########
|
| 101 |
|
| 102 |
my $DEFAULT_CHARSET = 'US-ASCII';
|
| 103 |
my $FALLBACK_CHARSET = 'UTF-8';
|
| 104 |
|
| 105 |
# This table was borrwed from Python email package.
|
| 106 |
|
| 107 |
my %CHARSETS = (# input header enc body enc output conv
|
| 108 |
'ISO-8859-1' => ['Q', 'Q', undef],
|
| 109 |
'ISO-8859-2' => ['Q', 'Q', undef],
|
| 110 |
'ISO-8859-3' => ['Q', 'Q', undef],
|
| 111 |
'ISO-8859-4' => ['Q', 'Q', undef],
|
| 112 |
# ISO-8859-5 is Cyrillic, and not especially used
|
| 113 |
# ISO-8859-6 is Arabic, also not particularly used
|
| 114 |
# ISO-8859-7 is Greek, 'Q' will not make it readable
|
| 115 |
# ISO-8859-8 is Hebrew, 'Q' will not make it readable
|
| 116 |
'ISO-8859-9' => ['Q', 'Q', undef],
|
| 117 |
'ISO-8859-10' => ['Q', 'Q', undef],
|
| 118 |
# ISO-8859-11 is Thai, 'Q' will not make it readable
|
| 119 |
'ISO-8859-13' => ['Q', 'Q', undef],
|
| 120 |
'ISO-8859-14' => ['Q', 'Q', undef],
|
| 121 |
'ISO-8859-15' => ['Q', 'Q', undef],
|
| 122 |
'WINDOWS-1252' => ['Q', 'Q', undef],
|
| 123 |
'VISCII' => ['Q', 'Q', undef],
|
| 124 |
'US-ASCII' => [undef, undef, undef],
|
| 125 |
'BIG5' => ['B', 'B', undef],
|
| 126 |
'GB2312' => ['B', 'B', undef],
|
| 127 |
'EUC-JP' => ['B', undef, 'ISO-2022-JP'],
|
| 128 |
'SHIFT_JIS' => ['B', undef, 'ISO-2022-JP'],
|
| 129 |
'ISO-2022-JP' => ['B', undef, undef],
|
| 130 |
'KOI8-R' => ['B', 'B', undef],
|
| 131 |
'UTF-8' => ['S', 'B', undef],
|
| 132 |
# We're making this one up to represent raw unencoded 8bit
|
| 133 |
'8BIT' => [undef, 'B', 'ISO-8859-1'],
|
| 134 |
);
|
| 135 |
|
| 136 |
# Fix some unexpected or unpreferred names returned by
|
| 137 |
# Encode::resolve_alias() or used by somebodies else.
|
| 138 |
my %CHARSET_ALIASES = (# unpreferred preferred
|
| 139 |
"ASCII" => "US-ASCII",
|
| 140 |
"BIG5-ETEN" => "BIG5",
|
| 141 |
"CP1251" => "WINDOWS-1251",
|
| 142 |
"CP1252" => "WINDOWS-1252",
|
| 143 |
"CP936" => "GBK",
|
| 144 |
"CP949" => "KS_C_5601-1987",
|
| 145 |
"EUC-CN" => "GB2312",
|
| 146 |
"KS_C_5601" => "KS_C_5601-1987",
|
| 147 |
"SHIFTJIS" => "SHIFT_JIS",
|
| 148 |
"SHIFTJISX0213" => "SHIFT_JISX0213",
|
| 149 |
"UNICODE-1-1-UTF-7" => "UTF-7",
|
| 150 |
"UTF8" => "UTF-8",
|
| 151 |
);
|
| 152 |
|
| 153 |
# ISO-2022-* escape sequnces to detect charset from unencoded data.
|
| 154 |
my @ISO2022_SEQ = (# escape seq possible charset
|
| 155 |
# Following sequences are commonly used.
|
| 156 |
["\033\$\@", "ISO-2022-JP"], # RFC 1468
|
| 157 |
["\033\$B", "ISO-2022-JP"], # ditto
|
| 158 |
["\033(J", "ISO-2022-JP"], # ditto
|
| 159 |
["\033(I", "ISO-2022-JP"], # ditto (nonstandard)
|
| 160 |
["\033\$(D", "ISO-2022-JP"], # RFC 2237 (note*)
|
| 161 |
# Folloing sequences are less commonly used.
|
| 162 |
["\033\$)C", "ISO-2022-KR"], # RFC 1557
|
| 163 |
["\033\$)A", "ISO-2022-CN"], # RFC 1922
|
| 164 |
["\033\$A", "ISO-2022-CN"], # ditto (nonstandard)
|
| 165 |
["\033\$)G", "ISO-2022-CN"], # ditto
|
| 166 |
["\033\$*H", "ISO-2022-CN"], # ditto
|
| 167 |
# Other sequences will be used with appropriate charset
|
| 168 |
# parameters, or hardly used.
|
| 169 |
);
|
| 170 |
|
| 171 |
# note*: This RFC defines ISO-2022-JP-1, superset of
|
| 172 |
# ISO-2022-JP. But that charset name is rarely used.
|
| 173 |
# OTOH many of codecs for ISO-2022-JP recognize this
|
| 174 |
# sequence so that comatibility with EUC-JP will be
|
| 175 |
# guaranteed.
|
| 176 |
|
| 177 |
######## Private Constants ########
|
| 178 |
|
| 179 |
my $NONASCIIRE = qr{
|
| 180 |
[^\x01-\x7e]
|
| 181 |
}x;
|
| 182 |
|
| 183 |
my $ISO2022RE = qr{
|
| 184 |
^ISO-2022-
|
| 185 |
}ix;
|
| 186 |
|
| 187 |
|
| 188 |
######## Public Functions ########
|
| 189 |
|
| 190 |
=head2 GETTING INFORMATIONS OF CHARSETS
|
| 191 |
|
| 192 |
=item body_encoding CHARSET
|
| 193 |
|
| 194 |
Get recommended transfer-encoding of CHARSET for message body.
|
| 195 |
|
| 196 |
Returned value will be one of C<"B"> (BASE64), C<"Q"> (QUOTED-PRINTABLE) or
|
| 197 |
C<undef> (might not be transfer-encoded; either 7BIT or 8BIT). This may
|
| 198 |
not be same as encoding for message header.
|
| 199 |
|
| 200 |
=cut
|
| 201 |
|
| 202 |
sub body_encoding($) {
|
| 203 |
my $charset = shift;
|
| 204 |
return undef unless $charset;
|
| 205 |
return (&recommended($charset))[1];
|
| 206 |
}
|
| 207 |
|
| 208 |
=item canonical_charset CHARSET
|
| 209 |
|
| 210 |
Get canonical name for charset CHARSET.
|
| 211 |
|
| 212 |
=cut
|
| 213 |
|
| 214 |
sub canonical_charset($) {
|
| 215 |
my $charset = shift;
|
| 216 |
return undef unless $charset;
|
| 217 |
my $cset = resolve_alias($charset) || $charset;
|
| 218 |
return $CHARSET_ALIASES{uc($cset)} || uc($cset);
|
| 219 |
}
|
| 220 |
|
| 221 |
=item header_encoding CHARSET
|
| 222 |
|
| 223 |
Get recommended encoding scheme of CHARSET for message header.
|
| 224 |
|
| 225 |
Returned value will be one of C<"B">, C<"Q">, C<"S"> (shorter one of either)
|
| 226 |
or C<undef> (might not be encoded). This may not be same as encoding
|
| 227 |
for message body.
|
| 228 |
|
| 229 |
=cut
|
| 230 |
|
| 231 |
sub header_encoding($) {
|
| 232 |
my $charset = shift;
|
| 233 |
return undef unless $charset;
|
| 234 |
return (&recommended($charset))[0];
|
| 235 |
}
|
| 236 |
|
| 237 |
=item output_charset CHARSET
|
| 238 |
|
| 239 |
Get a charset which is compatible with given CHARSET and is recommended
|
| 240 |
to be used for MIME messages on Internet (if it is known by this module).
|
| 241 |
|
| 242 |
When Unicode/multibyte support is disabled (see L<"USE_ENCODE">),
|
| 243 |
this function will simply
|
| 244 |
return the result of L<"canonical_charset">.
|
| 245 |
|
| 246 |
=cut
|
| 247 |
|
| 248 |
sub output_charset($) {
|
| 249 |
my $charset = shift;
|
| 250 |
return undef unless $charset;
|
| 251 |
return (&recommended($charset))[2] || uc($charset);
|
| 252 |
}
|
| 253 |
|
| 254 |
=head2 TRANSLATING TEXT DATA
|
| 255 |
|
| 256 |
=item body_encode STRING, CHARSET [, OPTS]
|
| 257 |
|
| 258 |
Get converted (if needed) data of STRING and recommended transfer-encoding
|
| 259 |
of that data for message body. CHARSET is the charset by which STRING
|
| 260 |
is encoded.
|
| 261 |
|
| 262 |
OPTS may accept following key-value pairs.
|
| 263 |
B<NOTE>:
|
| 264 |
When Unicode/multibyte support is disabled (see L<"USE_ENCODE">),
|
| 265 |
conversion will not be performed. So these options do not have any effects.
|
| 266 |
|
| 267 |
=over 4
|
| 268 |
|
| 269 |
=item Replacement => REPLACEMENT
|
| 270 |
|
| 271 |
Specifies error handling scheme. See L<"ERROR HANDLING">.
|
| 272 |
|
| 273 |
=item Detect7bit => YESNO
|
| 274 |
|
| 275 |
Try auto-detecting 7-bit charset when CHARSET is not given.
|
| 276 |
Default is C<"YES">.
|
| 277 |
|
| 278 |
=back
|
| 279 |
|
| 280 |
3-item list of (I<converted string>, I<charset for output>,
|
| 281 |
I<transfer-encoding>) will be returned.
|
| 282 |
I<Transfer-encoding> will be either C<"BASE64">, C<"QUOTED-PRINTABLE">,
|
| 283 |
C<"7BIT"> or C<"8BIT">. If I<charset for output> could not be determined
|
| 284 |
and I<converted string> contains non-ASCII byte(s), I<charset for output> will
|
| 285 |
be C<undef> and I<transfer-encoding> will be C<"BASE64">.
|
| 286 |
I<Charset for output> will be C<"US-ASCII"> if and only if string does not
|
| 287 |
contain any non-ASCII bytes.
|
| 288 |
|
| 289 |
=cut
|
| 290 |
|
| 291 |
sub body_encode {
|
| 292 |
my ($encoded, $charset, $cset) = &_text_encode(@_);
|
| 293 |
|
| 294 |
# Determine transfer-encoding.
|
| 295 |
my $enc;
|
| 296 |
my $dummy = $encoded;
|
| 297 |
eval {
|
| 298 |
from_to($dummy, $cset, "US-ASCII", FB_CROAK());
|
| 299 |
};
|
| 300 |
if (!$@ and $dummy eq $encoded) {
|
| 301 |
$cset = "US-ASCII";
|
| 302 |
$enc = undef;
|
| 303 |
} else {
|
| 304 |
$@ = '';
|
| 305 |
$enc = &body_encoding($charset);
|
| 306 |
}
|
| 307 |
|
| 308 |
if (!$enc and $encoded !~ /\x00/) { # Eliminate hostile NUL character.
|
| 309 |
if ($encoded =~ $NONASCIIRE) { # String contains 8bit char(s).
|
| 310 |
$enc = '8BIT';
|
| 311 |
} elsif ($cset =~ $ISO2022RE) { # ISO-2022-* outputs are 7BIT.
|
| 312 |
$enc = '7BIT';
|
| 313 |
} else { # Pure ASCII.
|
| 314 |
$enc = '7BIT';
|
| 315 |
$cset = 'US-ASCII';
|
| 316 |
}
|
| 317 |
} elsif ($enc eq 'B') {
|
| 318 |
$enc = 'BASE64';
|
| 319 |
} elsif ($enc eq 'Q') {
|
| 320 |
$enc = 'QUOTED-PRINTABLE';
|
| 321 |
} else {
|
| 322 |
$enc = 'BASE64';
|
| 323 |
}
|
| 324 |
return ($encoded, $cset, $enc);
|
| 325 |
}
|
| 326 |
|
| 327 |
=item encoded_header_len STRING, ENCODING, CHARSET
|
| 328 |
|
| 329 |
Get length of encoded STRING for message header
|
| 330 |
(without folding).
|
| 331 |
|
| 332 |
ENCODING may be one of C<"B">, C<"Q"> or C<"S"> (shorter
|
| 333 |
one of either C<"B"> or C<"Q">).
|
| 334 |
|
| 335 |
=cut
|
| 336 |
|
| 337 |
sub encoded_header_len($$$) {
|
| 338 |
my $s = shift;
|
| 339 |
my $encoding = uc(shift);
|
| 340 |
my $charset = shift;
|
| 341 |
|
| 342 |
my $enclen;
|
| 343 |
if ($encoding eq 'Q') {
|
| 344 |
$enclen = _enclen_Q($s);
|
| 345 |
} elsif ($encoding eq "S") {
|
| 346 |
my ($b, $q) = (_enclen_B($s), _enclen_Q($s));
|
| 347 |
$enclen = ($b < $q)? $b: $q;
|
| 348 |
} else { # "B"
|
| 349 |
$enclen = _enclen_B($s);
|
| 350 |
}
|
| 351 |
|
| 352 |
length($charset)+$enclen+7;
|
| 353 |
}
|
| 354 |
|
| 355 |
sub _enclen_B($) {
|
| 356 |
int((length(shift) + 2) / 3) * 4;
|
| 357 |
}
|
| 358 |
|
| 359 |
sub _enclen_Q($) {
|
| 360 |
my $s = shift;
|
| 361 |
my @o;
|
| 362 |
@o = ($s =~ /(\?|=|_|[^ \x21-\x7e])/gos);
|
| 363 |
length($s) + scalar(@o) * 2;
|
| 364 |
}
|
| 365 |
|
| 366 |
=item header_encode STRING, CHARSET [, OPTS]
|
| 367 |
|
| 368 |
Get converted (if needed) data of STRING and recommended encoding scheme of
|
| 369 |
that data for message headers. CHARSET is the charset by which STRING
|
| 370 |
is encoded.
|
| 371 |
|
| 372 |
OPTS may accept following key-value pairs.
|
| 373 |
B<NOTE>:
|
| 374 |
When Unicode/multibyte support is disabled (see L<"USE_ENCODE">),
|
| 375 |
conversion will not be performed. So these options do not have any effects.
|
| 376 |
|
| 377 |
=over 4
|
| 378 |
|
| 379 |
=item Replacement => REPLACEMENT
|
| 380 |
|
| 381 |
Specifies error handling scheme. See L<"ERROR HANDLING">.
|
| 382 |
|
| 383 |
=item Detect7bit => YESNO
|
| 384 |
|
| 385 |
Try auto-detecting 7-bit charset when CHARSET is not given.
|
| 386 |
Default is C<"YES">.
|
| 387 |
|
| 388 |
=back
|
| 389 |
|
| 390 |
3-item list of (I<converted string>, I<charset for output>,
|
| 391 |
I<encoding scheme>) will be returned. I<Encoding scheme> will be
|
| 392 |
either C<"B">, C<"Q"> or C<undef> (might not be encoded).
|
| 393 |
If I<charset for output> could not be determined and I<converted string>
|
| 394 |
contains non-ASCII byte(s), I<charset for output> will be C<"8BIT">
|
| 395 |
(this is I<not> charset name but a special value to represent unencodable
|
| 396 |
data) and I<encoding scheme> will be C<undef> (should not be encoded).
|
| 397 |
I<Charset for output> will be C<"US-ASCII"> if and only if string does not
|
| 398 |
contain any non-ASCII bytes.
|
| 399 |
|
| 400 |
=back
|
| 401 |
|
| 402 |
=cut
|
| 403 |
|
| 404 |
sub header_encode {
|
| 405 |
my ($encoded, $charset, $cset) = &_text_encode(@_);
|
| 406 |
return ($encoded, '8BIT', undef) unless $cset;
|
| 407 |
|
| 408 |
# Determine encoding scheme.
|
| 409 |
my $enc;
|
| 410 |
my $dummy = $encoded;
|
| 411 |
eval {
|
| 412 |
from_to($dummy, $cset, "US-ASCII", FB_CROAK());
|
| 413 |
};
|
| 414 |
if (!$@ and $dummy eq $encoded) {
|
| 415 |
$cset = "US-ASCII";
|
| 416 |
$enc = undef;
|
| 417 |
} else {
|
| 418 |
$@ = '';
|
| 419 |
$enc = &header_encoding($charset);
|
| 420 |
}
|
| 421 |
|
| 422 |
if (!$enc and $encoded !~ $NONASCIIRE) {
|
| 423 |
unless ($cset =~ $ISO2022RE) { # ISO-2022-* outputs are 7BIT.
|
| 424 |
$cset = 'US-ASCII';
|
| 425 |
}
|
| 426 |
} elsif ($enc eq 'S') {
|
| 427 |
if (&encoded_header_len($encoded, "B", $cset) <
|
| 428 |
&encoded_header_len($encoded, "Q", $cset)) {
|
| 429 |
$enc = 'B';
|
| 430 |
} else {
|
| 431 |
$enc = 'Q';
|
| 432 |
}
|
| 433 |
} elsif ($enc !~ /^[BQ]$/) {
|
| 434 |
$enc = 'B';
|
| 435 |
}
|
| 436 |
return ($encoded, $cset, $enc);
|
| 437 |
}
|
| 438 |
|
| 439 |
sub _text_encode {
|
| 440 |
my $s = shift;
|
| 441 |
my $charset = &canonical_charset(shift);
|
| 442 |
my %params = @_;
|
| 443 |
my $replacement = uc($params{'Replacement'}) || "DEFAULT";
|
| 444 |
my $detect7bit = uc($params{'Detect7bit'}) || "YES";
|
| 445 |
|
| 446 |
if (!$charset) {
|
| 447 |
if ($s =~ $NONASCIIRE) {
|
| 448 |
return ($s, undef, undef);
|
| 449 |
} elsif ($detect7bit ne "NO") {
|
| 450 |
$charset = &_detect_7bit_charset($s);
|
| 451 |
} else {
|
| 452 |
$charset = $DEFAULT_CHARSET;
|
| 453 |
}
|
| 454 |
}
|
| 455 |
|
| 456 |
# Unknown charset.
|
| 457 |
return ($s, $charset, $charset)
|
| 458 |
unless resolve_alias($charset);
|
| 459 |
|
| 460 |
# Encode data by output charset if required. If failed, fallback to
|
| 461 |
# fallback charset.
|
| 462 |
my $cset = &output_charset($charset);
|
| 463 |
my $encoded;
|
| 464 |
|
| 465 |
if (is_utf8($s) or $s =~ /[^\x00-\xFF]/) {
|
| 466 |
if ($replacement =~ /^(?:CROAK|STRICT|FALLBACK)$/) {
|
| 467 |
eval {
|
| 468 |
$encoded = $s;
|
| 469 |
$encoded = encode($cset, $encoded, FB_CROAK());
|
| 470 |
};
|
| 471 |
if ($@) {
|
| 472 |
if ($replacement eq "FALLBACK" and $FALLBACK_CHARSET) {
|
| 473 |
$cset = $FALLBACK_CHARSET;
|
| 474 |
$encoded = $s;
|
| 475 |
$encoded = encode($cset, $encoded);
|
| 476 |
$charset = $cset;
|
| 477 |
} else {
|
| 478 |
croak $@;
|
| 479 |
}
|
| 480 |
}
|
| 481 |
} elsif ($replacement eq "PERLQQ") {
|
| 482 |
$encoded = encode($cset, $s, FB_PERLQQ());
|
| 483 |
} elsif ($replacement eq "HTMLCREF") {
|
| 484 |
$encoded = encode($cset, $s, FB_HTMLCREF());
|
| 485 |
} elsif ($replacement eq "XMLCREF") {
|
| 486 |
$encoded = encode($cset, $s, FB_XMLCREF());
|
| 487 |
} else {
|
| 488 |
$encoded = encode($cset, $s);
|
| 489 |
}
|
| 490 |
} elsif ($charset ne $cset) {
|
| 491 |
$encoded = $s;
|
| 492 |
if ($replacement =~ /^(?:CROAK|STRICT|FALLBACK)$/) {
|
| 493 |
eval {
|
| 494 |
from_to($encoded, $charset, $cset, FB_CROAK());
|
| 495 |
};
|
| 496 |
if ($@) {
|
| 497 |
if ($replacement eq "FALLBACK" and $FALLBACK_CHARSET) {
|
| 498 |
$cset = $FALLBACK_CHARSET;
|
| 499 |
$encoded = $s;
|
| 500 |
from_to($encoded, $charset, $cset);
|
| 501 |
$charset = $cset;
|
| 502 |
} else {
|
| 503 |
croak $@;
|
| 504 |
}
|
| 505 |
}
|
| 506 |
} elsif ($replacement eq "PERLQQ") {
|
| 507 |
from_to($encoded, $charset, $cset, FB_PERLQQ());
|
| 508 |
} elsif ($replacement eq "HTMLCREF") {
|
| 509 |
from_to($encoded, $charset, $cset, FB_HTMLCREF());
|
| 510 |
} elsif ($replacement eq "XMLCREF") {
|
| 511 |
from_to($encoded, $charset, $cset, FB_XMLCREF());
|
| 512 |
} else {
|
| 513 |
from_to($encoded, $charset, $cset);
|
| 514 |
}
|
| 515 |
} else {
|
| 516 |
$encoded = $s;
|
| 517 |
}
|
| 518 |
|
| 519 |
return ($encoded, $charset, $cset);
|
| 520 |
}
|
| 521 |
|
| 522 |
sub _detect_7bit_charset($) {
|
| 523 |
return $DEFAULT_CHARSET unless USE_ENCODE;
|
| 524 |
my $s = shift;
|
| 525 |
return $DEFAULT_CHARSET unless $s;
|
| 526 |
|
| 527 |
# Try to detect ISO-2022-* escape sequences.
|
| 528 |
foreach (@ISO2022_SEQ) {
|
| 529 |
my ($seq, $cset) = @$_;
|
| 530 |
if (index($s, $seq) >= 0) {
|
| 531 |
eval {
|
| 532 |
my $dummy = $s;
|
| 533 |
decode($cset, $dummy, FB_CROAK());
|
| 534 |
};
|
| 535 |
if ($@) {
|
| 536 |
next;
|
| 537 |
}
|
| 538 |
return $cset;
|
| 539 |
}
|
| 540 |
}
|
| 541 |
|
| 542 |
# How about HZ, VIQR, ...?
|
| 543 |
|
| 544 |
return $DEFAULT_CHARSET;
|
| 545 |
}
|
| 546 |
|
| 547 |
=head2 MANIPULATING MODULE DEFAULTS
|
| 548 |
|
| 549 |
=over 4
|
| 550 |
|
| 551 |
=item alias ALIAS [, CHARSET]
|
| 552 |
|
| 553 |
Get/set charset alias for canonical names determined by
|
| 554 |
L<"canonical_charset">.
|
| 555 |
|
| 556 |
If CHARSET is given and isn't false, ALIAS will be assigned as an alias of
|
| 557 |
CHARSET. Otherwise, alias won't be changed. In both cases,
|
| 558 |
current charset name that ALIAS is assigned will be returned.
|
| 559 |
|
| 560 |
=cut
|
| 561 |
|
| 562 |
sub alias ($;$) {
|
| 563 |
my $alias = uc(shift);
|
| 564 |
my $charset = uc(shift);
|
| 565 |
|
| 566 |
return $CHARSET_ALIASES{$alias} unless $charset;
|
| 567 |
|
| 568 |
$CHARSET_ALIASES{$alias} = $charset;
|
| 569 |
return $charset;
|
| 570 |
}
|
| 571 |
|
| 572 |
=item default [CHARSET]
|
| 573 |
|
| 574 |
Get/set default charset.
|
| 575 |
|
| 576 |
B<Default charset> is used by this module when charset context is
|
| 577 |
unknown. Modules using this module are recommended to use this
|
| 578 |
charset when charset context is unknown or implicit default is
|
| 579 |
expected. By default, it is C<"US-ASCII">.
|
| 580 |
|
| 581 |
If CHARSET is given and isn't false, it will be set to default charset.
|
| 582 |
Otherwise, default charset won't be changed. In both cases,
|
| 583 |
current default charset will be returned.
|
| 584 |
|
| 585 |
B<NOTE>: Default charset I<should not> be changed.
|
| 586 |
|
| 587 |
=cut
|
| 588 |
|
| 589 |
sub default(;$) {
|
| 590 |
my $charset = &canonical_charset(shift);
|
| 591 |
|
| 592 |
if ($charset) {
|
| 593 |
croak "Unknown charset '$charset'"
|
| 594 |
unless resolve_alias($charset);
|
| 595 |
$DEFAULT_CHARSET = $charset;
|
| 596 |
}
|
| 597 |
return $DEFAULT_CHARSET;
|
| 598 |
}
|
| 599 |
|
| 600 |
=item fallback [CHARSET]
|
| 601 |
|
| 602 |
Get/set fallback charset.
|
| 603 |
|
| 604 |
B<Fallback charset> is used by this module when conversion by given
|
| 605 |
charset is failed and C<"FALLBACK"> error handling scheme is specified.
|
| 606 |
Modules using this module may use this charset as last resort of charset
|
| 607 |
for conversion. By default, it is C<"UTF-8">.
|
| 608 |
|
| 609 |
If CHARSET is given and isn't false, it will be set to fallback charset.
|
| 610 |
If CHARSET is C<"NONE">, fallback charset will be undefined.
|
| 611 |
Otherwise, fallback charset won't be changed. In any cases,
|
| 612 |
current fallback charset will be returned.
|
| 613 |
|
| 614 |
B<NOTE>: It I<is> useful that C<"US-ASCII"> is specified as fallback charset,
|
| 615 |
since result of conversion will be readable without charset informations.
|
| 616 |
|
| 617 |
=cut
|
| 618 |
|
| 619 |
sub fallback(;$) {
|
| 620 |
my $charset = &canonical_charset(shift);
|
| 621 |
|
| 622 |
if ($charset eq "NONE") {
|
| 623 |
$FALLBACK_CHARSET = undef;
|
| 624 |
} elsif ($charset) {
|
| 625 |
croak "Unknown charset '$charset'"
|
| 626 |
unless resolve_alias($charset);
|
| 627 |
$FALLBACK_CHARSET = $charset;
|
| 628 |
}
|
| 629 |
return $FALLBACK_CHARSET;
|
| 630 |
}
|
| 631 |
|
| 632 |
=item recommended CHARSET [, HEADERENC, BODYENC [, ENCCHARSET]]
|
| 633 |
|
| 634 |
Get/set charset profiles.
|
| 635 |
|
| 636 |
If optional arguments are given and any of them are not false, profiles
|
| 637 |
for CHARSET will be set by those arguments. Otherwise, profiles
|
| 638 |
won't be changed. In both cases, current profiles for CHARSET will be
|
| 639 |
returned as 3-item list of (HEADERENC, BODYENC, ENCCHARSET).
|
| 640 |
|
| 641 |
HEADERENC is recommended encoding scheme for message header.
|
| 642 |
It may be one of C<"B">, C<"Q">, C<"S"> (shorter one of either) or
|
| 643 |
C<undef> (might not be encoded).
|
| 644 |
|
| 645 |
BODYENC is recommended transfer-encoding for message body. It may be
|
| 646 |
one of C<"B">, C<"Q"> or C<undef> (might not be transfer-encoded).
|
| 647 |
|
| 648 |
ENCCHARSET is a charset which is compatible with given CHARSET and
|
| 649 |
is recommended to be used for MIME messages on Internet.
|
| 650 |
If conversion is not needed (or this module doesn't know appropriate
|
| 651 |
charset), ENCCHARSET is C<undef>.
|
| 652 |
|
| 653 |
B<NOTE>: This function in the future releases can accept more optional
|
| 654 |
arguments (for example, properties to handle character widths, line folding
|
| 655 |
behavior, ...). So format of returned value may probably be changed.
|
| 656 |
Use L<"header_encoding">, L<"body_encoding"> or L<"output_charset"> to get
|
| 657 |
particular profile.
|
| 658 |
|
| 659 |
=cut
|
| 660 |
|
| 661 |
sub recommended ($;$;$;$) {
|
| 662 |
my $charset = &canonical_charset(shift);
|
| 663 |
my $henc = uc(shift) || undef;
|
| 664 |
my $benc = uc(shift) || undef;
|
| 665 |
my $cset = &canonical_charset(shift);
|
| 666 |
|
| 667 |
croak "CHARSET is not specified" unless $charset;
|
| 668 |
croak "Unknown header encoding" unless !$henc or $henc =~ /^[BQS]$/;
|
| 669 |
croak "Unknown body encoding" unless !$benc or $benc =~ /^[BQ]$/;
|
| 670 |
|
| 671 |
if ($henc or $benc or $cset) {
|
| 672 |
$cset = undef if $charset eq $cset;
|
| 673 |
my @spec = ($henc, $benc, USE_ENCODE? $cset: undef);
|
| 674 |
$CHARSETS{$charset} = \@spec;
|
| 675 |
return @spec;
|
| 676 |
} else {
|
| 677 |
my $spec = $CHARSETS{$charset};
|
| 678 |
if ($spec) {
|
| 679 |
return ($$spec[0], $$spec[1], USE_ENCODE? $$spec[2]: undef);
|
| 680 |
} else {
|
| 681 |
return ('S', 'B', undef);
|
| 682 |
}
|
| 683 |
}
|
| 684 |
}
|
| 685 |
|
| 686 |
=head2 CONSTANTS
|
| 687 |
|
| 688 |
=item USE_ENCODE
|
| 689 |
|
| 690 |
Unicode/multibyte support flag.
|
| 691 |
Non-null string will be set when Unicode and multibyte support is enabled.
|
| 692 |
Currently, this flag will be non-null on Perl 5.8.1 or later and
|
| 693 |
null string on earlier versions of Perl.
|
| 694 |
|
| 695 |
=head2 ERROR HANDLING
|
| 696 |
|
| 697 |
L<"body_encode"> and L<"header_encode"> accept following C<Replacement>
|
| 698 |
options:
|
| 699 |
|
| 700 |
=item C<"DEFAULT">
|
| 701 |
|
| 702 |
Put a substitution character in place of a malformed character.
|
| 703 |
For UCM-based encodings, <subchar> will be used.
|
| 704 |
|
| 705 |
=item C<"FALLBACK">
|
| 706 |
|
| 707 |
Try C<"DEFAULT"> scheme using I<fallback charset> (see L<"fallback">).
|
| 708 |
When fallback charset is undefined and conversion causes error,
|
| 709 |
code will die on error with an error message.
|
| 710 |
|
| 711 |
=item C<"CROAK">
|
| 712 |
|
| 713 |
Code will die on error immediately with an error message.
|
| 714 |
Therefore, you should trap the fatal error with eval{} unless you
|
| 715 |
really want to let it die on error.
|
| 716 |
Synonym is C<"STRICT">.
|
| 717 |
|
| 718 |
=item C<"PERQQ">
|
| 719 |
|
| 720 |
=item C<"HTMLCREF">
|
| 721 |
|
| 722 |
=item C<"XMLCREF">
|
| 723 |
|
| 724 |
Use C<FB_PERLQQ>, C<FB_HTMLCREF> or C<FB_XMLCREF>
|
| 725 |
scheme defined by L<Encode> module.
|
| 726 |
|
| 727 |
=back
|
| 728 |
|
| 729 |
If error handling scheme is not specified or unknown scheme is specified,
|
| 730 |
C<"DEFAULT"> will be assumed.
|
| 731 |
|
| 732 |
=head1 SEE ALSO
|
| 733 |
|
| 734 |
Multipurpose Internet Mail Extensions (MIME).
|
| 735 |
|
| 736 |
=head1 AUTHORS
|
| 737 |
|
| 738 |
Copyright (C) 2006 Hatuka*nezumi - IKEDA Soji <hatuka(at)nezumi.nu>.
|
| 739 |
|
| 740 |
All rights reserved. This program is free software; you can redistribute
|
| 741 |
it and/or modify it under the same terms as Perl itself.
|
| 742 |
|
| 743 |
=cut
|
| 744 |
|
| 745 |
1;
|