NAME
Alvis::Encoding - Perl extension for guessing and checking the encoding of documents.
SYNOPSIS
use Alvis::Encoding;
# Create a new instance
my $e=Alvis::Encoding->new();
if (!defined($e))
{
die "Instantiating Alvis::Encoding failed.";
}
# Check that a (decimal) character code is legal UTF-8
my $code=55;
if (!$e->code_is_utf8($code))
{
# The message will contain the position and the offending character's code
die $e->errmsg();
}
# Check that a text is legal UTF-8
my $text;
if (!$e->is_utf8($text))
{
# The message will contain the position and the offending character's code
die $e->errmsg();
}
# If you need to obtain the position (1..) and the offending character code,
# pass a placeholder in a hash ref argument:
my %err=();
if (!$e->is_utf8($text,\%err))
{
my $position=$err{pos};
my $code=$err{code};
. . .
}
#
# Guess the encoding of a document given a guess for its type
#
my $type_guesser=Alvis::Document::Type->new();
my ($doc_type,$doc_sub_type)=$type_guesser->guess($text);
my $doc_encoding=$e->guess($text,$doc_type,$doc_sub_type);
if (!defined($doc_encoding))
{
die('Cannot guess. ' . $e->errmsg());
}
#
# Try converting a document to UTF-8 with only its type known
#
my $type_guesser=Alvis::Document::Type->new();
my ($doc_type,$doc_sub_type)=$type_guesser->guess($text);
my $doc_in_utf8=$e->try_to_convert_to_utf8($text,$doc_type,$doc_sub_type);
if (!defined($doc_in_utf8))
{
die('Cannot guess. ' . $e->errmsg());
}
# Try to guess what was meant
my @possibilities=$e->guess_typo_fixes('uft-8');
DESCRIPTION
A collection of methods for guessing, confirming and fixing the encoding of a document.
METHODS
new()
Options:
defaultDocType default type for a document. Default: text.
defaultDocSubType default sub type for a document. Default: html.
defaultEncoding default encoding for a document. Default: iso-8859-1.
code_is_utf8(decimal_code)
Returns 1 if the (decimal) character code is legal UTF-8.
is_utf8(text,err_hash_ref)
Returns 1 if all of the characters of $text are legal UTF-8. Otherwise, returns 0 and sets an error message specifying the location (1..) of the first illegal character code. If you wish to obtain the position and offending code, pass a hash ref ($err_hash_ref). The info is in $err_hash_ref->{pos} and $err_hash_ref->{code}.
guess(text,doc_type,doc_sub_type)
Guess the encoding of a document given a guess for its type (and subtype).
guess_and_convert(text,doc_type,doc_sub_type,target_encoding)
First tries to guess the encoding of the document given a guess at its type and subtype, and then tries to convert it to $target_encoding.
convert(text,source_encoding,target_encoding)
Tries to convert $text from $source_encoding to $target_encoding.
guess_typo_fixes($typo)
Returns a set of guesses for the intended encoding when an encoding name contains typos.
errmsg()
Returns a stack of error messages, if any. Empty string otherwise.
SEE ALSO
Alvis::Document::Type
AUTHOR
Kimmo Valtonen, <kimmo.valtonen@hiit.fi>
COPYRIGHT AND LICENSE
Copyright (C) 2006 by Kimmo Valtonen
This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have available.