Skip to content

Commit

Permalink
fix(Kalaclista::UserAgent): fix broken charset detection for HTML
Browse files Browse the repository at this point in the history
  • Loading branch information
nyarla committed Dec 21, 2023
1 parent 3a5f832 commit ddbd722
Showing 1 changed file with 21 additions and 21 deletions.
42 changes: 21 additions & 21 deletions lib/Kalaclista/UserAgent.pm
Original file line number Diff line number Diff line change
Expand Up @@ -30,44 +30,44 @@ class Kalaclista::UserAgent::Response {
method decoded_content {
return $decoded if defined $decoded;

my $name;
my $name = q{};
my $id = HTML5::DOM::Encoding->NOT_DETERMINED;

# in html
if ( $headers->{'content-type'} =~ m{html}i ) {
my $id = HTML5::DOM::Encoding->NOT_DETERMINED;

# detect encoding from content
# detect from content
my $content_type = $headers->{'content-type'};
if ( $content_type =~ m{html}i ) {
for my $method (qw<detectByPrescanStream detectByCharset detectBomAndCut detect>) {
my $func = HTML5::DOM::Encoding->can($method);
$id = $func->($content);
if ( $id != HTML5::DOM::Encoding->NOT_DETERMINED ) {
$name = HTML5::DOM::Encoding::id2name($name);
last;
}
}

# detect from content type
if ( HTML5::DOM::Encoding->NOT_DETERMINED ) {
my $fragment = qq{<meta http-equiv="content-type" content="@{[ $headers->{'content-type'} ]}">};
$id = HTML5::DOM::Encoding::detectByPrescanStream($fragment);
}

# force use utf8
$id = HTML5::DOM::Encoding->UTF_8;
$name = HTML5::DOM::Encoding::id2name($id);
}
elsif ( $headers->{'content-type'} =~ m{xml} ) {

# detect from xml header
elsif ( $content_type =~ m{xml}i ) {
$name = ( $content =~ m{encoding="([^"]+)"} )[0];
if ( !$name ) {
$name = ( $content =~ m{encoding='([^']+)'} )[0];
}
}

if ( !$name ) {
$name = 'UTF-8';
# detect from content-type header
if ( $name eq q{} ) {
my $fragment = qq{<meta http-equiv="content-type" content="@{[ $headers->{'content-type'} ]}">};
$id = HTML5::DOM::Encoding::detectByCharset($fragment);
if ( $id != HTML5::DOM::Encoding->NOT_DETERMINED ) {
$name = HTML5::DOM::Encoding::id2name($id);
}
}

# force fallback to UTF-8
if ( $name eq q{} ) {
$id = HTML5::DOM::Encoding->UTF_8;
$name = HTML5::DOM::Encoding::id2name($id);
}

# decode by encoding name
$decoded = Encode::decode( $name, $content );

return $decoded;
Expand Down

0 comments on commit ddbd722

Please sign in to comment.