diff --git a/MANIFEST b/MANIFEST index 679c4d06e..461bff6b2 100644 --- a/MANIFEST +++ b/MANIFEST @@ -26,6 +26,8 @@ lib/Zonemaster/Engine/Logger.pm lib/Zonemaster/Engine/Logger/Entry.pm lib/Zonemaster/Engine/Nameserver.pm lib/Zonemaster/Engine/Nameserver/Cache.pm +lib/Zonemaster/Engine/Normalization.pm +lib/Zonemaster/Engine/Normalization/Error.pm lib/Zonemaster/Engine/Net/IP.pm lib/Zonemaster/Engine/NSArray.pm lib/Zonemaster/Engine/Overview.pod @@ -74,6 +76,7 @@ t/nameserver-axfr.data t/nameserver-axfr.t t/nameserver.data t/nameserver.t +t/normalization.t t/old-bugs.data t/old-bugs.t t/pod-coverage.t diff --git a/lib/Zonemaster/Engine/Normalization.pm b/lib/Zonemaster/Engine/Normalization.pm new file mode 100644 index 000000000..b0bbf4324 --- /dev/null +++ b/lib/Zonemaster/Engine/Normalization.pm @@ -0,0 +1,214 @@ +package Zonemaster::Engine::Normalization; + +use 5.014002; + +use utf8; +use strict; +use warnings; + +use parent 'Exporter'; + +use Carp; +use Encode; +use Readonly; +use Try::Tiny; +use Zonemaster::LDNS; + +use Zonemaster::Engine::Normalization::Error; + + +=head1 NAME + +Zonemaster::Engine::Normalization - utility functions for names normalization + + +=head1 SYNOPSIS + + use Zonemaster::Engine::Normalization; + + my ($errors, $final_domain) = normalize_name($domain); + +=head1 EXPORTED FUNCTIONS + +=over +=cut + + +our @EXPORT = qw[ normalize_name ]; +our @EXPORT_OK = qw[ normalize_name normalize_label ]; + +Readonly my $ASCII => qr/^[[:ascii:]]+$/; +Readonly my $VALID_ASCII => qr(^[A-Za-z0-9/_-]+$); + +Readonly my $ASCII_FULL_STOP => "\x{002E}"; +Readonly my $ASCII_FULL_STOP_RE => qr/\x{002E}/; +Readonly my %FULL_STOPS => ( + FULLWIDTH_FULL_STOP => q/\x{FF0E}/, + IDEOGRAPHIC_FULL_STOP => q/\x{3002}/, + HALFWIDTH_IDEOGRAPHIC_FULL_STOP => q/\x{FF61}/ +); +Readonly my $FULL_STOPS_RE => (sub { + my $re = '[' . (join '', values %FULL_STOPS) . ']'; + return qr/$re/; +})->(); +Readonly my %WHITE_SPACES => ( + SPACE => q/\x{0020}/, + CHARACTER_TABULATION => q/\x{0009}/, + NO_BREAK_SPACE => q/\x{00A0}/, + EN_QUAD => q/\x{2000}/, + EM_QUAD => q/\x{2001}/, + EN_SPACE => q/\x{2002}/, + EM_SPACE => q/\x{2003}/, + THREE_PER_EM_SPACE => q/\x{2004}/, + FOUR_PER_EM_SPACE => q/\x{2005}/, + SIX_PER_EM_SPACE => q/\x{2006}/, + FIGURE_SPACE => q/\x{2007}/, + PUNCTUATION_SPACE => q/\x{2008}/, + THIN_SPACE => q/\x{2009}/, + HAIR_SPACE => q/\x{200A}/, + MEDIUM_MATHEMATICAL_SPACE => q/\x{205F}/, + IDEOGRAPHIC_SPACE => q/\x{3000}/, + OGHAM_SPACE_MARK => q/\x{1680}/, +); +Readonly my $WHITE_SPACES_RE => (sub { + my $re = '[' . (join '', values %WHITE_SPACES) . ']'; + return qr/$re/; +})->(); +Readonly my %AMBIGUOUS_CHARACTERS => ( + "LATIN CAPITAL LETTER I WITH DOT ABOVE" => q/\x{0130}/, +); + + + +=item normalize_label($label) + +Normalize a single label from a domain name. + +If the label is ASCII only, it is down cased, else it is converted according +to IDNA2008. + +Downcasing of upper case non-ASCII characters, normalization to the Unicode +NFC format and conversion from U-label to A-label is performed by libidn2 +using L. + +Returns a tuple C<($errors: ArrayRef[Zonemaster::Engine::Normalization::Error], $alabel: String)>. + +In case of errors, the returned label will be undefined. If the method +succeeded an empty error array is returned. + +=cut + +sub normalize_label { + my ( $label ) = @_; + my @messages; + + my $alabel = ""; + + if ( $label =~ $VALID_ASCII ) { + $alabel = lc $label; + } elsif ( $label =~ $ASCII ) { + push @messages, Zonemaster::Engine::Normalization::Error->new(INVALID_ASCII => {label => $label}); + + return \@messages, undef; + } elsif ( Zonemaster::LDNS::has_idn ) { + try { + $alabel = Zonemaster::LDNS::to_idn($label); + } catch { + push @messages, Zonemaster::Engine::Normalization::Error->new(INVALID_U_LABEL => {label => $label}); + + return \@messages, undef; + } + } else { + croak 'The domain name contains at least one non-ASCII character and this installation of Zonemaster has no support for IDNA.'; + } + + if ( length($alabel) > 63 ) { + push @messages, Zonemaster::Engine::Normalization::Error->new(LABEL_TOO_LONG => {label => $label}); + return \@messages, undef; + } + + return \@messages, $alabel; +} + +=item normalize_name($name) + +Normalize a domain name. + + +The normalization process is detailed in the L. + +Returns a tuple C<($errors: ArrayRef[Zonemaster::Engine::Normalization::Error], $name: String)>. + +In case of errors, the returned name will be undefined. If the method succeeded +an empty error array is returned. + +=cut + +sub normalize_name { + my ( $uname ) = @_; + my @messages; + + $uname =~ s/^${$WHITE_SPACES_RE}+//; + $uname =~ s/${WHITE_SPACES_RE}+$//; + + if ( length($uname) == 0 ) { + push @messages, Zonemaster::Engine::Normalization::Error->new(EMPTY_DOMAIN_NAME => {}); + return \@messages, undef; + } + + foreach my $char_name ( keys %AMBIGUOUS_CHARACTERS ) { + my $char = $AMBIGUOUS_CHARACTERS{$char_name}; + if ( $uname =~ m/${char}/) { + push @messages, Zonemaster::Engine::Normalization::Error->new(AMBIGUOUS_DOWNCASING => { unicode_name => $char_name }); + } + } + + if ( @messages ) { + return \@messages, undef; + } + + $uname =~ s/${FULL_STOPS_RE}/${ASCII_FULL_STOP}/g; + + if ( $uname eq $ASCII_FULL_STOP ) { + return \@messages, $uname; + } + + if ( $uname =~ m/^${ASCII_FULL_STOP_RE}/ ) { + push @messages, Zonemaster::Engine::Normalization::Error->new(INITIAL_DOT => {}); + return \@messages, undef; + } + + if ( $uname =~ m/${ASCII_FULL_STOP_RE}{2,}/ ) { + push @messages, Zonemaster::Engine::Normalization::Error->new(REPEATED_DOTS => {}); + return \@messages, undef; + } + + $uname =~ s/${ASCII_FULL_STOP_RE}$//g; + + my @labels = split $ASCII_FULL_STOP_RE, $uname; + my @label_results = map { [ normalize_label($_) ] } @labels; + my @label_errors = map { @{$_->[0]} } @label_results; + + push @messages, @label_errors; + + if ( @messages ) { + return \@messages, undef; + } + + my @label_ok = map { $_->[1] } @label_results; + + my $final_name = join '.', @label_ok; + + if ( length($final_name) > 253 ) { + push @messages, Zonemaster::Engine::Normalization::Error->new(DOMAIN_NAME_TOO_LONG => {}); + return \@messages, undef; + } + + return \@messages, $final_name; +} + + +=back +=cut + +1; diff --git a/lib/Zonemaster/Engine/Normalization/Error.pm b/lib/Zonemaster/Engine/Normalization/Error.pm new file mode 100644 index 000000000..ba30fd4c7 --- /dev/null +++ b/lib/Zonemaster/Engine/Normalization/Error.pm @@ -0,0 +1,147 @@ +package Zonemaster::Engine::Normalization::Error; + +use strict; +use warnings; + +use Carp; +use Readonly; +use Locale::TextDomain qw[Zonemaster-Engine]; + +use overload '""' => \&string; + + +=head1 NAME + +Zonemaster::Engine::Normalization::Error - normalization error class + + +=head1 SYNOPSIS + + use Zonemaster::Engine::Normalization::Error; + + my $error = Zonemaster::Engine::Normalization::Error->new(LABEL_TOO_LONG => {label => $label}); + +=cut + + +Readonly my %ERRORS => ( + AMBIGUOUS_DOWNCASING => { + message => N__ 'Ambiguous downcaseing of character "{unicode_name}" in the domain name. Use all lower case instead.', + arguments => [ 'unicode_name' ] + }, + DOMAIN_NAME_TOO_LONG => { + message => N__ 'Domain name is too long (more than 253 characters with no final dot).', + }, + EMPTY_DOMAIN_NAME => { + message => N__ 'Domain name is empty.' + }, + INITIAL_DOT => { + message => N__ 'Domain name starts with dot.' + }, + INVALID_ASCII => { + message => N__ 'Domain name has an ASCII label ("{label}") with a character not permitted.', + arguments => [ 'label' ] + }, + INVALID_U_LABEL => { + message => N__ 'Domain name has a non-ASCII label ("{label}") which is not a valid U-label.', + arguments => [ 'label' ] + }, + REPEATED_DOTS => { + message => N__ 'Domain name has repeated dots.' + }, + LABEL_TOO_LONG => { + message => N__ 'Domain name has a label that is too long (more than 63 characters), "{label}".', + arguments => [ 'label' ] + }, +); + +=head1 ATTRIBUTES + +=over + +=item tag + +The message tag associated to the error. + +=item params + +The error message parameters to use in the message string. + +=back + +=head1 METHODS + +=over + +=item new($tag, $params) + +Creates and returns a new error object. +This function will croak if there is a missing parameter for the given tag. + +=cut + +sub new { + my ( $proto, $tag, $params ) = @_; + my $class = ref $proto || $proto; + + if ( !exists $ERRORS{$tag} ) { + croak 'Unknown error tag.'; + } + + my $obj = { tag => $tag, params => {} }; + + if ( exists $ERRORS{$tag}->{arguments} ) { + foreach my $arg ( @{$ERRORS{$tag}->{arguments}} ) { + if ( !exists $params->{$arg} ) { + croak "Missing arguments $arg."; + } + $obj->{params}->{$arg} = $params->{$arg}; + } + } + + return bless $obj, $class; +} + + +=item message + +Returns the translated error message using the parameters given when creating the object. + +=cut + +sub message { + my ( $self ) = @_; + return __x $ERRORS{$self->{tag}}->{message}, %{$self->{params}}; +} + + +=item tag + +Returns the message tag asscociated to the error. + +=cut + +sub tag { + my ( $self ) = @_; + + return $self->{tag}; +} + +=item string + +Returns a string representation of the error object. Equivalent to message(). + +=cut + +sub string { + my ( $self ) = @_; + + return $self->message; +} + + +=back + +=cut + +1; diff --git a/t/normalization.t b/t/normalization.t new file mode 100644 index 000000000..ee3971210 --- /dev/null +++ b/t/normalization.t @@ -0,0 +1,125 @@ +use Test::More; +use Test::Exception; + +use utf8; + +BEGIN { use_ok( 'Zonemaster::Engine::Normalization' ); } + +subtest 'Valid domains' => sub { + my %input_domains = ( + # Roots + '.' => '.', # Full stop + '.' => '.', # Fullwidth full stop + '。' => '.', # Ideographic full stop + '。' => '.', # Halfwidth ideographic full stop + + # Trailing and leading white spaces + " \x{205F} example.com. \x{0009}" => 'example.com', + + # Mixed dots with trailing dot + 'example。com.' => 'example.com', + 'example。com.' => 'example.com', + 'sub.example.com。' => 'sub.example.com', + 'sub.example.com。' => 'sub.example.com', + + # Mixed dots without trailing dot + 'example。com' => 'example.com', + 'example。com' => 'example.com', + 'sub.example.com' => 'sub.example.com', + 'sub.example.com' => 'sub.example.com', + + # Domains with U-Labels + 'café.example.com' => 'xn--caf-dma.example.com', + 'エグザンプル。example。com' => 'xn--ickqs6k2dyb.example.com', + 'αβγδε.example.com' => 'xn--mxacdef.example.com', + + # Domains with uppercase unicode + 'CafÉ.example.com' => 'xn--caf-dma.example.com', + 'ΑβΓΔε.example.com' => 'xn--mxacdef.example.com', + + # All ascii domains (lowercase) + 'example.com' => 'example.com', + '0/28.2.0.192.example.com' => '0/28.2.0.192.example.com', + '_http._tcp.example.com.' => '_http._tcp.example.com', + 'sub-domain.example.com' => 'sub-domain.example.com', + + # All ascii domains with uppercase characters + 'suB-doMaIN.ExamPlE.cOm' => 'sub-domain.example.com', + + # Single label domains + 'test' => 'test', + 'テスト' => 'xn--zckzah', + + # Length limits + "a" x 63 . ".example.com" => "a" x 63 . ".example.com", + # this is 253 characters + ("a" x 15 . ".") x 15 . "b" . ".example.com" => ("a" x 15 . ".") x 15 . "b" . ".example.com", + + # NFC conversion (for each group first is non-NFC, second is equivalent NFC) + "d\x{006F}\x{0308}d" => 'xn--dd-fka', + 'död' => 'xn--dd-fka', + + "aq\x{0307}\x{0323}a" => 'xn--aqa-9dc3l', + "aq\x{0323}\x{0307}a" => 'xn--aqa-9dc3l', + + "aḋ\x{0323}a" => 'xn--aa-rub587y', + "aḍ\x{0307}a" => 'xn--aa-rub587y', + ); + + while (($domain, $expected_output) = each (%input_domains)) { + subtest "Domain: '$domain'" => sub { + my $errors, $final_domain; + lives_ok(sub { + ($errors, $final_domain) = normalize_name($domain); + }, 'correct domain should live'); + is(scalar @{$errors}, 0, 'No error returned') or diag(@{$errors}); + is($final_domain, $expected_output, 'Match expected domain') or diag($final_domain); + } + } +}; + +subtest 'Bad domains' => sub { + my %input_domains = ( + # Empty labels + '.。.' => 'INITIAL_DOT', + 'example。.com.' => 'REPEATED_DOTS', + 'example。com.。' => 'REPEATED_DOTS', + '..example。com' => 'INITIAL_DOT', + + # Bad ascii + 'bad:%;!$.example.com.' => 'INVALID_ASCII', + + # Label to long + "a" x 64 . ".example.com" => 'LABEL_TOO_LONG', + # Length too long after idn conversion (libidn fails) + 'チョコレート' x 8 . 'a' . '.example.com' => 'INVALID_U_LABEL', + # Emoji in names are invalid as per IDNA2008 + '❤️.example.com' => 'INVALID_U_LABEL', + + # Domain to long + # this is 254 characters + ("a" x 15 . ".") x 15 . "bc" . ".example.com" => 'DOMAIN_NAME_TOO_LONG', + + # Empty domain + '' => 'EMPTY_DOMAIN_NAME', + ' ' => 'EMPTY_DOMAIN_NAME', + + # Ambiguous downcasing + 'İ.example.com' => 'AMBIGUOUS_DOWNCASING', + ); + + while (($domain, $error) = each (%input_domains)) { + subtest "Domain: '$domain' ($error)" => sub { + my $output, $messages, $domain; + lives_ok(sub { + ($errors, $final_domain) = normalize_name($domain); + }, 'incorrect domain should live'); + + is($final_domain, undef, 'No domain returned') or diag($final_domain); + is($errors->[0]->tag, $error, 'Correct error is returned') or diag($errors[0]); + note($errors->[0]) + } + } +}; + +done_testing;