-
Notifications
You must be signed in to change notification settings - Fork 33
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1040 from blacksponge/implement-new-basic00
add new basic00 implementation
- Loading branch information
Showing
4 changed files
with
489 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,214 @@ | ||
package Zonemaster::Engine::Normalization; | ||
|
||
use 5.014002; | ||
|
||
use utf8; | ||
use strict; | ||
use warnings; | ||
|
||
use parent 'Exporter'; | ||
|
||
use Carp; | ||
use Encode; | ||
use Readonly; | ||
use Try::Tiny; | ||
use Zonemaster::LDNS; | ||
|
||
use Zonemaster::Engine::Normalization::Error; | ||
|
||
|
||
=head1 NAME | ||
Zonemaster::Engine::Normalization - utility functions for names normalization | ||
=head1 SYNOPSIS | ||
use Zonemaster::Engine::Normalization; | ||
my ($errors, $final_domain) = normalize_name($domain); | ||
=head1 EXPORTED FUNCTIONS | ||
=over | ||
=cut | ||
|
||
|
||
our @EXPORT = qw[ normalize_name ]; | ||
our @EXPORT_OK = qw[ normalize_name normalize_label ]; | ||
|
||
Readonly my $ASCII => qr/^[[:ascii:]]+$/; | ||
Readonly my $VALID_ASCII => qr(^[A-Za-z0-9/_-]+$); | ||
|
||
Readonly my $ASCII_FULL_STOP => "\x{002E}"; | ||
Readonly my $ASCII_FULL_STOP_RE => qr/\x{002E}/; | ||
Readonly my %FULL_STOPS => ( | ||
FULLWIDTH_FULL_STOP => q/\x{FF0E}/, | ||
IDEOGRAPHIC_FULL_STOP => q/\x{3002}/, | ||
HALFWIDTH_IDEOGRAPHIC_FULL_STOP => q/\x{FF61}/ | ||
); | ||
Readonly my $FULL_STOPS_RE => (sub { | ||
my $re = '[' . (join '', values %FULL_STOPS) . ']'; | ||
return qr/$re/; | ||
})->(); | ||
Readonly my %WHITE_SPACES => ( | ||
SPACE => q/\x{0020}/, | ||
CHARACTER_TABULATION => q/\x{0009}/, | ||
NO_BREAK_SPACE => q/\x{00A0}/, | ||
EN_QUAD => q/\x{2000}/, | ||
EM_QUAD => q/\x{2001}/, | ||
EN_SPACE => q/\x{2002}/, | ||
EM_SPACE => q/\x{2003}/, | ||
THREE_PER_EM_SPACE => q/\x{2004}/, | ||
FOUR_PER_EM_SPACE => q/\x{2005}/, | ||
SIX_PER_EM_SPACE => q/\x{2006}/, | ||
FIGURE_SPACE => q/\x{2007}/, | ||
PUNCTUATION_SPACE => q/\x{2008}/, | ||
THIN_SPACE => q/\x{2009}/, | ||
HAIR_SPACE => q/\x{200A}/, | ||
MEDIUM_MATHEMATICAL_SPACE => q/\x{205F}/, | ||
IDEOGRAPHIC_SPACE => q/\x{3000}/, | ||
OGHAM_SPACE_MARK => q/\x{1680}/, | ||
); | ||
Readonly my $WHITE_SPACES_RE => (sub { | ||
my $re = '[' . (join '', values %WHITE_SPACES) . ']'; | ||
return qr/$re/; | ||
})->(); | ||
Readonly my %AMBIGUOUS_CHARACTERS => ( | ||
"LATIN CAPITAL LETTER I WITH DOT ABOVE" => q/\x{0130}/, | ||
); | ||
|
||
|
||
|
||
=item normalize_label($label) | ||
Normalize a single label from a domain name. | ||
If the label is ASCII only, it is down cased, else it is converted according | ||
to IDNA2008. | ||
Downcasing of upper case non-ASCII characters, normalization to the Unicode | ||
NFC format and conversion from U-label to A-label is performed by libidn2 | ||
using L<Zonemaster::LDNS/to_idn($name, ...)>. | ||
Returns a tuple C<($errors: ArrayRef[Zonemaster::Engine::Normalization::Error], $alabel: String)>. | ||
In case of errors, the returned label will be undefined. If the method | ||
succeeded an empty error array is returned. | ||
=cut | ||
|
||
sub normalize_label { | ||
my ( $label ) = @_; | ||
my @messages; | ||
|
||
my $alabel = ""; | ||
|
||
if ( $label =~ $VALID_ASCII ) { | ||
$alabel = lc $label; | ||
} elsif ( $label =~ $ASCII ) { | ||
push @messages, Zonemaster::Engine::Normalization::Error->new(INVALID_ASCII => {label => $label}); | ||
|
||
return \@messages, undef; | ||
} elsif ( Zonemaster::LDNS::has_idn ) { | ||
try { | ||
$alabel = Zonemaster::LDNS::to_idn($label); | ||
} catch { | ||
push @messages, Zonemaster::Engine::Normalization::Error->new(INVALID_U_LABEL => {label => $label}); | ||
|
||
return \@messages, undef; | ||
} | ||
} else { | ||
croak 'The domain name contains at least one non-ASCII character and this installation of Zonemaster has no support for IDNA.'; | ||
} | ||
|
||
if ( length($alabel) > 63 ) { | ||
push @messages, Zonemaster::Engine::Normalization::Error->new(LABEL_TOO_LONG => {label => $label}); | ||
return \@messages, undef; | ||
} | ||
|
||
return \@messages, $alabel; | ||
} | ||
|
||
=item normalize_name($name) | ||
Normalize a domain name. | ||
The normalization process is detailed in the L<normalization document|https://github.com/zonemaster/zonemaster/blob/master/docs/specifications/tests/RequirementsAndNormalizationOfDomainNames.md>. | ||
Returns a tuple C<($errors: ArrayRef[Zonemaster::Engine::Normalization::Error], $name: String)>. | ||
In case of errors, the returned name will be undefined. If the method succeeded | ||
an empty error array is returned. | ||
=cut | ||
|
||
sub normalize_name { | ||
my ( $uname ) = @_; | ||
my @messages; | ||
|
||
$uname =~ s/^${$WHITE_SPACES_RE}+//; | ||
$uname =~ s/${WHITE_SPACES_RE}+$//; | ||
|
||
if ( length($uname) == 0 ) { | ||
push @messages, Zonemaster::Engine::Normalization::Error->new(EMPTY_DOMAIN_NAME => {}); | ||
return \@messages, undef; | ||
} | ||
|
||
foreach my $char_name ( keys %AMBIGUOUS_CHARACTERS ) { | ||
my $char = $AMBIGUOUS_CHARACTERS{$char_name}; | ||
if ( $uname =~ m/${char}/) { | ||
push @messages, Zonemaster::Engine::Normalization::Error->new(AMBIGUOUS_DOWNCASING => { unicode_name => $char_name }); | ||
} | ||
} | ||
|
||
if ( @messages ) { | ||
return \@messages, undef; | ||
} | ||
|
||
$uname =~ s/${FULL_STOPS_RE}/${ASCII_FULL_STOP}/g; | ||
|
||
if ( $uname eq $ASCII_FULL_STOP ) { | ||
return \@messages, $uname; | ||
} | ||
|
||
if ( $uname =~ m/^${ASCII_FULL_STOP_RE}/ ) { | ||
push @messages, Zonemaster::Engine::Normalization::Error->new(INITIAL_DOT => {}); | ||
return \@messages, undef; | ||
} | ||
|
||
if ( $uname =~ m/${ASCII_FULL_STOP_RE}{2,}/ ) { | ||
push @messages, Zonemaster::Engine::Normalization::Error->new(REPEATED_DOTS => {}); | ||
return \@messages, undef; | ||
} | ||
|
||
$uname =~ s/${ASCII_FULL_STOP_RE}$//g; | ||
|
||
my @labels = split $ASCII_FULL_STOP_RE, $uname; | ||
my @label_results = map { [ normalize_label($_) ] } @labels; | ||
my @label_errors = map { @{$_->[0]} } @label_results; | ||
|
||
push @messages, @label_errors; | ||
|
||
if ( @messages ) { | ||
return \@messages, undef; | ||
} | ||
|
||
my @label_ok = map { $_->[1] } @label_results; | ||
|
||
my $final_name = join '.', @label_ok; | ||
|
||
if ( length($final_name) > 253 ) { | ||
push @messages, Zonemaster::Engine::Normalization::Error->new(DOMAIN_NAME_TOO_LONG => {}); | ||
return \@messages, undef; | ||
} | ||
|
||
return \@messages, $final_name; | ||
} | ||
|
||
|
||
=back | ||
=cut | ||
|
||
1; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
package Zonemaster::Engine::Normalization::Error; | ||
|
||
use strict; | ||
use warnings; | ||
|
||
use Carp; | ||
use Readonly; | ||
use Locale::TextDomain qw[Zonemaster-Engine]; | ||
|
||
use overload '""' => \&string; | ||
|
||
|
||
=head1 NAME | ||
Zonemaster::Engine::Normalization::Error - normalization error class | ||
=head1 SYNOPSIS | ||
use Zonemaster::Engine::Normalization::Error; | ||
my $error = Zonemaster::Engine::Normalization::Error->new(LABEL_TOO_LONG => {label => $label}); | ||
=cut | ||
|
||
|
||
Readonly my %ERRORS => ( | ||
AMBIGUOUS_DOWNCASING => { | ||
message => N__ 'Ambiguous downcaseing of character "{unicode_name}" in the domain name. Use all lower case instead.', | ||
arguments => [ 'unicode_name' ] | ||
}, | ||
DOMAIN_NAME_TOO_LONG => { | ||
message => N__ 'Domain name is too long (more than 253 characters with no final dot).', | ||
}, | ||
EMPTY_DOMAIN_NAME => { | ||
message => N__ 'Domain name is empty.' | ||
}, | ||
INITIAL_DOT => { | ||
message => N__ 'Domain name starts with dot.' | ||
}, | ||
INVALID_ASCII => { | ||
message => N__ 'Domain name has an ASCII label ("{label}") with a character not permitted.', | ||
arguments => [ 'label' ] | ||
}, | ||
INVALID_U_LABEL => { | ||
message => N__ 'Domain name has a non-ASCII label ("{label}") which is not a valid U-label.', | ||
arguments => [ 'label' ] | ||
}, | ||
REPEATED_DOTS => { | ||
message => N__ 'Domain name has repeated dots.' | ||
}, | ||
LABEL_TOO_LONG => { | ||
message => N__ 'Domain name has a label that is too long (more than 63 characters), "{label}".', | ||
arguments => [ 'label' ] | ||
}, | ||
); | ||
|
||
=head1 ATTRIBUTES | ||
=over | ||
=item tag | ||
The message tag associated to the error. | ||
=item params | ||
The error message parameters to use in the message string. | ||
=back | ||
=head1 METHODS | ||
=over | ||
=item new($tag, $params) | ||
Creates and returns a new error object. | ||
This function will croak if there is a missing parameter for the given tag. | ||
=cut | ||
|
||
sub new { | ||
my ( $proto, $tag, $params ) = @_; | ||
my $class = ref $proto || $proto; | ||
|
||
if ( !exists $ERRORS{$tag} ) { | ||
croak 'Unknown error tag.'; | ||
} | ||
|
||
my $obj = { tag => $tag, params => {} }; | ||
|
||
if ( exists $ERRORS{$tag}->{arguments} ) { | ||
foreach my $arg ( @{$ERRORS{$tag}->{arguments}} ) { | ||
if ( !exists $params->{$arg} ) { | ||
croak "Missing arguments $arg."; | ||
} | ||
$obj->{params}->{$arg} = $params->{$arg}; | ||
} | ||
} | ||
|
||
return bless $obj, $class; | ||
} | ||
|
||
|
||
=item message | ||
Returns the translated error message using the parameters given when creating the object. | ||
=cut | ||
|
||
sub message { | ||
my ( $self ) = @_; | ||
return __x $ERRORS{$self->{tag}}->{message}, %{$self->{params}}; | ||
} | ||
|
||
|
||
=item tag | ||
Returns the message tag asscociated to the error. | ||
=cut | ||
|
||
sub tag { | ||
my ( $self ) = @_; | ||
|
||
return $self->{tag}; | ||
} | ||
|
||
=item string | ||
Returns a string representation of the error object. Equivalent to message(). | ||
=cut | ||
|
||
sub string { | ||
my ( $self ) = @_; | ||
|
||
return $self->message; | ||
} | ||
|
||
|
||
=back | ||
=cut | ||
|
||
1; |
Oops, something went wrong.