Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement CheckHyphens #484

Merged
merged 6 commits into from
Jul 17, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ heap_size = ["heapsize"]
[dependencies]
encoding = {version = "0.2", optional = true}
heapsize = {version = ">=0.4.1, <0.5", optional = true}
idna = { version = "0.1.0", path = "./idna" }
idna = { version = "0.2.0", path = "./idna" }
matches = "0.1"
percent-encoding = { version = "1.0.0", path = "./percent_encoding" }
rustc-serialize = {version = "0.3", optional = true}
Expand Down
2 changes: 1 addition & 1 deletion idna/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "idna"
version = "0.1.5"
version = "0.2.0"
authors = ["The rust-url developers"]
description = "IDNA (Internationalizing Domain Names in Applications) and Punycode."
repository = "https://github.com/servo/rust-url/"
Expand Down
14 changes: 2 additions & 12 deletions idna/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,7 @@ pub mod uts46;
///
/// This process may fail.
pub fn domain_to_ascii(domain: &str) -> Result<String, uts46::Errors> {
uts46::to_ascii(domain, uts46::Flags {
use_std3_ascii_rules: false,
transitional_processing: false,
verify_dns_length: false,
})
uts46::Config::default().to_ascii(domain)
}

/// The [domain to Unicode](https://url.spec.whatwg.org/#concept-domain-to-unicode) algorithm.
Expand All @@ -63,11 +59,5 @@ pub fn domain_to_ascii(domain: &str) -> Result<String, uts46::Errors> {
/// This may indicate [syntax violations](https://url.spec.whatwg.org/#syntax-violation)
/// but always returns a string for the mapped domain.
pub fn domain_to_unicode(domain: &str) -> (String, Result<(), uts46::Errors>) {
uts46::to_unicode(domain, uts46::Flags {
use_std3_ascii_rules: false,

// Unused:
transitional_processing: false,
verify_dns_length: false,
})
uts46::Config::default().to_unicode(domain)
}
175 changes: 99 additions & 76 deletions idna/src/uts46.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,13 +83,13 @@ fn find_char(codepoint: char) -> &'static Mapping {
}).unwrap()
}

fn map_char(codepoint: char, flags: Flags, output: &mut String, errors: &mut Vec<Error>) {
fn map_char(codepoint: char, config: Config, output: &mut String, errors: &mut Vec<Error>) {
match *find_char(codepoint) {
Mapping::Valid => output.push(codepoint),
Mapping::Ignored => {},
Mapping::Mapped(ref slice) => output.push_str(decode_slice(slice)),
Mapping::Deviation(ref slice) => {
if flags.transitional_processing {
if config.transitional_processing {
output.push_str(decode_slice(slice))
} else {
output.push(codepoint)
Expand All @@ -100,13 +100,13 @@ fn map_char(codepoint: char, flags: Flags, output: &mut String, errors: &mut Vec
output.push(codepoint);
}
Mapping::DisallowedStd3Valid => {
if flags.use_std3_ascii_rules {
if config.use_std3_ascii_rules {
errors.push(Error::DissallowedByStd3AsciiRules);
}
output.push(codepoint)
}
Mapping::DisallowedStd3Mapped(ref slice) => {
if flags.use_std3_ascii_rules {
if config.use_std3_ascii_rules {
errors.push(Error::DissallowedMappedInStd3);
}
output.push_str(decode_slice(slice))
Expand Down Expand Up @@ -233,16 +233,16 @@ fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool {
}

/// http://www.unicode.org/reports/tr46/#Validity_Criteria
fn validate_full(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec<Error>) {
fn validate_full(label: &str, is_bidi_domain: bool, config: Config, errors: &mut Vec<Error>) {
// V1: Must be in NFC form.
if label.nfc().ne(label.chars()) {
errors.push(Error::ValidityCriteria);
} else {
validate(label, is_bidi_domain, flags, errors);
validate(label, is_bidi_domain, config, errors);
}
}

fn validate(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec<Error>) {
fn validate(label: &str, is_bidi_domain: bool, config: Config, errors: &mut Vec<Error>) {
let first_char = label.chars().next();
if first_char == None {
// Empty string, pass
Expand All @@ -253,11 +253,9 @@ fn validate(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec<Er
// NOTE: The spec says that the label must not contain a HYPHEN-MINUS character in both the
// third and fourth positions, but nobody follows this criterion. See the spec issue below:
// https://github.com/whatwg/url/issues/53
//
// TODO: Add *CheckHyphens* flag.

// V3: neither begin nor end with a U+002D HYPHEN-MINUS
else if label.starts_with("-") || label.ends_with("-") {
else if config.check_hyphens && (label.starts_with("-") || label.ends_with("-")) {
errors.push(Error::ValidityCriteria);
}

Expand All @@ -273,8 +271,8 @@ fn validate(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec<Er
// V6: Check against Mapping Table
else if label.chars().any(|c| match *find_char(c) {
Mapping::Valid => false,
Mapping::Deviation(_) => flags.transitional_processing,
Mapping::DisallowedStd3Valid => flags.use_std3_ascii_rules,
Mapping::Deviation(_) => config.transitional_processing,
Mapping::DisallowedStd3Valid => config.use_std3_ascii_rules,
_ => true,
}) {
errors.push(Error::ValidityCriteria);
Expand All @@ -294,10 +292,10 @@ fn validate(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec<Er
}

/// http://www.unicode.org/reports/tr46/#Processing
fn processing(domain: &str, flags: Flags, errors: &mut Vec<Error>) -> String {
fn processing(domain: &str, config: Config, errors: &mut Vec<Error>) -> String {
let mut mapped = String::with_capacity(domain.len());
for c in domain.chars() {
map_char(c, flags, &mut mapped, errors)
map_char(c, config, &mut mapped, errors)
}
let mut normalized = String::with_capacity(mapped.len());
normalized.extend(mapped.nfc());
Expand Down Expand Up @@ -338,26 +336,105 @@ fn processing(domain: &str, flags: Flags, errors: &mut Vec<Error>) -> String {
if label.starts_with(PUNYCODE_PREFIX) {
match punycode::decode_to_string(&label[PUNYCODE_PREFIX.len()..]) {
Some(decoded_label) => {
let flags = Flags { transitional_processing: false, ..flags };
validate_full(&decoded_label, is_bidi_domain, flags, errors);
let config = config.transitional_processing(false);
validate_full(&decoded_label, is_bidi_domain, config, errors);
validated.push_str(&decoded_label)
}
None => errors.push(Error::PunycodeError)
}
} else {
// `normalized` is already `NFC` so we can skip that check
validate(label, is_bidi_domain, flags, errors);
validate(label, is_bidi_domain, config, errors);
validated.push_str(label)
}
}
validated
}

#[derive(Copy, Clone)]
pub struct Flags {
pub use_std3_ascii_rules: bool,
pub transitional_processing: bool,
pub verify_dns_length: bool,
/// Options for the UTS #46 `to_ascii` / `to_unicode` operations.
///
/// Every flag is `false` in `Config::default()` (derived `Default` on `bool` fields).
/// Fields are private; they are set through the by-value builder methods on `Config`.
#[derive(Clone, Copy, Default)]
pub struct Config {
// When set, code points mapped as `DisallowedStd3Valid` / `DisallowedStd3Mapped`
// are reported as errors during mapping and validation.
use_std3_ascii_rules: bool,
// When set, `Deviation` code points are replaced by their mapping instead of
// being kept as-is.
transitional_processing: bool,
// When set, `to_ascii` additionally reports empty or over-long labels/domains.
verify_dns_length: bool,
// When set, labels that begin or end with U+002D HYPHEN-MINUS fail validation.
check_hyphens: bool,
}

impl Config {
#[inline]
pub fn use_std3_ascii_rules(mut self, value: bool) -> Self {
self.use_std3_ascii_rules = value;
self
}

#[inline]
pub fn transitional_processing(mut self, value: bool) -> Self {
self.transitional_processing = value;
self
}

#[inline]
pub fn verify_dns_length(mut self, value: bool) -> Self {
self.verify_dns_length = value;
self
}

#[inline]
pub fn check_hyphens(mut self, value: bool) -> Self {
self.check_hyphens = value;
self
}

/// http://www.unicode.org/reports/tr46/#ToASCII
pub fn to_ascii(self, domain: &str) -> Result<String, Errors> {
let mut errors = Vec::new();
let mut result = String::new();
let mut first = true;
for label in processing(domain, self, &mut errors).split('.') {
if !first {
result.push('.');
}
first = false;
if label.is_ascii() {
result.push_str(label);
} else {
match punycode::encode_str(label) {
Some(x) => {
result.push_str(PUNYCODE_PREFIX);
result.push_str(&x);
},
None => errors.push(Error::PunycodeError)
}
}
}

if self.verify_dns_length {
let domain = if result.ends_with(".") { &result[..result.len()-1] } else { &*result };
if domain.len() < 1 || domain.split('.').any(|label| label.len() < 1) {
errors.push(Error::TooShortForDns)
}
if domain.len() > 253 || domain.split('.').any(|label| label.len() > 63) {
errors.push(Error::TooLongForDns)
}
}
if errors.is_empty() {
Ok(result)
} else {
Err(Errors(errors))
}
}

/// http://www.unicode.org/reports/tr46/#ToUnicode
pub fn to_unicode(self, domain: &str) -> (String, Result<(), Errors>) {
let mut errors = Vec::new();
let domain = processing(domain, self, &mut errors);
let errors = if errors.is_empty() {
Ok(())
} else {
Err(Errors(errors))
};
(domain, errors)
}

}

#[derive(PartialEq, Eq, Clone, Copy, Debug)]
Expand All @@ -377,57 +454,3 @@ enum Error {
/// More details may be exposed in the future.
#[derive(Debug)]
pub struct Errors(Vec<Error>);

/// http://www.unicode.org/reports/tr46/#ToASCII
pub fn to_ascii(domain: &str, flags: Flags) -> Result<String, Errors> {
let mut errors = Vec::new();
let mut result = String::new();
let mut first = true;
for label in processing(domain, flags, &mut errors).split('.') {
if !first {
result.push('.');
}
first = false;
if label.is_ascii() {
result.push_str(label);
} else {
match punycode::encode_str(label) {
Some(x) => {
result.push_str(PUNYCODE_PREFIX);
result.push_str(&x);
},
None => errors.push(Error::PunycodeError)
}
}
}

if flags.verify_dns_length {
let domain = if result.ends_with(".") { &result[..result.len()-1] } else { &*result };
if domain.len() < 1 || domain.split('.').any(|label| label.len() < 1) {
errors.push(Error::TooShortForDns)
}
if domain.len() > 253 || domain.split('.').any(|label| label.len() > 63) {
errors.push(Error::TooLongForDns)
}
}
if errors.is_empty() {
Ok(result)
} else {
Err(Errors(errors))
}
}

/// http://www.unicode.org/reports/tr46/#ToUnicode
///
/// `flags.transitional_processing` is overridden to `false` before processing, and
/// this function itself never reads `flags.verify_dns_length`.
///
/// Always returns the processed domain; the second element is `Err` with the
/// collected errors if processing recorded any, `Ok(())` otherwise.
pub fn to_unicode(domain: &str, mut flags: Flags) -> (String, Result<(), Errors>) {
    // Whatever the caller asked for, this operation runs non-transitionally.
    flags.transitional_processing = false;

    let mut problems = Vec::new();
    let mapped = processing(domain, flags, &mut problems);
    if problems.is_empty() {
        (mapped, Ok(()))
    } else {
        (mapped, Err(Errors(problems)))
    }
}