-
Notifications
You must be signed in to change notification settings - Fork 384
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
added host validation for urls #983
Changes from 8 commits
a39e073
843760a
f78ac49
4f6dcc2
b8fed2e
5a6ae74
2067992
331d152
454b285
7a3e497
ea52b1e
aca760d
3ced093
790dec8
8e6698b
a04782a
3604c6e
0d9d50a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -699,6 +699,19 @@ private function validate_attr_spec_list_for_node( $node, $attr_spec_list ) { | |
} | ||
} | ||
|
||
/* | ||
* If given attribute's value is a URL with a host, the host must | ||
* be valid | ||
*/ | ||
if ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ] ) ) { | ||
$result = $this->check_attr_spec_rule_valid_url( $node, $attr_name, $attr_spec_rule ); | ||
if ( AMP_Rule_Spec::PASS === $result ) { | ||
$score++; | ||
} elseif ( AMP_Rule_Spec::FAIL === $result ) { | ||
return 0; | ||
} | ||
} | ||
|
||
/* | ||
* If the given attribute's value is *not* a relative path, and the rule's | ||
* value is `false`, then pass. | ||
|
@@ -877,6 +890,9 @@ private function delegated_sanitize_disallowed_attribute_values_in_node( $node, | |
} elseif ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOWED_PROTOCOL ] ) && | ||
AMP_Rule_Spec::FAIL === $this->check_attr_spec_rule_allowed_protocol( $node, $attr_name, $attr_spec_rule ) ) { | ||
$should_remove_node = true; | ||
} elseif ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ] ) && | ||
AMP_Rule_Spec::FAIL === $this->check_attr_spec_rule_valid_url( $node, $attr_name, $attr_spec_rule ) ) { | ||
$should_remove_node = true; | ||
} elseif ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOW_RELATIVE ] ) && | ||
AMP_Rule_Spec::FAIL === $this->check_attr_spec_rule_disallowed_relative( $node, $attr_name, $attr_spec_rule ) ) { | ||
$should_remove_node = true; | ||
|
@@ -1122,6 +1138,49 @@ private function check_attr_spec_rule_value_regex_casei( $node, $attr_name, $att | |
return AMP_Rule_Spec::NOT_APPLICABLE; | ||
} | ||
|
||
/** | ||
* Check if attribute has a valid host value | ||
* | ||
* @since 0.7 | ||
* | ||
* @param DOMElement $node Node. | ||
* @param string $attr_name Attribute name. | ||
* @param array[]|string[] $attr_spec_rule Attribute spec rule. | ||
* | ||
* @return string: | ||
* - AMP_Rule_Spec::PASS - $attr_name has a value that matches the rule. | ||
* - AMP_Rule_Spec::FAIL - $attr_name has a value that does *not* match rule. | ||
* - AMP_Rule_Spec::NOT_APPLICABLE - $attr_name does not exist or there | ||
* is no rule for this attribute. | ||
*/ | ||
private function check_attr_spec_rule_valid_url( $node, $attr_name, $attr_spec_rule ) { | ||
if ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ] ) ) { | ||
if ( $node->hasAttribute( $attr_name ) ) { | ||
$attr_value = $node->getAttribute( $attr_name ); | ||
$attr_value = preg_replace( '/\s*,\s*/', ',', $attr_value ); | ||
$urls_to_test = explode( ',', $attr_value ); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm curious about the comma-separated list of URLs. Naturally this shouldn't be allowed in an There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can also combine these two lines into: $urls_to_test = preg_split( '/\s*,\s*/', $attr_value ); There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. an image with a srcset attribute would have a list of comma separated urls, so i thought it could be a good idea to check all of them |
||
|
||
foreach ( $urls_to_test as $url ) { | ||
// Check if the host contains invalid chars. | ||
$url_host = wp_parse_url( urldecode( $url ), PHP_URL_HOST ); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good use of |
||
if ( $url_host && preg_match( '/[!"#$%&\'()*+,\/:;<=>?@[\]^`{|}~\s]/i', $url_host ) ) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I should think that a whitelist should be used rather than a blacklist here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tend to think like you, but the amp validator is doing it this way, looking for blacklisted chars, so i thought it would be better to follow the same approach. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So this is supposed to match everything matched by |
||
return AMP_Rule_Spec::FAIL; | ||
} | ||
|
||
// Check if the protocol contains invalid chars. | ||
$url_scheme = AMP_WP_Utils::parse_url( $url, PHP_URL_SCHEME ); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use |
||
if ( $url_scheme && preg_match( '/[!"#$%&\'()*+,\/:;<=>?@[\]^`{|}~\s]/i', $url_scheme ) ) { | ||
return AMP_Rule_Spec::FAIL; | ||
} | ||
} | ||
|
||
return AMP_Rule_Spec::PASS; | ||
} | ||
} | ||
|
||
return AMP_Rule_Spec::NOT_APPLICABLE; | ||
} | ||
|
||
/** | ||
* Check if attribute has a protocol value rule determine if it matches. | ||
* | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -680,6 +680,15 @@ public function get_body_data() { | |
'<a class="foo" href="">value</a>', | ||
), | ||
|
||
'a_with_wrong_host' => array( | ||
'<a class="foo" href="http://foo bar">value</a>', | ||
'<a class="foo" href="">value</a>', | ||
), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably should add a test here for scheme-less URLs, like: |
||
'a_with_mail_host' => array( | ||
'<a class="foo" href="mail to:[email protected]">value</a>', | ||
'<a class="foo" href="">value</a>', | ||
), | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add test cases for examples found in #977 (comment) |
||
// font is removed so we should check that other elements are checked as well. | ||
'font_with_other_bad_elements' => array( | ||
'<font size="1">Headline</font><span style="color: blue">Span</span>', | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add a
@since 0.7
here