From fe3acd0f822463a151d560db49811fc3f1b96414 Mon Sep 17 00:00:00 2001 From: Ryan Kienstra Date: Wed, 22 Jan 2020 23:14:48 -0600 Subject: [PATCH 1/9] Allow a URL to have a leading space without a validation error As Weston found, something like: would have a validation error. --- .../class-amp-tag-and-attribute-sanitizer.php | 6 +- ...nd-attribute-sanitizer-private-methods.php | 90 +++++++++++++++++-- 2 files changed, 86 insertions(+), 10 deletions(-) diff --git a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php index b635d83e4aa..4cbc52f1581 100644 --- a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php +++ b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php @@ -1626,7 +1626,7 @@ private function check_attr_spec_rule_value_regex_casei( DOMElement $node, $attr private function check_attr_spec_rule_valid_url( DOMElement $node, $attr_name, $attr_spec_rule ) { if ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ] ) && $node->hasAttribute( $attr_name ) ) { foreach ( $this->extract_attribute_urls( $node->getAttributeNode( $attr_name ) ) as $url ) { - $url = urldecode( $url ); + $url = urldecode( trim( $url ) ); // Check whether the URL is parsable. $parts = wp_parse_url( $url ); @@ -1689,7 +1689,7 @@ private function check_attr_spec_rule_allowed_protocol( DOMElement $node, $attr_ if ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOWED_PROTOCOL ] ) ) { if ( $node->hasAttribute( $attr_name ) ) { foreach ( $this->extract_attribute_urls( $node->getAttributeNode( $attr_name ) ) as $url ) { - $url_scheme = $this->parse_protocol( $url ); + $url_scheme = $this->parse_protocol( trim( $url ) ); if ( isset( $url_scheme ) && ! in_array( strtolower( $url_scheme ), $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOWED_PROTOCOL ], true ) ) { return AMP_Rule_Spec::FAIL; } @@ -1701,7 +1701,7 @@ private function check_attr_spec_rule_allowed_protocol( DOMElement $node, $attr_ foreach ( $attr_spec_rule[ AMP_Rule_Spec::ALTERNATIVE_NAMES ] as $alternative_name ) { if ( $node->hasAttribute( $alternative_name ) ) { foreach ( $this->extract_attribute_urls( $node->getAttributeNode( $alternative_name ), $attr_name ) as $url ) { - $url_scheme = $this->parse_protocol( $url ); + $url_scheme = $this->parse_protocol( trim( $url ) ); if ( isset( $url_scheme ) && ! in_array( strtolower( $url_scheme ), $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOWED_PROTOCOL ], true ) ) { return AMP_Rule_Spec::FAIL; } diff --git a/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php b/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php index 79e90d603d9..95b9472abd4 100644 --- a/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php +++ b/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php @@ -1413,6 +1413,66 @@ public function get_check_attr_spec_rule_blacklisted_value_regex() { ]; } + /** + * Gets the test data for test_check_attr_spec_rule_valid_url(). + * + * @return array The test data. + */ + public function get_check_attr_spec_rule_valid_url() { + return [ + 'no_attribute' => [ + '', + AMP_Rule_Spec::NOT_APPLICABLE, + ], + 'correct_url' => [ + '', + AMP_Rule_Spec::PASS, + ], + 'correct_url_leading_space' => [ + '', + AMP_Rule_Spec::PASS, + ], + 'non_parseable_url' => [ + '', + AMP_Rule_Spec::FAIL, + ], + 'wrong_protocol' => [ + '', + AMP_Rule_Spec::FAIL, + ], + 'wrong_host' => [ + '', + AMP_Rule_Spec::FAIL, + ], + ]; + } + + /** + * Tests check_attr_spec_rule_valid_url. + * + * @dataProvider get_check_attr_spec_rule_valid_url + * @group allowed-tags-private-methods + * @covers AMP_Tag_And_Attribute_Sanitizer::check_attr_spec_rule_valid_url() + * + * @param array $source The HTML source to test. + * @param string $expected The expected return value. + * @throws ReflectionException If it's not possible to create a reflection to call the private method. + */ + public function test_check_attr_spec_rule_valid_url( $source, $expected ) { + $node_tag_name = 'a'; + $dom = AMP_DOM_Utils::get_dom_from_content( $source ); + $sanitizer = new AMP_Tag_And_Attribute_Sanitizer( $dom ); + $node = $dom->getElementsByTagName( $node_tag_name )->item( 0 ); + $attr_name = 'baz'; + $attr_spec_rule = [ 'value_url' => [] ]; + + $this->assertEquals( + $expected, + $this->call_private_method( $sanitizer, 'check_attr_spec_rule_valid_url', [ $node, $attr_name, $attr_spec_rule ] ), + sprintf( 'using source: %s', $source ) + ); + } + /** * @dataProvider get_check_attr_spec_rule_blacklisted_value_regex * @group allowed-tags-private-methods @@ -1429,7 +1489,7 @@ public function test_check_attr_spec_rule_blacklisted_value_regex( $data, $expec public function get_check_attr_spec_rule_allowed_protocol() { return [ - 'no_attributes' => [ + 'no_attributes' => [ [ 'source' => '
', 'node_tag_name' => 'div', @@ -1438,7 +1498,7 @@ public function get_check_attr_spec_rule_allowed_protocol() { ], AMP_Rule_Spec::NOT_APPLICABLE, ], - 'protocol_pass' => [ + 'protocol_pass' => [ [ 'source' => '
', 'node_tag_name' => 'div', @@ -1454,7 +1514,23 @@ public function get_check_attr_spec_rule_allowed_protocol() { ], AMP_Rule_Spec::PASS, ], - 'protocol_multiple_pass' => [ + 'protocol_pass_leading_space' => [ + [ + 'source' => '
', + 'node_tag_name' => 'div', + 'attr_name' => 'attribute1', + 'attr_spec_rule' => [ + 'value_url' => [ + 'protocol' => [ + 'http', + 'https', + ], + ], + ], + ], + AMP_Rule_Spec::PASS, + ], + 'protocol_multiple_pass' => [ [ 'source' => '
', 'node_tag_name' => 'div', @@ -1470,7 +1546,7 @@ public function get_check_attr_spec_rule_allowed_protocol() { ], AMP_Rule_Spec::PASS, ], - 'protocol_fail' => [ + 'protocol_fail' => [ [ 'source' => '
', 'node_tag_name' => 'div', @@ -1486,7 +1562,7 @@ public function get_check_attr_spec_rule_allowed_protocol() { ], AMP_Rule_Spec::FAIL, ], - 'protocol_multiple_fail' => [ + 'protocol_multiple_fail' => [ [ 'source' => '', 'node_tag_name' => 'img', @@ -1502,7 +1578,7 @@ public function get_check_attr_spec_rule_allowed_protocol() { ], AMP_Rule_Spec::FAIL, ], - 'protocol_alternative_pass' => [ + 'protocol_alternative_pass' => [ [ 'source' => '
', 'node_tag_name' => 'div', @@ -1521,7 +1597,7 @@ public function get_check_attr_spec_rule_allowed_protocol() { ], AMP_Rule_Spec::PASS, ], - 'protocol_alternative_fail' => [ + 'protocol_alternative_fail' => [ [ 'source' => '
', 'node_tag_name' => 'div', From 9bd24e6cd6ddeba83b38714b90e573b3465d5b89 Mon Sep 17 00:00:00 2001 From: Ryan Kienstra Date: Thu, 23 Jan 2020 00:47:48 -0600 Subject: [PATCH 2/9] Replace 2 calls of trim() with longer regex in parse_protocol() Those trim( $url ) calls were passed to parse_protocol(). So do something similar inside parse_protocol(), don't match leading space. --- .../class-amp-tag-and-attribute-sanitizer.php | 8 +-- ...nd-attribute-sanitizer-private-methods.php | 55 +++++++++++++++++++ 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php index 4cbc52f1581..521dabbd024 100644 --- a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php +++ b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php @@ -1666,8 +1666,8 @@ private function check_attr_spec_rule_valid_url( DOMElement $node, $attr_name, $ * @return string|null Protocol without colon if matched. Otherwise null. */ private function parse_protocol( $url ) { - if ( preg_match( '#^[^/]+(?=:)#', $url, $matches ) ) { - return $matches[0]; + if ( preg_match( '#^\s*([^/]+)(?=:)#', $url, $matches ) ) { + return $matches[1]; } return null; } @@ -1689,7 +1689,7 @@ private function check_attr_spec_rule_allowed_protocol( DOMElement $node, $attr_ if ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOWED_PROTOCOL ] ) ) { if ( $node->hasAttribute( $attr_name ) ) { foreach ( $this->extract_attribute_urls( $node->getAttributeNode( $attr_name ) ) as $url ) { - $url_scheme = $this->parse_protocol( trim( $url ) ); + $url_scheme = $this->parse_protocol( $url ); if ( isset( $url_scheme ) && ! in_array( strtolower( $url_scheme ), $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOWED_PROTOCOL ], true ) ) { return AMP_Rule_Spec::FAIL; } @@ -1701,7 +1701,7 @@ private function check_attr_spec_rule_allowed_protocol( DOMElement $node, $attr_ foreach ( $attr_spec_rule[ AMP_Rule_Spec::ALTERNATIVE_NAMES ] as $alternative_name ) { if ( $node->hasAttribute( $alternative_name ) ) { foreach ( $this->extract_attribute_urls( $node->getAttributeNode( $alternative_name ), $attr_name ) as $url ) { - $url_scheme = $this->parse_protocol( trim( $url ) ); + $url_scheme = $this->parse_protocol( $url ); if ( isset( $url_scheme ) && ! in_array( strtolower( $url_scheme ), $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOWED_PROTOCOL ], true ) ) { return AMP_Rule_Spec::FAIL; } diff --git a/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php b/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php index 95b9472abd4..cbda151ac9c 100644 --- a/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php +++ b/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php @@ -1619,6 +1619,61 @@ public function get_check_attr_spec_rule_allowed_protocol() { ]; } + /** + * Gets the test data for test_parse_protocol(). + * + * @return array The test data. + */ + public function get_parse_protocol_data() { + return [ + 'empty_string' => [ + '', + false, + ], + 'only_space' => [ + ' ', + false, + ], + 'traditional_https' => [ + 'https://example.com', + 'https', + ], + 'leading_space' => [ + ' https://foo', + 'https', + ], + 'trailing_space' => [ + 'https://foo.com ', + 'https', + ], + 'no_colon' => [ + '//image.png ', + false, + ], + ]; + } + + /** + * Tests parse_protocol. + * + * @dataProvider get_parse_protocol_data + * @group allowed-tags-private-methods + * @covers AMP_Tag_And_Attribute_Sanitizer::parse_protocol() + * + * @param array $url The URL to parse. + * @param string $expected The expected return value. + * @throws ReflectionException If it's not possible to create a reflection to call the private method. + */ + public function test_parse_protocol( $url, $expected ) { + $dom = AMP_DOM_Utils::get_dom_from_content( '' ); + $sanitizer = new AMP_Tag_And_Attribute_Sanitizer( $dom ); + + $this->assertEquals( + $expected, + $this->call_private_method( $sanitizer, 'parse_protocol', [ $url ] ) + ); + } + /** * @dataProvider get_check_attr_spec_rule_allowed_protocol * @group allowed-tags-private-methods From a01b51f2f1b388e1608d9eb30b04bbbfac610af6 Mon Sep 17 00:00:00 2001 From: Ryan Kienstra Date: Thu, 23 Jan 2020 01:16:32 -0600 Subject: [PATCH 3/9] As there's now a capturing group, remove lookahead There is a matching group of ([^/]), so there probably isn't a need for the lookahead. --- includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php | 2 +- .../test-amp-tag-and-attribute-sanitizer-private-methods.php | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php index 521dabbd024..d9dd93ac84f 100644 --- a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php +++ b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php @@ -1666,7 +1666,7 @@ private function check_attr_spec_rule_valid_url( DOMElement $node, $attr_name, $ * @return string|null Protocol without colon if matched. Otherwise null. */ private function parse_protocol( $url ) { - if ( preg_match( '#^\s*([^/]+)(?=:)#', $url, $matches ) ) { + if ( preg_match( '#^\s*([^/]+):#', $url, $matches ) ) { return $matches[1]; } return null; diff --git a/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php b/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php index cbda151ac9c..4d292a4b856 100644 --- a/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php +++ b/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php @@ -1650,6 +1650,10 @@ public function get_parse_protocol_data() { '//image.png ', false, ], + 'two_colons' => [ + 'foo:baz://image.png ', + 'foo:baz', + ], ]; } From 761981e1836126d2591675acea4c0b1b296e18c7 Mon Sep 17 00:00:00 2001 From: Ryan Kienstra Date: Thu, 23 Jan 2020 11:24:43 -0600 Subject: [PATCH 4/9] Revert change to regex, in favor of trimming before it's passed Instead of changing this, simply trim() the URL before passing it to parse_protocol() --- .../sanitizers/class-amp-tag-and-attribute-sanitizer.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php index d9dd93ac84f..4cbc52f1581 100644 --- a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php +++ b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php @@ -1666,8 +1666,8 @@ private function check_attr_spec_rule_valid_url( DOMElement $node, $attr_name, $ * @return string|null Protocol without colon if matched. Otherwise null. */ private function parse_protocol( $url ) { - if ( preg_match( '#^\s*([^/]+):#', $url, $matches ) ) { - return $matches[1]; + if ( preg_match( '#^[^/]+(?=:)#', $url, $matches ) ) { + return $matches[0]; } return null; } @@ -1689,7 +1689,7 @@ private function check_attr_spec_rule_allowed_protocol( DOMElement $node, $attr_ if ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOWED_PROTOCOL ] ) ) { if ( $node->hasAttribute( $attr_name ) ) { foreach ( $this->extract_attribute_urls( $node->getAttributeNode( $attr_name ) ) as $url ) { - $url_scheme = $this->parse_protocol( $url ); + $url_scheme = $this->parse_protocol( trim( $url ) ); if ( isset( $url_scheme ) && ! in_array( strtolower( $url_scheme ), $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOWED_PROTOCOL ], true ) ) { return AMP_Rule_Spec::FAIL; } @@ -1701,7 +1701,7 @@ private function check_attr_spec_rule_allowed_protocol( DOMElement $node, $attr_ foreach ( $attr_spec_rule[ AMP_Rule_Spec::ALTERNATIVE_NAMES ] as $alternative_name ) { if ( $node->hasAttribute( $alternative_name ) ) { foreach ( $this->extract_attribute_urls( $node->getAttributeNode( $alternative_name ), $attr_name ) as $url ) { - $url_scheme = $this->parse_protocol( $url ); + $url_scheme = $this->parse_protocol( trim( $url ) ); if ( isset( $url_scheme ) && ! in_array( strtolower( $url_scheme ), $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOWED_PROTOCOL ], true ) ) { return AMP_Rule_Spec::FAIL; } From bd4f45f17e70859845bc28d3496f8abb6642ffb3 Mon Sep 17 00:00:00 2001 From: Ryan Kienstra Date: Thu, 23 Jan 2020 11:34:03 -0600 Subject: [PATCH 5/9] Move urldecode() lower in the function As Weston mentioned, there might not be a need for it to run earlier. --- includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php index 4cbc52f1581..c73e2fc36bc 100644 --- a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php +++ b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php @@ -1626,7 +1626,7 @@ private function check_attr_spec_rule_value_regex_casei( DOMElement $node, $attr private function check_attr_spec_rule_valid_url( DOMElement $node, $attr_name, $attr_spec_rule ) { if ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ] ) && $node->hasAttribute( $attr_name ) ) { foreach ( $this->extract_attribute_urls( $node->getAttributeNode( $attr_name ) ) as $url ) { - $url = urldecode( trim( $url ) ); + $url = trim( $url ); // Check whether the URL is parsable. $parts = wp_parse_url( $url ); @@ -1644,7 +1644,7 @@ private function check_attr_spec_rule_valid_url( DOMElement $node, $attr_name, $ } // Check if the host contains invalid chars (hostCharIsValid: https://github.com/ampproject/amphtml/blob/af1e3a550feeafd732226202b8d1f26dcefefa18/validator/engine/parse-url.js#L62-L103). - $host = wp_parse_url( $url, PHP_URL_HOST ); + $host = wp_parse_url( urldecode( $url ), PHP_URL_HOST ); if ( $host && preg_match( '/[!"#$%&\'()*+,\/:;<=>?@[\]^`{|}~\s]/', $host ) ) { return AMP_Rule_Spec::FAIL; } From c703052bf557b95d87b3ed8eec41218cf751fe3a Mon Sep 17 00:00:00 2001 From: Ryan Kienstra Date: Thu, 23 Jan 2020 11:59:27 -0600 Subject: [PATCH 6/9] Remove test case that is no longer handled parse_protocol() no longer trims space on its own. --- .../test-amp-tag-and-attribute-sanitizer-private-methods.php | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php b/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php index 4d292a4b856..c25a543f7bb 100644 --- a/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php +++ b/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php @@ -1638,10 +1638,6 @@ public function get_parse_protocol_data() { 'https://example.com', 'https', ], - 'leading_space' => [ - ' https://foo', - 'https', - ], 'trailing_space' => [ 'https://foo.com ', 'https', From 911602d04c8fff4f73a382ba9ac0c600e0bdfe1b Mon Sep 17 00:00:00 2001 From: Ryan Kienstra Date: Thu, 23 Jan 2020 19:47:56 -0600 Subject: [PATCH 7/9] Add a new method to normalize URLs Use this instead of trim() for the URLs in attributes. --- .../class-amp-tag-and-attribute-sanitizer.php | 16 +++- ...nd-attribute-sanitizer-private-methods.php | 81 +++++++++++++++++++ .../php/test-tag-and-attribute-sanitizer.php | 22 +++-- 3 files changed, 103 insertions(+), 16 deletions(-) diff --git a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php index c73e2fc36bc..a481db5d81b 100644 --- a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php +++ b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php @@ -1626,7 +1626,7 @@ private function check_attr_spec_rule_value_regex_casei( DOMElement $node, $attr private function check_attr_spec_rule_valid_url( DOMElement $node, $attr_name, $attr_spec_rule ) { if ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ] ) && $node->hasAttribute( $attr_name ) ) { foreach ( $this->extract_attribute_urls( $node->getAttributeNode( $attr_name ) ) as $url ) { - $url = trim( $url ); + $url = $this->normalize_url_from_attribute_value( $url ); // Check whether the URL is parsable. $parts = wp_parse_url( $url ); @@ -1672,6 +1672,16 @@ private function parse_protocol( $url ) { return null; } + /** + * Normalize a URL that appeared as a tag attribute. + * + * @param string $url The URL to normalize. + * @return string The normalized URL. + */ + private function normalize_url_from_attribute_value( $url ) { + return preg_replace( '/\s/', '', $url ); + } + /** * Check if attribute has a protocol value rule determine if it matches. * @@ -1689,7 +1699,7 @@ private function check_attr_spec_rule_allowed_protocol( DOMElement $node, $attr_ if ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOWED_PROTOCOL ] ) ) { if ( $node->hasAttribute( $attr_name ) ) { foreach ( $this->extract_attribute_urls( $node->getAttributeNode( $attr_name ) ) as $url ) { - $url_scheme = $this->parse_protocol( trim( $url ) ); + $url_scheme = $this->parse_protocol( $this->normalize_url_from_attribute_value( $url ) ); if ( isset( $url_scheme ) && ! in_array( strtolower( $url_scheme ), $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOWED_PROTOCOL ], true ) ) { return AMP_Rule_Spec::FAIL; } @@ -1701,7 +1711,7 @@ private function check_attr_spec_rule_allowed_protocol( DOMElement $node, $attr_ foreach ( $attr_spec_rule[ AMP_Rule_Spec::ALTERNATIVE_NAMES ] as $alternative_name ) { if ( $node->hasAttribute( $alternative_name ) ) { foreach ( $this->extract_attribute_urls( $node->getAttributeNode( $alternative_name ), $attr_name ) as $url ) { - $url_scheme = $this->parse_protocol( trim( $url ) ); + $url_scheme = $this->parse_protocol( $this->normalize_url_from_attribute_value( $url ) ); if ( isset( $url_scheme ) && ! in_array( strtolower( $url_scheme ), $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOWED_PROTOCOL ], true ) ) { return AMP_Rule_Spec::FAIL; } diff --git a/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php b/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php index c25a543f7bb..2b3b294c02b 100644 --- a/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php +++ b/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php @@ -1674,6 +1674,87 @@ public function test_parse_protocol( $url, $expected ) { ); } + /** + * Gets the test data for test_normalize_url_from_attribute_value(). + * + * @return array The test data. + */ + public function get_normalize_url_data() { + $normalized_url = 'https://example.com'; + + return [ + 'nothing_to_remove' => [ + 'https://example.com', + ], + 'empty_string' => [ + '', + ], + 'only_space' => [ + ' ', + '', + ], + 'leading_space' => [ + ' https://example.com', + $normalized_url, + ], + 'leading_tab' => [ + "\thttps://example.com", + $normalized_url, + ], + 'trailing_linefeed' => [ + "https://example.com \n", + $normalized_url, + ], + 'trailing_space' => [ + 'https://example.com ', + $normalized_url, + ], + 'enclosed_in_spaces' => [ + ' https://example.com ', + $normalized_url, + ], + 'space_inside' => [ + ' https: //exam ple.com ', + $normalized_url, + ], + 'tabs_inside' => [ + "https:\t//exam\tple.com ", + $normalized_url, + ], + 'leading_slashes' => [ + '//example.com', + ], + 'url_encoded_space_not_removed' => [ + 'https://example.com?foo=++baz', + ], + ]; + } + + /** + * Tests normalize_url_from_attribute_value. + * + * @dataProvider get_normalize_url_data + * @group allowed-tags-private-methods + * @covers AMP_Tag_And_Attribute_Sanitizer::normalize_url_from_attribute_value() + * + * @param array $url The URL to normalize. + * @param string|null $expected The expected return value. + * @throws ReflectionException If it's not possible to create a reflection to call the private method. + */ + public function test_normalize_url_from_attribute_value( $url, $expected = null ) { + if ( null === $expected ) { + $expected = $url; + } + + $dom = AMP_DOM_Utils::get_dom_from_content( '' ); + $sanitizer = new AMP_Tag_And_Attribute_Sanitizer( $dom ); + + $this->assertEquals( + $expected, + $this->call_private_method( $sanitizer, 'normalize_url_from_attribute_value', [ $url ] ) + ); + } + /** * @dataProvider get_check_attr_spec_rule_allowed_protocol * @group allowed-tags-private-methods diff --git a/tests/php/test-tag-and-attribute-sanitizer.php b/tests/php/test-tag-and-attribute-sanitizer.php index a453606f29e..a1ea5c4363c 100644 --- a/tests/php/test-tag-and-attribute-sanitizer.php +++ b/tests/php/test-tag-and-attribute-sanitizer.php @@ -1256,27 +1256,23 @@ static function() { [ AMP_Tag_And_Attribute_Sanitizer::INVALID_URL_PROTOCOL ], ], - 'a_with_wrong_host' => [ + 'a_with_space_in_url' => [ 'value', - 'value', - [], - [ AMP_Tag_And_Attribute_Sanitizer::INVALID_URL ], + 'value', ], 'a_with_encoded_host' => [ 'value', null, ], - 'a_with_wrong_schemeless_host' => [ - 'value', - 'value', - [], - [ AMP_Tag_And_Attribute_Sanitizer::INVALID_URL ], + 'a_with_spaces_inside' => [ + 'value', + 'value', + ], + 'a_with_schemaless_host' => [ + 'value', ], 'a_with_mail_host' => [ - 'value', - 'value', - [], - [ AMP_Tag_And_Attribute_Sanitizer::INVALID_URL_PROTOCOL ], + 'value', ], // font is removed so we should check that other elements are checked as well. From f19301917f553dd7cc66d593d2fa0e6a02160e49 Mon Sep 17 00:00:00 2001 From: Ryan Kienstra Date: Thu, 23 Jan 2020 22:37:54 -0600 Subject: [PATCH 8/9] Use suggested normalization, update tests Use this from the AMP validator, instead of a simple regex, also use trim(). --- .../class-amp-tag-and-attribute-sanitizer.php | 2 +- ...-tag-and-attribute-sanitizer-private-methods.php | 2 +- tests/php/test-tag-and-attribute-sanitizer.php | 13 +++++++------ 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php index a481db5d81b..6786a213df1 100644 --- a/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php +++ b/includes/sanitizers/class-amp-tag-and-attribute-sanitizer.php @@ -1679,7 +1679,7 @@ private function parse_protocol( $url ) { * @return string The normalized URL. */ private function normalize_url_from_attribute_value( $url ) { - return preg_replace( '/\s/', '', $url ); + return preg_replace( '/[\t\r\n]/', '', trim( $url ) ); } /** diff --git a/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php b/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php index 2b3b294c02b..6f825713a40 100644 --- a/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php +++ b/tests/php/test-amp-tag-and-attribute-sanitizer-private-methods.php @@ -1715,7 +1715,7 @@ public function get_normalize_url_data() { ], 'space_inside' => [ ' https: //exam ple.com ', - $normalized_url, + 'https: //exam ple.com', ], 'tabs_inside' => [ "https:\t//exam\tple.com ", diff --git a/tests/php/test-tag-and-attribute-sanitizer.php b/tests/php/test-tag-and-attribute-sanitizer.php index a1ea5c4363c..640b0f4ac8c 100644 --- a/tests/php/test-tag-and-attribute-sanitizer.php +++ b/tests/php/test-tag-and-attribute-sanitizer.php @@ -1258,21 +1258,22 @@ static function() { 'a_with_space_in_url' => [ 'value', - 'value', + 'value', + [], + [ AMP_Tag_And_Attribute_Sanitizer::INVALID_URL ], ], 'a_with_encoded_host' => [ 'value', null, ], - 'a_with_spaces_inside' => [ - 'value', - 'value', - ], 'a_with_schemaless_host' => [ 'value', ], 'a_with_mail_host' => [ - 'value', + 'value', + 'value', + [], + [ AMP_Tag_And_Attribute_Sanitizer::INVALID_URL_PROTOCOL ], ], // font is removed so we should check that other elements are checked as well. From 92bb4e6a6e1e5e102b8cf801cad1b48646ffe545 Mon Sep 17 00:00:00 2001 From: Ryan Kienstra Date: Thu, 23 Jan 2020 22:49:15 -0600 Subject: [PATCH 9/9] Revert change in some tests, as they're not needed anymore The behavior of the new parse_url_ method is changed, so these can change back. --- tests/php/test-tag-and-attribute-sanitizer.php | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/php/test-tag-and-attribute-sanitizer.php b/tests/php/test-tag-and-attribute-sanitizer.php index 640b0f4ac8c..a453606f29e 100644 --- a/tests/php/test-tag-and-attribute-sanitizer.php +++ b/tests/php/test-tag-and-attribute-sanitizer.php @@ -1256,7 +1256,7 @@ static function() { [ AMP_Tag_And_Attribute_Sanitizer::INVALID_URL_PROTOCOL ], ], - 'a_with_space_in_url' => [ + 'a_with_wrong_host' => [ 'value', 'value', [], @@ -1266,8 +1266,11 @@ static function() { 'value', null, ], - 'a_with_schemaless_host' => [ - 'value', + 'a_with_wrong_schemeless_host' => [ + 'value', + 'value', + [], + [ AMP_Tag_And_Attribute_Sanitizer::INVALID_URL ], ], 'a_with_mail_host' => [ 'value',