From 9449efda86014d29c4074be1b1923b44607aaf40 Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Wed, 12 May 2021 16:28:54 +0100 Subject: [PATCH 01/34] Add basic regex to grab site icon --- lib/class-wp-rest-url-details-controller.php | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index 3ce15da632aa57..b4a9acd52a006c 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -119,6 +119,7 @@ public function parse_url_details( $request ) { $data = $this->add_additional_fields_to_object( array( 'title' => $this->get_title( $remote_url_response ), + 'icon' => $this->get_icon( $remote_url_response ), ), $request ); @@ -215,6 +216,20 @@ private function get_title( $html ) { return $title; } + /** + * Parses the contents from the provided HTML + * + * @param string $html the HTML from the remote website at URL. + * @return string the title tag contents (maybe empty). + */ + private function get_icon( $html ) { + preg_match( '/<link.*href="(.*\.ico).*".*\/>/i', $html, $matches ); + + $icon = isset( $matches[1] ) ? trim( $matches[1] ) : ''; + + return $icon; + } + /** * Utility function to build cache key for a given URL. 
* From e381228bd66b65a71bb05e4bb64d8f82c3bf0f77 Mon Sep 17 00:00:00 2001 From: Dave Smith <getdavemail@gmail.com> Date: Wed, 19 May 2021 10:36:13 +0100 Subject: [PATCH 02/34] Retrieve meta description --- lib/class-wp-rest-url-details-controller.php | 38 ++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index b4a9acd52a006c..4dd85a989e1c4d 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -118,8 +118,9 @@ public function parse_url_details( $request ) { $data = $this->add_additional_fields_to_object( array( - 'title' => $this->get_title( $remote_url_response ), - 'icon' => $this->get_icon( $remote_url_response ), + 'title' => $this->get_title( $remote_url_response ), + 'icon' => $this->get_icon( $remote_url_response ), + 'description' => $this->get_description( $remote_url_response ), ), $request ); @@ -230,6 +231,39 @@ private function get_icon( $html ) { return $icon; } + /** + * Parses the meta description from the provide HTML. + * + * @param string $html the HTML from the remote website at URL. + * @return string the title tag contents (maybe empty). + */ + private function get_description( $html ) { + $description = ''; + + $temp = tmpfile(); + + if ( ! $temp ) { + return $description; + } + + $path = stream_get_meta_data( $temp )['uri']; + + // Write HTML + fwrite( $temp, $html ); + + $meta = get_meta_tags( $path ); + + if ( empty( $meta ) ) { + return $description; + } + + fclose( $temp ); // clean up tmp file + + $description = ! empty( $meta['description'] ) ? $meta['description'] : ''; + + return $description; + } + /** * Utility function to build cache key for a given URL. 
* From d6fd2843994a6c384a09131dd9e863b20dc35126 Mon Sep 17 00:00:00 2001 From: Dave Smith <getdavemail@gmail.com> Date: Wed, 19 May 2021 10:37:07 +0100 Subject: [PATCH 03/34] Ensure cleanup --- lib/class-wp-rest-url-details-controller.php | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index 4dd85a989e1c4d..58b28fda7eca8d 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -243,6 +243,7 @@ private function get_description( $html ) { $temp = tmpfile(); if ( ! $temp ) { + fclose( $temp ); // clean up tmp file return $description; } @@ -254,13 +255,13 @@ private function get_description( $html ) { $meta = get_meta_tags( $path ); if ( empty( $meta ) ) { + fclose( $temp ); // clean up tmp file return $description; } - fclose( $temp ); // clean up tmp file - $description = ! empty( $meta['description'] ) ? $meta['description'] : ''; + fclose( $temp ); // clean up tmp file return $description; } From a26abba8c90c5099cb3cbbfa2db49d9d988986c5 Mon Sep 17 00:00:00 2001 From: Dave Smith <getdavemail@gmail.com> Date: Wed, 19 May 2021 10:49:06 +0100 Subject: [PATCH 04/34] Improve title regex to account for possible attributes on title --- lib/class-wp-rest-url-details-controller.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index 58b28fda7eca8d..d2f7a605b4a990 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -210,7 +210,7 @@ private function get_remote_url( $url ) { * @return string the title tag contents (maybe empty). */ private function get_title( $html ) { - preg_match( '|<title>([^<]*?)|is', $html, $match_title ); + preg_match( '|<\s*title[^>]*>(.*?)<\s*/\s*title>|is', $html, $match_title ); $title = isset( $match_title[1] ) ? 
trim( $match_title[1] ) : ''; From 308fb7c1a795ca8e8a5bdc42901fbc01085417a0 Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Wed, 19 May 2021 11:20:10 +0100 Subject: [PATCH 05/34] Retrieve OG Image --- lib/class-wp-rest-url-details-controller.php | 27 +++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index d2f7a605b4a990..867ea1804e884a 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -121,6 +121,7 @@ public function parse_url_details( $request ) { 'title' => $this->get_title( $remote_url_response ), 'icon' => $this->get_icon( $remote_url_response ), 'description' => $this->get_description( $remote_url_response ), + 'image' => $this->get_image( $remote_url_response ), ), $request ); @@ -212,30 +213,30 @@ private function get_remote_url( $url ) { private function get_title( $html ) { preg_match( '|<\s*title[^>]*>(.*?)<\s*/\s*title>|is', $html, $match_title ); - $title = isset( $match_title[1] ) ? trim( $match_title[1] ) : ''; + $title = isset( $match_title[1] ) && is_string( $match_title[1] ) ? trim( $match_title[1] ) : ''; return $title; } /** - * Parses the contents from the provided HTML + * Parses the site icon from the provided HTML * * @param string $html the HTML from the remote website at URL. - * @return string the title tag contents (maybe empty). + * @return string the icon URI (maybe empty). */ private function get_icon( $html ) { preg_match( '/<link.*href="(.*\.ico).*".*\/>/i', $html, $matches ); - $icon = isset( $matches[1] ) ? trim( $matches[1] ) : ''; + $icon = isset( $matches[1] ) && is_string( $matches[1] ) ? trim( $matches[1] ) : ''; return $icon; } /** - * Parses the meta description from the provide HTML. + * Parses the meta description from the provided HTML. * * @param string $html the HTML from the remote website at URL. 
- * @return string the title tag contents (maybe empty). + * @return string the meta description contents (maybe empty). */ private function get_description( $html ) { $description = ''; @@ -265,6 +266,20 @@ private function get_description( $html ) { return $description; } + /** + * Parses the Open Graph Image from the provided HTML. + * + * @param string $html the HTML from the remote website at URL. + * @return string the OG image (maybe empty). + */ + private function get_image( $html ) { + preg_match( '|<meta.*?property="og:image.*?".*?content="(.*?)".*?\/?>|is', $html, $matches ); + + $image = isset( $matches[1] ) && is_string( $matches[1] ) ? trim( $matches[1] ) : ''; + + return $image; + } + /** * Utility function to build cache key for a given URL. * From 334495c69bd59f9f856d866a711b420e7049ef3f Mon Sep 17 00:00:00 2001 From: Dave Smith <getdavemail@gmail.com> Date: Wed, 19 May 2021 11:46:34 +0100 Subject: [PATCH 06/34] Fix linting --- lib/class-wp-rest-url-details-controller.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index 867ea1804e884a..488211e9ba1995 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -244,25 +244,25 @@ private function get_description( $html ) { $temp = tmpfile(); if ( ! $temp ) { - fclose( $temp ); // clean up tmp file + fclose( $temp ); // clean up tmp file. return $description; } $path = stream_get_meta_data( $temp )['uri']; - // Write HTML + // Write HTML. fwrite( $temp, $html ); $meta = get_meta_tags( $path ); if ( empty( $meta ) ) { - fclose( $temp ); // clean up tmp file + fclose( $temp ); // clean up tmp file. return $description; } $description = ! empty( $meta['description'] ) ? $meta['description'] : ''; - fclose( $temp ); // clean up tmp file + fclose( $temp ); // clean up tmp file. 
return $description; } From 5c0e553cf0f21193c663e8b27b947016ed35b121 Mon Sep 17 00:00:00 2001 From: Dave Smith <getdavemail@gmail.com> Date: Wed, 19 May 2021 11:46:46 +0100 Subject: [PATCH 07/34] Fix tests to assert on array subset --- phpunit/class-wp-rest-url-details-controller-test.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/phpunit/class-wp-rest-url-details-controller-test.php b/phpunit/class-wp-rest-url-details-controller-test.php index 8ff107cab34af3..b45f362d4a8ee8 100644 --- a/phpunit/class-wp-rest-url-details-controller-test.php +++ b/phpunit/class-wp-rest-url-details-controller-test.php @@ -126,7 +126,7 @@ public function test_get_items() { // Note the <title> comes from the fixture HTML returned by // the filter `pre_http_request`. - $this->assertEquals( + $this->assertArraySubset( array( 'title' => 'Example Website — - with encoded content.', ), @@ -385,7 +385,7 @@ function( $response ) { // Instead of the default data retrieved we expect to see the modified // data we provided via the filter. - $this->assertEquals( + $this->assertArraySubset( array( 'title' => 'Example Website — - with encoded content.', 'og_title' => 'This was manually added to the data via filter', From c38854cde2eab9e16092c38d1b223d1620ac7f22 Mon Sep 17 00:00:00 2001 From: Dave Smith <getdavemail@gmail.com> Date: Wed, 19 May 2021 11:47:01 +0100 Subject: [PATCH 08/34] Enhance fixture data with more edge cases --- phpunit/fixtures/example-website.html | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/phpunit/fixtures/example-website.html b/phpunit/fixtures/example-website.html index f493c0deafd077..a9a3ab8be2e916 100644 --- a/phpunit/fixtures/example-website.html +++ b/phpunit/fixtures/example-website.html @@ -2,7 +2,7 @@ <html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en-US"> <head> <meta charset="utf-8" /> -<title>Example Website — - with encoded content. +Example Website — - with encoded content. @@ -14,10 +14,11 @@ +

Example Website

-

loreLorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

From a870d2e49f7f2b546cf69f5c7e8d1b820122e2fb Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Wed, 19 May 2021 12:13:35 +0100 Subject: [PATCH 09/34] Add tests to ensure new properties are captured for icon, description and image. --- lib/class-wp-rest-url-details-controller.php | 8 ++++---- phpunit/class-wp-rest-url-details-controller-test.php | 9 ++++++--- phpunit/fixtures/example-website.html | 3 ++- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index 488211e9ba1995..867ea1804e884a 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -244,25 +244,25 @@ private function get_description( $html ) { $temp = tmpfile(); if ( ! $temp ) { - fclose( $temp ); // clean up tmp file. + fclose( $temp ); // clean up tmp file return $description; } $path = stream_get_meta_data( $temp )['uri']; - // Write HTML. + // Write HTML fwrite( $temp, $html ); $meta = get_meta_tags( $path ); if ( empty( $meta ) ) { - fclose( $temp ); // clean up tmp file. + fclose( $temp ); // clean up tmp file return $description; } $description = ! empty( $meta['description'] ) ? $meta['description'] : ''; - fclose( $temp ); // clean up tmp file. + fclose( $temp ); // clean up tmp file return $description; } diff --git a/phpunit/class-wp-rest-url-details-controller-test.php b/phpunit/class-wp-rest-url-details-controller-test.php index b45f362d4a8ee8..6fa3054a60ce2e 100644 --- a/phpunit/class-wp-rest-url-details-controller-test.php +++ b/phpunit/class-wp-rest-url-details-controller-test.php @@ -124,11 +124,14 @@ public function test_get_items() { $response = rest_get_server()->dispatch( $request ); $data = $response->get_data(); - // Note the comes from the fixture HTML returned by - // the filter `pre_http_request`. 
+ // Note the data in the subset comes from the fixture HTML returned by + // the filter `pre_http_request` (see this class's `setUp` method). $this->assertArraySubset( array( - 'title' => 'Example Website — - with encoded content.', + 'title' => 'Example Website — - with encoded content.', + 'icon' => '//s.w.org/favicon.ico', + 'description' => 'Example description text here. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore.', + 'image' => 'https://s.w.org/images/home/screen-themes.png?3', ), $data ); diff --git a/phpunit/fixtures/example-website.html b/phpunit/fixtures/example-website.html index a9a3ab8be2e916..a84a741a758aeb 100644 --- a/phpunit/fixtures/example-website.html +++ b/phpunit/fixtures/example-website.html @@ -4,10 +4,11 @@ <meta charset="utf-8" /> <title data-test-title-attr="test">Example Website — - with encoded content. - + + From 98d0f843ca6a93c45c6278b12f5c531bd6cf6b04 Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Wed, 19 May 2021 12:21:39 +0100 Subject: [PATCH 10/34] Add more specific yet flexible test for title --- lib/class-wp-rest-url-details-controller.php | 2 +- phpunit/class-wp-rest-url-details-controller-test.php | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index 867ea1804e884a..fe78ab4a3d4fd1 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -225,7 +225,7 @@ private function get_title( $html ) { * @return string the icon URI (maybe empty). */ private function get_icon( $html ) { - preg_match( '//i', $html, $matches ); + preg_match( '||is', $html, $matches ); $icon = isset( $matches[1] ) && is_string( $matches[1] ) ? 
trim( $matches[1] ) : ''; diff --git a/phpunit/class-wp-rest-url-details-controller-test.php b/phpunit/class-wp-rest-url-details-controller-test.php index 6fa3054a60ce2e..d37c89a5abaa09 100644 --- a/phpunit/class-wp-rest-url-details-controller-test.php +++ b/phpunit/class-wp-rest-url-details-controller-test.php @@ -129,7 +129,7 @@ public function test_get_items() { $this->assertArraySubset( array( 'title' => 'Example Website — - with encoded content.', - 'icon' => '//s.w.org/favicon.ico', + 'icon' => '//s.w.org/favicon.ico?querystringaddedfortesting', 'description' => 'Example description text here. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore.', 'image' => 'https://s.w.org/images/home/screen-themes.png?3', ), From 11d375fc05f8765ceb3796ca92ffae5d00d0a3af Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Wed, 19 May 2021 12:34:23 +0100 Subject: [PATCH 11/34] Handle relative resource URLs for icon and image --- lib/class-wp-rest-url-details-controller.php | 38 ++++++++++++++----- ...ss-wp-rest-url-details-controller-test.php | 4 +- phpunit/fixtures/example-website.html | 4 +- 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index fe78ab4a3d4fd1..62305cd88c0b51 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -118,10 +118,10 @@ public function parse_url_details( $request ) { $data = $this->add_additional_fields_to_object( array( - 'title' => $this->get_title( $remote_url_response ), - 'icon' => $this->get_icon( $remote_url_response ), - 'description' => $this->get_description( $remote_url_response ), - 'image' => $this->get_image( $remote_url_response ), + 'title' => $this->get_title( $remote_url_response, $url ), + 'icon' => $this->get_icon( $remote_url_response, $url ), + 'description' => $this->get_description( $remote_url_response, 
$url ), + 'image' => $this->get_image( $remote_url_response, $url ), ), $request ); @@ -222,13 +222,22 @@ private function get_title( $html ) { * Parses the site icon from the provided HTML * * @param string $html the HTML from the remote website at URL. + * @param string $url the target website URL. * @return string the icon URI (maybe empty). */ - private function get_icon( $html ) { + private function get_icon( $html, $url ) { preg_match( '||is', $html, $matches ); $icon = isset( $matches[1] ) && is_string( $matches[1] ) ? trim( $matches[1] ) : ''; + // Attempt to convert relative URLs to absolute. + if ( ! empty( $icon ) ) { + $parsed = parse_url( $icon ); + if ( empty( $parsed['host'] ) ) { + $icon = $url . $icon; + } + } + return $icon; } @@ -244,25 +253,25 @@ private function get_description( $html ) { $temp = tmpfile(); if ( ! $temp ) { - fclose( $temp ); // clean up tmp file + fclose( $temp ); // clean up tmp file. return $description; } $path = stream_get_meta_data( $temp )['uri']; - // Write HTML + // Write HTML. fwrite( $temp, $html ); $meta = get_meta_tags( $path ); if ( empty( $meta ) ) { - fclose( $temp ); // clean up tmp file + fclose( $temp ); // clean up tmp file. return $description; } $description = ! empty( $meta['description'] ) ? $meta['description'] : ''; - fclose( $temp ); // clean up tmp file + fclose( $temp ); // clean up tmp file. return $description; } @@ -270,13 +279,22 @@ private function get_description( $html ) { * Parses the Open Graph Image from the provided HTML. * * @param string $html the HTML from the remote website at URL. + * @param string $url the target website URL. * @return string the OG image (maybe empty). */ - private function get_image( $html ) { + private function get_image( $html, $url ) { preg_match( '||is', $html, $matches ); $image = isset( $matches[1] ) && is_string( $matches[1] ) ? trim( $matches[1] ) : ''; + // Attempt to convert relative URLs to absolute. + if ( ! 
empty( $image ) ) { + $parsed = parse_url( $image ); + if ( empty( $parsed['host'] ) ) { + $image = $url . $image; + } + } + return $image; } diff --git a/phpunit/class-wp-rest-url-details-controller-test.php b/phpunit/class-wp-rest-url-details-controller-test.php index d37c89a5abaa09..4f0fd76b90df55 100644 --- a/phpunit/class-wp-rest-url-details-controller-test.php +++ b/phpunit/class-wp-rest-url-details-controller-test.php @@ -129,9 +129,9 @@ public function test_get_items() { $this->assertArraySubset( array( 'title' => 'Example Website — - with encoded content.', - 'icon' => '//s.w.org/favicon.ico?querystringaddedfortesting', + 'icon' => 'https://placeholder-site.com/favicon.ico?querystringaddedfortesting', 'description' => 'Example description text here. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore.', - 'image' => 'https://s.w.org/images/home/screen-themes.png?3', + 'image' => 'https://placeholder-site.com/images/home/screen-themes.png?3', ), $data ); diff --git a/phpunit/fixtures/example-website.html b/phpunit/fixtures/example-website.html index a84a741a758aeb..eb3ef57e22208f 100644 --- a/phpunit/fixtures/example-website.html +++ b/phpunit/fixtures/example-website.html @@ -4,7 +4,7 @@ Example Website — - with encoded content. - + @@ -15,7 +15,7 @@ - + From 87f4d330ad762e823af8144db2e0f64e53691d27 Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Wed, 19 May 2021 12:49:08 +0100 Subject: [PATCH 12/34] Use random user agent string to avoid being blocked by certain websites. 
--- lib/class-wp-rest-url-details-controller.php | 25 ++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index 62305cd88c0b51..6b276ce68d176e 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -163,6 +163,8 @@ public function permissions_check() { ); } + + /** * Retrieves the document title from a remote URL. * @@ -173,6 +175,7 @@ private function get_remote_url( $url ) { $args = array( 'limit_response_size' => 150 * KB_IN_BYTES, + 'user-agent' => $this->get_random_user_agent(), ); /** @@ -344,4 +347,26 @@ private function set_cache( $key, $data = '' ) { return set_transient( $key, $data, $cache_expiration ); } + + /** + * Picks a random user agent string from a list of common defaults. + * By default WordPress HTTP functions uses a semi-static string and + * this maybe rejected by many websites. + * + * See: https://core.trac.wordpress.org/browser/tags/5.7.1/src/wp-includes/class-http.php#L191. + * + * @return string the user agent string. + */ + private function get_random_user_agent() { + + $agents = array( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246', // Windows 10-based PC using Edge browser. + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9', // Mac OS X-based computer using a Safari browser. + 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1', // Linux-based PC using a Firefox browser. 
+ ); + + $chose = rand( 0, count( $agents ) - 1 ); + + return $agents[ $chose ]; + } } From f25f6da82cf9b61c6924a4ecece0f6c62ee2b4b5 Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Wed, 19 May 2021 13:34:11 +0100 Subject: [PATCH 13/34] Account for open graph image property variations --- lib/class-wp-rest-url-details-controller.php | 9 ++++----- phpunit/fixtures/example-website.html | 1 + 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index 6b276ce68d176e..c1cef13c98b6fe 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -281,21 +281,20 @@ private function get_description( $html ) { /** * Parses the Open Graph Image from the provided HTML. * + * See: https://ogp.me/. + * * @param string $html the HTML from the remote website at URL. * @param string $url the target website URL. * @return string the OG image (maybe empty). */ private function get_image( $html, $url ) { - preg_match( '||is', $html, $matches ); + preg_match( '||is', $html, $matches ); $image = isset( $matches[1] ) && is_string( $matches[1] ) ? trim( $matches[1] ) : ''; // Attempt to convert relative URLs to absolute. if ( ! empty( $image ) ) { - $parsed = parse_url( $image ); - if ( empty( $parsed['host'] ) ) { - $image = $url . 
$image; - } + $image = \WP_Http::make_absolute_url( $image, $url ); } return $image; diff --git a/phpunit/fixtures/example-website.html b/phpunit/fixtures/example-website.html index eb3ef57e22208f..a5fd3ce44dab60 100644 --- a/phpunit/fixtures/example-website.html +++ b/phpunit/fixtures/example-website.html @@ -15,6 +15,7 @@ + From 1cf283db345059982e5fe449f3e508996f48ac8a Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Wed, 19 May 2021 13:58:02 +0100 Subject: [PATCH 14/34] Add unit test for get_title --- ...ss-wp-rest-url-details-controller-test.php | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/phpunit/class-wp-rest-url-details-controller-test.php b/phpunit/class-wp-rest-url-details-controller-test.php index 4f0fd76b90df55..b5aa06d0eed4cc 100644 --- a/phpunit/class-wp-rest-url-details-controller-test.php +++ b/phpunit/class-wp-rest-url-details-controller-test.php @@ -491,6 +491,37 @@ public function test_get_item_schema() { ); } + /** + * @dataProvider provide_get_title_data + */ + public function test_get_title( $html, $expected_title ) { + $controller = new WP_REST_URL_Details_Controller(); + $method = $this->get_reflective_method( 'get_title' ); + $result = $method->invoke( + $controller, + $html + ); + $this->assertEquals( $expected_title, $result ); + } + + + public function provide_get_title_data() { + return array( + 'no_attributes' => array( + 'Testing the title', + 'Testing the title', + ), + 'with_attributes' => array( + 'Testing the title', + 'Testing the title', + ), + ); + } + + + + + public function provide_invalid_url_data() { return array( 'empty_url' => array( @@ -569,4 +600,20 @@ private function mock_request_to_remote_url( $result_type = 'success', $args ) { 'body' => 'success' === $result_type ? file_get_contents( __DIR__ . '/fixtures/example-website.html' ) : '', ); } + + /** + * Get reflective access to a private/protected method on + * the WP_REST_URL_Details_Controller class. 
+ * + * @param string $method_name Method name for which to gain access. + * + * @return ReflectionMethod + * @throws ReflectionException Throws an exception if method does not exist. + */ + protected function get_reflective_method( $method_name ) { + $class = new ReflectionClass( WP_REST_URL_Details_Controller::class ); + $method = $class->getMethod( $method_name ); + $method->setAccessible( true ); + return $method; + } } From a9119bb21bb0fe72d9117d3ee6e503ffcc3ddd52 Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Wed, 19 May 2021 14:09:31 +0100 Subject: [PATCH 15/34] Add tests (including some failing) for get_icon --- lib/class-wp-rest-url-details-controller.php | 5 +- ...ss-wp-rest-url-details-controller-test.php | 54 ++++++++++++++++++- 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index c1cef13c98b6fe..b2c4631965c070 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -235,10 +235,7 @@ private function get_icon( $html, $url ) { // Attempt to convert relative URLs to absolute. if ( ! empty( $icon ) ) { - $parsed = parse_url( $icon ); - if ( empty( $parsed['host'] ) ) { - $icon = $url . 
$icon; - } + $icon = \WP_Http::make_absolute_url( $icon, $url ); } return $icon; diff --git a/phpunit/class-wp-rest-url-details-controller-test.php b/phpunit/class-wp-rest-url-details-controller-test.php index b5aa06d0eed4cc..28cd39c1c7d9b1 100644 --- a/phpunit/class-wp-rest-url-details-controller-test.php +++ b/phpunit/class-wp-rest-url-details-controller-test.php @@ -495,11 +495,12 @@ public function test_get_item_schema() { * @dataProvider provide_get_title_data */ public function test_get_title( $html, $expected_title ) { + $controller = new WP_REST_URL_Details_Controller(); $method = $this->get_reflective_method( 'get_title' ); $result = $method->invoke( $controller, - $html + $html, ); $this->assertEquals( $expected_title, $result ); } @@ -507,7 +508,7 @@ public function test_get_title( $html, $expected_title ) { public function provide_get_title_data() { return array( - 'no_attributes' => array( + 'no_attributes' => array( 'Testing the title', 'Testing the title', ), @@ -518,6 +519,55 @@ public function provide_get_title_data() { ); } + /** + * @dataProvider provide_get_icon_data + */ + public function test_get_icon( $html, $expected_icon ) { + $target_url = 'https://wordpress.org'; + $controller = new WP_REST_URL_Details_Controller(); + $method = $this->get_reflective_method( 'get_icon' ); + $result = $method->invoke( + $controller, + $html, + $target_url + ); + $this->assertEquals( $expected_icon, $result ); + } + + + public function provide_get_icon_data() { + return array( + 'default' => array( + '', + 'https://wordpress.org/favicon.ico', + ), + 'with_query_string' => array( + '', + 'https://wordpress.org/favicon.ico?somequerystring=foo&another=bar', + ), + 'relative_url' => array( + '', + 'https://wordpress.org/favicon.ico', + ), + 'relative_url_no_slash' => array( + '', + 'https://wordpress.org/favicon.ico', + ), + 'rel_reverse_order' => array( + '', + 'https://wordpress.org/favicon.ico', + ), + 'rel_icon_only' => array( + '', + 
'https://wordpress.org/favicon.ico', + ), + 'rel_shortcut_only' => array( + '', + 'https://wordpress.org/favicon.ico', + ), + ); + } + From 9ba3530ec417395527826b89f6ed33f9a9dc3621 Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Wed, 19 May 2021 14:10:38 +0100 Subject: [PATCH 16/34] Fix method invocation to remove unused args --- lib/class-wp-rest-url-details-controller.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index b2c4631965c070..7710e3deb64def 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -118,9 +118,9 @@ public function parse_url_details( $request ) { $data = $this->add_additional_fields_to_object( array( - 'title' => $this->get_title( $remote_url_response, $url ), + 'title' => $this->get_title( $remote_url_response ), 'icon' => $this->get_icon( $remote_url_response, $url ), - 'description' => $this->get_description( $remote_url_response, $url ), + 'description' => $this->get_description( $remote_url_response ), 'image' => $this->get_image( $remote_url_response, $url ), ), $request From afe0af98684dfed1bf5f6c48be63424753d46363 Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Wed, 19 May 2021 14:18:45 +0100 Subject: [PATCH 17/34] Wrap test HTML string in a basic HTML doc. 
--- ...ss-wp-rest-url-details-controller-test.php | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/phpunit/class-wp-rest-url-details-controller-test.php b/phpunit/class-wp-rest-url-details-controller-test.php index 28cd39c1c7d9b1..807e33667e8f4c 100644 --- a/phpunit/class-wp-rest-url-details-controller-test.php +++ b/phpunit/class-wp-rest-url-details-controller-test.php @@ -500,7 +500,7 @@ public function test_get_title( $html, $expected_title ) { $method = $this->get_reflective_method( 'get_title' ); $result = $method->invoke( $controller, - $html, + $this->wrap_html_in_doc( $html ), ); $this->assertEquals( $expected_title, $result ); } @@ -528,24 +528,26 @@ public function test_get_icon( $html, $expected_icon ) { $method = $this->get_reflective_method( 'get_icon' ); $result = $method->invoke( $controller, - $html, + $this->wrap_html_in_doc( $html ), $target_url ); $this->assertEquals( $expected_icon, $result ); } + + public function provide_get_icon_data() { return array( - 'default' => array( + 'default' => array( '', 'https://wordpress.org/favicon.ico', ), - 'with_query_string' => array( + 'with_query_string' => array( '', 'https://wordpress.org/favicon.ico?somequerystring=foo&another=bar', ), - 'relative_url' => array( + 'relative_url' => array( '', 'https://wordpress.org/favicon.ico', ), @@ -553,15 +555,15 @@ public function provide_get_icon_data() { '', 'https://wordpress.org/favicon.ico', ), - 'rel_reverse_order' => array( + 'rel_reverse_order' => array( '', 'https://wordpress.org/favicon.ico', ), - 'rel_icon_only' => array( + 'rel_icon_only' => array( '', 'https://wordpress.org/favicon.ico', ), - 'rel_shortcut_only' => array( + 'rel_shortcut_only' => array( '', 'https://wordpress.org/favicon.ico', ), @@ -651,6 +653,19 @@ private function mock_request_to_remote_url( $result_type = 'success', $args ) { ); } + private function wrap_html_in_doc( $html ) { + $start = ' + +'; + $end = ' + +

Example Website

+

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+ +'; + return $start . $html . $end; + } + /** * Get reflective access to a private/protected method on * the WP_REST_URL_Details_Controller class. From 57d1389cb975291ef7d7bfa02424d6baf1386b3d Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Wed, 19 May 2021 14:28:47 +0100 Subject: [PATCH 18/34] Parse the head section and use for comparison --- lib/class-wp-rest-url-details-controller.php | 24 ++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index 7710e3deb64def..e1e3afbfb075d4 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -116,12 +116,14 @@ public function parse_url_details( $request ) { $this->set_cache( $cache_key, $remote_url_response ); } + $html_head = $this->get_document_head( $remote_url_response ); + $data = $this->add_additional_fields_to_object( array( - 'title' => $this->get_title( $remote_url_response ), - 'icon' => $this->get_icon( $remote_url_response, $url ), - 'description' => $this->get_description( $remote_url_response ), - 'image' => $this->get_image( $remote_url_response, $url ), + 'title' => $this->get_title( $html_head ), + 'icon' => $this->get_icon( $html_head, $url ), + 'description' => $this->get_description( $html_head ), + 'image' => $this->get_image( $html_head, $url ), ), $request ); @@ -140,6 +142,8 @@ public function parse_url_details( $request ) { return apply_filters( 'rest_prepare_url_details', $response, $url, $request, $remote_url_response ); } + + /** * Checks whether a given request has permission to read remote urls. * @@ -365,4 +369,16 @@ private function get_random_user_agent() { return $agents[ $chose ]; } + + /** + * Retrieves the section (including opening tag from an HTML string if present. + * + * @param string $html the string of HTML to return the section. + * @return string the section (may be empty). 
+ */ + private function get_document_head( $html ) { + preg_match( '|([\s\S]*)|is', $html, $matches ); + $head = isset( $matches[1] ) && is_string( $matches[1] ) ? trim( $matches[1] ) : ''; + return $head; + } } From 6d83c509c987e484dfb84db967b68a4fd3dd1928 Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Wed, 19 May 2021 14:35:30 +0100 Subject: [PATCH 19/34] Fix broken cache test --- phpunit/class-wp-rest-url-details-controller-test.php | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/phpunit/class-wp-rest-url-details-controller-test.php b/phpunit/class-wp-rest-url-details-controller-test.php index 807e33667e8f4c..3bc0812ce99db8 100644 --- a/phpunit/class-wp-rest-url-details-controller-test.php +++ b/phpunit/class-wp-rest-url-details-controller-test.php @@ -327,7 +327,7 @@ public function test_will_return_from_cache_if_populated() { add_filter( "pre_transient_$transient_name", function() { - return 'This value from cache.'; + return 'This value from cache.'; } ); @@ -534,9 +534,6 @@ public function test_get_icon( $html, $expected_icon ) { $this->assertEquals( $expected_icon, $result ); } - - - public function provide_get_icon_data() { return array( 'default' => array( From c0bee0ac86b5ca2c871384ddb67bba9c28227f72 Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Wed, 19 May 2021 14:40:54 +0100 Subject: [PATCH 20/34] Refine wrap method --- ...ss-wp-rest-url-details-controller-test.php | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/phpunit/class-wp-rest-url-details-controller-test.php b/phpunit/class-wp-rest-url-details-controller-test.php index 3bc0812ce99db8..215db64b36e8c6 100644 --- a/phpunit/class-wp-rest-url-details-controller-test.php +++ b/phpunit/class-wp-rest-url-details-controller-test.php @@ -651,16 +651,17 @@ private function mock_request_to_remote_url( $result_type = 'success', $args ) { } private function wrap_html_in_doc( $html ) { - $start = ' - -'; - $end = ' - -

Example Website

-

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

- -'; - return $start . $html . $end; + $doc = ' + + + %%HEAD_CONTENT%% + + +

Example Website

+

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+ + '; + return str_replace( '%%HEAD_CONTENT%%', $html, $doc ); } /** From d740527655d97a4ca5cbdb19c0d96b9b0d631e37 Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Wed, 19 May 2021 14:52:11 +0100 Subject: [PATCH 21/34] Add get_image tests --- ...ss-wp-rest-url-details-controller-test.php | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/phpunit/class-wp-rest-url-details-controller-test.php b/phpunit/class-wp-rest-url-details-controller-test.php index 215db64b36e8c6..bf72166de877f0 100644 --- a/phpunit/class-wp-rest-url-details-controller-test.php +++ b/phpunit/class-wp-rest-url-details-controller-test.php @@ -567,6 +567,57 @@ public function provide_get_icon_data() { ); } + /** + * @dataProvider provide_get_image_data + */ + public function test_get_image( $html, $expected_image ) { + $target_url = 'https://wordpress.org'; + $controller = new WP_REST_URL_Details_Controller(); + $method = $this->get_reflective_method( 'get_image' ); + $result = $method->invoke( + $controller, + $this->wrap_html_in_doc( $html ), + $target_url + ); + $this->assertEquals( $expected_image, $result ); + } + + public function provide_get_image_data() { + return array( + 'default' => array( + '', + 'https://wordpress.org/images/myimage.jpg', + ), + 'no_closing_tag' => array( + '', + 'https://wordpress.org/images/myimage.jpg', + ), + 'using_url_modifier' => array( + ' + ', + 'https://wordpress.org/images/myimage.jpg', + ), + 'should_ignore_other_modifiers' => array( + ' + + ', + 'https://wordpress.org/images/myimage.jpg', + ), + 'with_query_string' => array( + '', + 'https://wordpress.org/images/myimage.jpg?foo=bar&bar=foo', + ), + 'relative_url' => array( + '', + 'https://wordpress.org/images/myimage.jpg', + ), + 'relative_url_no_slash' => array( + '', + 'https://wordpress.org/images/myimage.jpg', + ), + ); + } + From fc65afbb3fd397a055a26a18ea53e9a97c1d1a38 Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Wed, 19 May 2021 15:03:41 +0100 Subject: [PATCH 22/34] Handle 
relative URLs when target url has a path --- lib/class-wp-rest-url-details-controller.php | 8 +++-- ...ss-wp-rest-url-details-controller-test.php | 32 ++++++++++++------- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index e1e3afbfb075d4..2a61b77f42c7f2 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -239,7 +239,9 @@ private function get_icon( $html, $url ) { // Attempt to convert relative URLs to absolute. if ( ! empty( $icon ) ) { - $icon = \WP_Http::make_absolute_url( $icon, $url ); + $parsed_url = parse_url( $url ); + $root_url = $parsed_url['scheme'] . '://' . $parsed_url['host'] . '/'; + $icon = \WP_Http::make_absolute_url( $icon, $root_url ); } return $icon; @@ -295,7 +297,9 @@ private function get_image( $html, $url ) { // Attempt to convert relative URLs to absolute. if ( ! empty( $image ) ) { - $image = \WP_Http::make_absolute_url( $image, $url ); + $parsed_url = parse_url( $url ); + $root_url = $parsed_url['scheme'] . '://' . $parsed_url['host'] . 
'/'; + $image = \WP_Http::make_absolute_url( $image, $root_url ); } return $image; diff --git a/phpunit/class-wp-rest-url-details-controller-test.php b/phpunit/class-wp-rest-url-details-controller-test.php index bf72166de877f0..aa77ee94b14f2f 100644 --- a/phpunit/class-wp-rest-url-details-controller-test.php +++ b/phpunit/class-wp-rest-url-details-controller-test.php @@ -522,8 +522,8 @@ public function provide_get_title_data() { /** * @dataProvider provide_get_icon_data */ - public function test_get_icon( $html, $expected_icon ) { - $target_url = 'https://wordpress.org'; + public function test_get_icon( $html, $expected_icon, $target_url = 'https://wordpress.org' ) { + $controller = new WP_REST_URL_Details_Controller(); $method = $this->get_reflective_method( 'get_icon' ); $result = $method->invoke( @@ -536,31 +536,36 @@ public function test_get_icon( $html, $expected_icon ) { public function provide_get_icon_data() { return array( - 'default' => array( + 'default' => array( '', 'https://wordpress.org/favicon.ico', ), - 'with_query_string' => array( + 'with_query_string' => array( '', 'https://wordpress.org/favicon.ico?somequerystring=foo&another=bar', ), - 'relative_url' => array( + 'relative_url' => array( '', 'https://wordpress.org/favicon.ico', ), - 'relative_url_no_slash' => array( + 'relative_url_no_slash' => array( + '', + 'https://wordpress.org/favicon.ico', + ), + 'relative_url_with_path' => array( '', 'https://wordpress.org/favicon.ico', + 'https://wordpress.org/my/path/here/', ), - 'rel_reverse_order' => array( + 'rel_reverse_order' => array( '', 'https://wordpress.org/favicon.ico', ), - 'rel_icon_only' => array( + 'rel_icon_only' => array( '', 'https://wordpress.org/favicon.ico', ), - 'rel_shortcut_only' => array( + 'rel_shortcut_only' => array( '', 'https://wordpress.org/favicon.ico', ), @@ -570,8 +575,8 @@ public function provide_get_icon_data() { /** * @dataProvider provide_get_image_data */ - public function test_get_image( $html, $expected_image ) 
{ - $target_url = 'https://wordpress.org'; + public function test_get_image( $html, $expected_image, $target_url = 'https://wordpress.org' ) { + $controller = new WP_REST_URL_Details_Controller(); $method = $this->get_reflective_method( 'get_image' ); $result = $method->invoke( @@ -615,6 +620,11 @@ public function provide_get_image_data() { '', 'https://wordpress.org/images/myimage.jpg', ), + 'relative_url_with_path' => array( + '', + 'https://wordpress.org/images/myimage.jpg', + 'https://wordpress.org/my/path/here/', + ), ); } From cc9cf7f7af90d8882a0efe33e009c25a09d0f646 Mon Sep 17 00:00:00 2001 From: Tonya Mork Date: Thu, 20 May 2021 07:32:32 -0500 Subject: [PATCH 23/34] Improves title and icon parsing for PR 31763 (#32021) * Title: removes malformed opening tag pattern and adds tests. * Icon: Allows for different ordering of attribute. Adds happy and unhappy test data. * Icon: allow for any order or combination of attributes. How? Get the icon link element first. Then grab its href. Benefits: - Not dependent upon the order of attributes - Allows for optional or custom attributes * Icon: allows for single, double, or no quotes around attributes. * Update for WPCS standard. --- lib/class-wp-rest-url-details-controller.php | 36 ++-- ...ss-wp-rest-url-details-controller-test.php | 177 +++++++++++++++--- 2 files changed, 169 insertions(+), 44 deletions(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index 2a61b77f42c7f2..5f6fb7ea68fcc8 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -214,11 +214,11 @@ private function get_remote_url( $url ) { /** * Parses the contents from the provided HTML * - * @param string $html the HTML from the remote website at URL. - * @return string the title tag contents (maybe empty). + * @param string $html The HTML from the remote website at URL. 
+ * @return string The title tag contents on success; else empty string. */ private function get_title( $html ) { - preg_match( '|<\s*title[^>]*>(.*?)<\s*/\s*title>|is', $html, $match_title ); + preg_match( '|<title[^>]*>(.*?)<\s*/\s*title>|is', $html, $match_title ); $title = isset( $match_title[1] ) && is_string( $match_title[1] ) ? trim( $match_title[1] ) : ''; @@ -228,21 +228,31 @@ private function get_title( $html ) { /** * Parses the site icon from the provided HTML * - * @param string $html the HTML from the remote website at URL. - * @param string $url the target website URL. - * @return string the icon URI (maybe empty). + * @param string $html The HTML from the remote website at URL. + * @param string $url The target website URL. + * @return string The icon URI on success; else empty string. */ private function get_icon( $html, $url ) { - preg_match( '|<link.*?rel="\s*[shortcut]+(?:\s+[icon]+)*\s*".*?href="(.*?)".*?\/?>|is', $html, $matches ); + // Grab the icon's link element. + $pattern = '#<link\s[^>]*rel=(?:[\"\']??)\s*(?:icon|shortcut icon|icon shortcut)\s*(?:[\"\']??)[^>]*\/?>#isU'; + preg_match( $pattern, $html, $element ); + $element = ! empty( $element[0] ) && is_string( $element[0] ) ? trim( $element[0] ) : ''; + if ( empty( $element ) ) { + return ''; + } - $icon = isset( $matches[1] ) && is_string( $matches[1] ) ? trim( $matches[1] ) : ''; + // Get the icon's href value. + $pattern = '#href=([\"\']??)([^\" >]*?)\\1[^>]*#isU'; + preg_match( $pattern, $element, $icon ); + $icon = ! empty( $icon[2] ) && is_string( $icon[2] ) ? trim( $icon[2] ) : ''; + if ( empty( $icon ) ) { + return ''; + } // Attempt to convert relative URLs to absolute. - if ( ! empty( $icon ) ) { - $parsed_url = parse_url( $url ); - $root_url = $parsed_url['scheme'] . '://' . $parsed_url['host'] . '/'; - $icon = \WP_Http::make_absolute_url( $icon, $root_url ); - } + $parsed_url = parse_url( $url ); + $root_url = $parsed_url['scheme'] . '://' . $parsed_url['host'] . 
'/'; + $icon = WP_Http::make_absolute_url( $icon, $root_url ); return $icon; } diff --git a/phpunit/class-wp-rest-url-details-controller-test.php b/phpunit/class-wp-rest-url-details-controller-test.php index aa77ee94b14f2f..94b6930dee8a15 100644 --- a/phpunit/class-wp-rest-url-details-controller-test.php +++ b/phpunit/class-wp-rest-url-details-controller-test.php @@ -494,81 +494,192 @@ public function test_get_item_schema() { /** * @dataProvider provide_get_title_data */ - public function test_get_title( $html, $expected_title ) { - + public function test_get_title( $html, $expected ) { $controller = new WP_REST_URL_Details_Controller(); $method = $this->get_reflective_method( 'get_title' ); - $result = $method->invoke( + + $actual = $method->invoke( $controller, - $this->wrap_html_in_doc( $html ), + $this->wrap_html_in_doc( $html ) ); - $this->assertEquals( $expected_title, $result ); + $this->assertSame( $expected, $actual ); } public function provide_get_title_data() { return array( - 'no_attributes' => array( - '<title>Testing the title', - 'Testing the title', + 'no attributes' => array( + 'Testing <title>:', + 'Testing <title>:', ), - 'with_attributes' => array( - 'Testing the title', - 'Testing the title', + 'with attributes' => array( + 'Testing <title>:', + 'Testing <title>:', + ), + 'with text whitespace' => array( + ' Testing <title>: ', + 'Testing <title>:', + ), + 'when opening tag is malformed' => array( + '< title>Testing <title>: when opening tag is invalid', + '', + ), + 'with whitespace in opening tag' => array( + 'Testing <title>: with whitespace in opening tag', + 'Testing <title>: with whitespace in opening tag', + ), + 'when whitepace in closing tag' => array( + 'Testing <title>: with whitespace in closing tag</ title>', + 'Testing <title>: with whitespace in closing tag', ), ); } /** - * @dataProvider provide_get_icon_data + * @dataProvider data_get_icon */ - public function test_get_icon( $html, $expected_icon, $target_url = 
'https://wordpress.org' ) { - + public function test_get_icon( $html, $expected, $target_url = 'https://wordpress.org' ) { $controller = new WP_REST_URL_Details_Controller(); $method = $this->get_reflective_method( 'get_icon' ); - $result = $method->invoke( + + $actual = $method->invoke( $controller, $this->wrap_html_in_doc( $html ), $target_url ); - $this->assertEquals( $expected_icon, $result ); + $this->assertSame( $expected, $actual ); } - public function provide_get_icon_data() { + public function data_get_icon() { return array( - 'default' => array( + + // Happy path for default. + 'default' => array( '<link rel="shortcut icon" href="https://wordpress.org/favicon.ico" />', 'https://wordpress.org/favicon.ico', ), - 'with_query_string' => array( + 'default with no closing whitespace' => array( + '<link rel="shortcut icon" href="https://wordpress.org/favicon.ico"/>', + 'https://wordpress.org/favicon.ico', + ), + 'default without self-closing' => array( + '<link rel="shortcut icon" href="https://wordpress.org/favicon.ico">', + 'https://wordpress.org/favicon.ico', + ), + 'default with href first' => array( + '<link href="https://wordpress.org/favicon.ico" rel="shortcut icon" />', + 'https://wordpress.org/favicon.ico', + ), + 'default with type last' => array( + '<link href="https://wordpress.org/favicon.png" rel="icon" type="image/png" />', + 'https://wordpress.org/favicon.png', + ), + 'default with type first' => array( + '<link type="image/png" href="https://wordpress.org/favicon.png" rel="icon" />', + 'https://wordpress.org/favicon.png', + ), + 'default with single quotes' => array( + '<link type="image/png" href=\'https://wordpress.org/favicon.png\' rel=\'icon\' />', + 'https://wordpress.org/favicon.png', + ), + + // Happy paths. 
+ 'with query string' => array( '<link rel="shortcut icon" href="https://wordpress.org/favicon.ico?somequerystring=foo&another=bar" />', 'https://wordpress.org/favicon.ico?somequerystring=foo&another=bar', ), - 'relative_url' => array( + 'with another link' => array( + '<link rel="shortcut icon" href="https://wordpress.org/favicon.ico" /><link rel="canonical" href="https://example.com">', + 'https://wordpress.org/favicon.ico', + ), + 'relative url' => array( '<link rel="shortcut icon" href="/favicon.ico" />', 'https://wordpress.org/favicon.ico', ), - 'relative_url_no_slash' => array( + 'relative url no slash' => array( '<link rel="shortcut icon" href="favicon.ico" />', 'https://wordpress.org/favicon.ico', ), - 'relative_url_with_path' => array( + 'relative url with path' => array( '<link rel="shortcut icon" href="favicon.ico" />', 'https://wordpress.org/favicon.ico', 'https://wordpress.org/my/path/here/', ), - 'rel_reverse_order' => array( + 'rel reverse order' => array( '<link rel="icon shortcut" href="https://wordpress.org/favicon.ico" />', 'https://wordpress.org/favicon.ico', ), - 'rel_icon_only' => array( + 'rel icon only' => array( '<link rel="icon" href="https://wordpress.org/favicon.ico" />', 'https://wordpress.org/favicon.ico', ), - 'rel_shortcut_only' => array( - '<link rel="icon" href="https://wordpress.org/favicon.ico" />', + 'rel icon only with whitespace' => array( + '<link rel=" icon " href="https://wordpress.org/favicon.ico" />', + 'https://wordpress.org/favicon.ico', + ), + 'multiline attributes' => array( + '<link + rel="icon" + href="https://wordpress.org/favicon.ico" + />', + 'https://wordpress.org/favicon.ico', + ), + 'multiline attributes in reverse order' => array( + '<link + rel="icon" + href="https://wordpress.org/favicon.ico" + />', + 'https://wordpress.org/favicon.ico', + ), + 'multiline attributes with type' => array( + '<link + rel="icon" + href="https://wordpress.org/favicon.ico" + type="image/x-icon" + />', 
'https://wordpress.org/favicon.ico', ), + 'multiline with type first' => array( + '<link + type="image/x-icon" + rel="icon" + href="https://wordpress.org/favicon.ico" + />', + 'https://wordpress.org/favicon.ico', + ), + + // Unhappy paths. + 'empty rel' => array( + '<link rel="" href="https://wordpress.org/favicon.ico" />', + '', + ), + 'empty href' => array( + '<link rel="icon" href="" />', + '', + ), + 'no rel' => array( + '<link href="https://wordpress.org/favicon.ico" />', + '', + ), + 'link to external stylesheet' => array( + '<link rel="stylesheet" href="https://example.com/assets/style.css" />', + '', + 'https://example.com', + ), + 'multiline with no href' => array( + '<link + rel="icon" + href="" + />', + '', + ), + 'multiline with no rel' => array( + '<link + rel="" + href="https://wordpress.org/favicon.ico" + />', + '', + ), ); } @@ -711,18 +822,22 @@ private function mock_request_to_remote_url( $result_type = 'success', $args ) { ); } - private function wrap_html_in_doc( $html ) { + private function wrap_html_in_doc( $html, $with_body = false ) { $doc = '<!DOCTYPE html> <html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en-US"> <head> - %%HEAD_CONTENT%% - </head> + <meta charset="utf-8" />' . $html . "\n" . '</head>'; + + if ( $with_body ) { + $doc .= ' <body> <h1>Example Website</h1> <p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p> </body> - </html>'; - return str_replace( '%%HEAD_CONTENT%%', $html, $doc ); + </html>'; + } + + return $doc; } /** From 2eae9a72737803a92cde5ccaac7760e05eabbeb7 Mon Sep 17 00:00:00 2001 From: Dave Smith <getdavemail@gmail.com> Date: Thu, 20 May 2021 15:53:48 +0100 Subject: [PATCH 24/34] Seek head but fallback to body. --- lib/class-wp-rest-url-details-controller.php | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index 5f6fb7ea68fcc8..769d0afd886040 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -391,8 +391,16 @@ private function get_random_user_agent() { * @return string the <head> section (may be empty). */ private function get_document_head( $html ) { - preg_match( '|([\s\S]*)<body>|is', $html, $matches ); - $head = isset( $matches[1] ) && is_string( $matches[1] ) ? trim( $matches[1] ) : ''; - return $head; + preg_match( '|([\s\S]*)</head>|is', $html, $head_matches ); + + $doc_head = isset( $head_matches[1] ) && is_string( $head_matches[1] ) ? trim( $head_matches[1] ) : ''; + + // If missing `</head>` then look for opening <body>. + if ( empty( $doc_head ) ) { + preg_match( '|([\s\S]*)<body>|is', $html, $body_matches ); + $head = isset( $body_matches[1] ) && is_string( $body_matches[1] ) ? trim( $body_matches[1] ) : ''; + } + + return $doc_head; } } From 04e3e21a926ef42488fe0f7f6e893c683ebc039c Mon Sep 17 00:00:00 2001 From: Tonya Mork <hello@hellofromtonya.com> Date: Mon, 24 May 2021 03:31:45 -0500 Subject: [PATCH 25/34] Improves metadata parsing for PR 31763 (#32067) * Description: uses regex instead of tmp file. * Adding test to check for like tag before and after target. * Description: changes regex strategy. Why? 
Lookahead was not constrained with each element and thus picked up <meta from one and then if not a match, grabbed the name and content from another upstream. The new strategy parses all meta elements with a content attribute. Then loops through them to find the description element. Why this order? The content attribute can contain HTML tags. The > or /> symbol is matched as the end of the meta element (its closing symbol). If this happens, the content is truncated. Boo. Switching the parsing order solves this problem. Bonus: allows for pre-parsing of all meta elements. Performance boost. * Refactors getting meta with content elements for reuse. * Improves getting <head>..</head> element. - Isolates to only the <head>..</head> element by stripping all content before the opening tag and ensuring it includes a closing </head> tag. - Performance improvements: - Bails out early if no opening tag is found. - Uses native string functions instead of regex. * Image: use same parsing strategy as description. * Refactor to reuse the process for getting the metadata from the list of meta elements. * Convert description HTML entities into HTML. 
--- lib/class-wp-rest-url-details-controller.php | 255 +++++++++++++--- ...ss-wp-rest-url-details-controller-test.php | 287 ++++++++++++++++-- 2 files changed, 473 insertions(+), 69 deletions(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index 769d0afd886040..b221d391856288 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -116,14 +116,15 @@ public function parse_url_details( $request ) { $this->set_cache( $cache_key, $remote_url_response ); } - $html_head = $this->get_document_head( $remote_url_response ); + $html_head = $this->get_document_head( $remote_url_response ); + $meta_elements = $this->get_meta_with_content_elements( $html_head ); $data = $this->add_additional_fields_to_object( array( 'title' => $this->get_title( $html_head ), 'icon' => $this->get_icon( $html_head, $url ), - 'description' => $this->get_description( $html_head ), - 'image' => $this->get_image( $html_head, $url ), + 'description' => $this->get_description( $meta_elements ), + 'image' => $this->get_image( $meta_elements, $url ), ), $request ); @@ -260,34 +261,31 @@ private function get_icon( $html, $url ) { /** * Parses the meta description from the provided HTML. * - * @param string $html the HTML from the remote website at URL. - * @return string the meta description contents (maybe empty). + * @param array $meta_elements { + * A multi-dimensional indexed array on success, or empty array. + * + * @type string[] 0 Meta elements with a content attribute. + * @type string[] 1 Content attribute's opening quotation mark. + * @type string[] 2 Content attribute's value for each meta element. + * } + * @return string The meta description contents on success, else empty string. */ - private function get_description( $html ) { - $description = ''; - - $temp = tmpfile(); - - if ( ! $temp ) { - fclose( $temp ); // clean up tmp file. 
- return $description; + private function get_description( $meta_elements ) { + // Bail out if there are no meta elements. + if ( empty( $meta_elements[0] ) ) { + return ''; } - $path = stream_get_meta_data( $temp )['uri']; - - // Write HTML. - fwrite( $temp, $html ); - - $meta = get_meta_tags( $path ); + $description = $this->get_metadata_from_meta_element( $meta_elements, 'name', '\bdescription\b' ); - if ( empty( $meta ) ) { - fclose( $temp ); // clean up tmp file. - return $description; + // Bail out if description not found. + if ( '' === $description ) { + return ''; } - $description = ! empty( $meta['description'] ) ? $meta['description'] : ''; + // Convert any entities to HTML for use downstream. + $description = html_entity_decode( $description, ENT_QUOTES, get_bloginfo( 'charset' ) ); - fclose( $temp ); // clean up tmp file. return $description; } @@ -296,21 +294,28 @@ private function get_description( $html ) { * * See: https://ogp.me/. * - * @param string $html the HTML from the remote website at URL. - * @param string $url the target website URL. - * @return string the OG image (maybe empty). + * @param array $meta_elements { + * A multi-dimensional indexed array on success, or empty array. + * + * @type string[] 0 Meta elements with a content attribute. + * @type string[] 1 Content attribute's opening quotation mark. + * @type string[] 2 Content attribute's value for each meta element. + * } + * @param string $url The target website URL. + * @return string The OG image on success, or empty string. */ - private function get_image( $html, $url ) { - preg_match( '|<meta.*?property="og:image[:url]*?".*?content="(.*?)".*?\/?>|is', $html, $matches ); + private function get_image( $meta_elements, $url ) { + $image = $this->get_metadata_from_meta_element( $meta_elements, 'property', '(?:og:image|og:image:url)' ); - $image = isset( $matches[1] ) && is_string( $matches[1] ) ? trim( $matches[1] ) : ''; + // Bail out if image not found. 
+ if ( '' === $image ) { + return ''; + } // Attempt to convert relative URLs to absolute. - if ( ! empty( $image ) ) { - $parsed_url = parse_url( $url ); - $root_url = $parsed_url['scheme'] . '://' . $parsed_url['host'] . '/'; - $image = \WP_Http::make_absolute_url( $image, $root_url ); - } + $parsed_url = parse_url( $url ); + $root_url = $parsed_url['scheme'] . '://' . $parsed_url['host'] . '/'; + $image = WP_Http::make_absolute_url( $image, $root_url ); return $image; } @@ -385,22 +390,182 @@ private function get_random_user_agent() { } /** - * Retrieves the <head> section (including opening <body> tag from an HTML string if present. + * Retrieves the `<head>` section. * - * @param string $html the string of HTML to return the <head> section. - * @return string the <head> section (may be empty). + * @param string $html The string of HTML to parse. + * @return string The `<head>..</head>` section on succes, or original HTML. */ private function get_document_head( $html ) { - preg_match( '|([\s\S]*)</head>|is', $html, $head_matches ); + $head_html = $html; + + // Find the opening `<head>` tag. + $head_start = strpos( $html, '<head' ); + if ( false === $head_start ) { + // Didn't find it. Return the original HTML. + return $html; + } + + // Find the closing `</head>` tag. + $head_end = strpos( $head_html, '</head>' ); + if ( false === $head_end ) { + // Didn't find it. Find the opening `<body>` tag. + $head_end = strpos( $head_html, '<body' ); + + // Didn't find it. Return the original HTML. + if ( false === $head_end ) { + return $html; + } + } + + // Extract the HTML from opening tag to the closing tag. Then add the closing tag. + $head_html = substr( $head_html, $head_start, $head_end ); + $head_html .= '</head>'; + + return $head_html; + } - $doc_head = isset( $head_matches[1] ) && is_string( $head_matches[1] ) ? trim( $head_matches[1] ) : ''; + /** + * Gets all the <meta> elements that have a `content` attribute. 
+ * + * @param string $html The string of HTML to be parsed. + * @return array { + * A multi-dimensional indexed array on success, or empty array. + * + * @type string[] 0 Meta elements with a content attribute. + * @type string[] 1 Content attribute's opening quotation mark. + * @type string[] 2 Content attribute's value for each meta element. + * } + */ + private function get_meta_with_content_elements( $html ) { + /* + * Parse all meta elements with a content attribute. + * + * Why first search for the content attribute rather than directly searching for name=description element? + * tl;dr The content attribute's value will be truncated when it contains a > symbol. + * + * The content attribute's value (i.e. the description to get) can have HTML in it and be well-formed as + * it's a string to the browser. Imagine what happens when attempting to match for the name=description + * first. Hmm, if a > or /> symbol is in the content attribute's value, then it terminates the match + * as the element's closing symbol. But wait, it's in the content attribute and is not the end of the + * element. This is a limitation of using regex. It can't determine "wait a minute this is inside of quotation". + * If this happens, what gets matched is not the entire element or all of the content. + * + * Why not search for the name=description and then content="(.*)"? + * The attribute order could be opposite. Plus, additional attributes may exist including being between + * the name and content attributes. + * + * Why not lookahead? + * Lookahead is not constrained to stay within the element. The first <meta it finds may not include + * the name or content, but rather could be from a different element downstream. + */ + $pattern = '#<meta\s' . + + /* + * Alows for additional attributes before the content attribute. + * Searches for anything other than > symbol. + */ + '[^>]*' . + + /* + * Find the content attribute. When found, capture its value (.*). 
+ * + * Allows for (a) single or double quotes and (b) whitespace in the value. + * + * Why capture the opening quotation mark, i.e. (["\']), and then backreference, + * i.e \1, for the closing quotation mark? + * To ensure the closing quotation mark matches the opening one. Why? Attribute values + * can contain quotation marks, such as an apostrophe in the content. + */ + 'content=(["\']??)(.*)\1' . + + /* + * Alows for additional attributes after the content attribute. + * Searches for anything other than > symbol. + */ + '[^>]*' . + + /* + * \/?> searches for the closing > symbol, which can be in either /> or > format. + * # ends the pattern. + */ + '\/?>#' . + + /* + * These are the options: + * - i : case insensitive + * - s : allows newline characters for the . match (needed for multiline elements) + * - U means non-greedy matching + */ + 'isU'; + + preg_match_all( $pattern, $html, $elements ); + + return $elements; + } + + /** + * Gets the metadata from a target meta element. + * + * @param array $meta_elements { + * A multi-dimensional indexed array on success, or empty array. + * + * @type string[] 0 Meta elements with a content attribute. + * @type string[] 1 Content attribute's opening quotation mark. + * @type string[] 2 Content attribute's value for each meta element. + * } + * @param string $attr Attribute that identifies the element with the target metadata. + * @param string $attr_value The attribute's value that identifies the element with the target metadata. + * @return string The metadata on success, or an empty string. + */ + private function get_metadata_from_meta_element( $meta_elements, $attr, $attr_value ) { + // Bail out if there are no meta elements. + if ( empty( $meta_elements[0] ) ) { + return ''; + } + + $metadata = ''; + $pattern = '#' . + + /* + * Target this attribute and value to find the metadata element. + * + * Allows for (a) no, single, double quotes and (b) whitespace in the value. 
+ * + * Why capture the opening quotation mark, i.e. (["\']), and then backreference, + * i.e \1, for the closing quotation mark? + * To ensure the closing quotation mark matches the opening one. Why? Attribute values + * can contain quotation marks, such as an apostrophe in the content. + */ + $attr . '=([\"\']??)\s*' . $attr_value . '\s*\1' . + + /* + * These are the options: + * - i : case insensitive + * - s : allows newline characters for the . match (needed for multiline elements) + * - U means non-greedy matching + */ + '#isU'; + + // Find the metdata element. + foreach ( $meta_elements[0] as $index => $element ) { + preg_match( $pattern, $element, $match ); + + // This is not the metadata element. Skip it. + if ( empty( $match ) ) { + continue; + } + + /* + * Found the metadata element. + * Get the metadata from its matching content array. + */ + if ( isset( $meta_elements[2][ $index ] ) && is_string( $meta_elements[2][ $index ] ) ) { + $metadata = trim( $meta_elements[2][ $index ] ); + } - // If missing `</head>` then look for opening <body>. - if ( empty( $doc_head ) ) { - preg_match( '|([\s\S]*)<body>|is', $html, $body_matches ); - $head = isset( $body_matches[1] ) && is_string( $body_matches[1] ) ? 
trim( $body_matches[1] ) : ''; + break; } - return $doc_head; + return $metadata; } } diff --git a/phpunit/class-wp-rest-url-details-controller-test.php b/phpunit/class-wp-rest-url-details-controller-test.php index 94b6930dee8a15..02ba92e7b80ce6 100644 --- a/phpunit/class-wp-rest-url-details-controller-test.php +++ b/phpunit/class-wp-rest-url-details-controller-test.php @@ -592,6 +592,12 @@ public function data_get_icon() { '<link rel="shortcut icon" href="https://wordpress.org/favicon.ico" /><link rel="canonical" href="https://example.com">', 'https://wordpress.org/favicon.ico', ), + 'with multiple links' => array( + '<link rel="manifest" href="/manifest.56b1cedc.json"> + <link rel="shortcut icon" href="https://wordpress.org/favicon.ico" /> + <link rel="canonical" href="https://example.com">', + 'https://wordpress.org/favicon.ico', + ), 'relative url' => array( '<link rel="shortcut icon" href="/favicon.ico" />', 'https://wordpress.org/favicon.ico', @@ -684,58 +690,291 @@ public function data_get_icon() { } /** - * @dataProvider provide_get_image_data + * @dataProvider data_get_description */ - public function test_get_image( $html, $expected_image, $target_url = 'https://wordpress.org' ) { + public function test_get_description( $html, $expected ) { + $controller = new WP_REST_URL_Details_Controller(); + + // Parse the meta elements from the given HTML. + $method = $this->get_reflective_method( 'get_meta_with_content_elements' ); + $meta_elements = $method->invoke( + $controller, + $this->wrap_html_in_doc( $html ) + ); + + $method = $this->get_reflective_method( 'get_description' ); + $actual = $method->invoke( $controller, $meta_elements ); + $this->assertSame( $expected, $actual ); + } + + public function data_get_description() { + return array( + // Happy paths. 
+ 'default' => array( + '<meta name="description" content="This is a description.">', + 'This is a description.', + ), + 'with whitespace' => array( + '<meta name=" description " content=" This is a description. " >', + 'This is a description.', + ), + 'with self-closing' => array( + '<meta name="description" content="This is a description."/>', + 'This is a description.', + ), + 'with self-closing and whitespace' => array( + '<meta name=" description " content=" This is a description. " />', + 'This is a description.', + ), + 'with content first' => array( + '<meta content="Content is first" name="description">', + 'Content is first', + ), + 'with single quotes' => array( + '<meta name=\'description\' content=\'with single quotes\'>', + 'with single quotes', + ), + 'with another element' => array( + '<meta name="description" content="This is a description."><meta name="viewport" content="width=device-width, initial-scale=1">', + 'This is a description.', + ), + 'with multiple elements' => array( + '<meta property="og:image" content="https://wordpress.org/images/myimage.jpg" /> + <link rel="stylesheet" href="https://example.com/assets/style.css" /> + <meta name="description" content="This is a description."> + <meta name="viewport" content="width=device-width, initial-scale=1">', + 'This is a description.', + ), + 'with other attributes' => array( + '<meta first="first" name="description" third="third" content="description with other attributes" fifth="fifth">', + 'description with other attributes', + ), + + // Happy paths with multiline attributes. 
+ 'with multiline attributes' => array( + '<meta + name="description" + content="with multiline attributes" + >', + 'with multiline attributes', + ), + 'with multiline attributes in reverse order' => array( + '<meta + content="with multiline attributes in reverse order" + name="description" + >', + 'with multiline attributes in reverse order', + ), + 'with multiline attributes and another element' => array( + '<meta + name="description" + content="with multiline attributes" + > + <meta name="viewport" content="width=device-width, initial-scale=1">', + 'with multiline attributes', + ), + 'with multiline and other attributes' => array( + '<meta + first="first" + name="description" + third="third" + content="description with multiline and other attributes" + fifth="fifth" + >', + 'description with multiline and other attributes', + ), + + // Happy paths with HTML tags or entities in the description. + 'with HTML tags' => array( + '<meta name="description" content="<strong>Description</strong>: has <em>HTML</em> tags">', + '<strong>Description</strong>: has <em>HTML</em> tags', + ), + 'with content first and HTML tags' => array( + '<meta content="<strong>Description</strong>: has <em>HTML</em> tags" name="description">', + '<strong>Description</strong>: has <em>HTML</em> tags', + ), + 'with HTML tags and other attributes' => array( + '<meta first="first" name="description" third="third" content="<strong>Description</strong>: has <em>HTML</em> tags" fifth="fifth>', + '<strong>Description</strong>: has <em>HTML</em> tags', + ), + 'with HTML entities' => array( + '<meta name="description" content="The <strong>description</strong> meta & its attribute value"', + 'The <strong>description</strong> meta & its attribute value', + ), + + // Unhappy paths. 
+ 'with empty content' => array( + '<meta name="description" content="">', + '', + ), + 'with empty name' => array( + '<meta name="" content="name is empty">', + '', + ), + 'without a name attribute' => array( + '<meta content="without a name attribute">', + '', + ), + 'without a content attribute' => array( + '<meta name="description">', + '', + ), + ); + } + + /** + * @dataProvider data_get_image + */ + public function test_get_image( $html, $expected, $target_url = 'https://wordpress.org' ) { $controller = new WP_REST_URL_Details_Controller(); - $method = $this->get_reflective_method( 'get_image' ); - $result = $method->invoke( + + // Parse the meta elements from the given HTML. + $method = $this->get_reflective_method( 'get_meta_with_content_elements' ); + $meta_elements = $method->invoke( $controller, - $this->wrap_html_in_doc( $html ), - $target_url + $this->wrap_html_in_doc( $html ) ); - $this->assertEquals( $expected_image, $result ); + + $method = $this->get_reflective_method( 'get_image' ); + $actual = $method->invoke( $controller, $meta_elements, $target_url ); + $this->assertEquals( $expected, $actual ); } - public function provide_get_image_data() { + public function data_get_image() { return array( - 'default' => array( - '<meta property="og:image" content="https://wordpress.org/images/myimage.jpg" />', + + // Happy paths. 
+ 'default' => array( + '<meta property="og:image" content="https://wordpress.org/images/myimage.jpg">', 'https://wordpress.org/images/myimage.jpg', ), - 'no_closing_tag' => array( - '<meta property="og:image" content="https://wordpress.org/images/myimage.jpg">', + 'with whitespace' => array( + '<meta property=" og:image " content=" https://wordpress.org/images/myimage.jpg " >', 'https://wordpress.org/images/myimage.jpg', ), - 'using_url_modifier' => array( - '<meta property="og:image:url" content="https://wordpress.org/images/myimage.jpg" /> - <meta property="og:image:alt" content="Ignore this please" />', + 'with self-closing' => array( + '<meta property="og:image" content="https://wordpress.org/images/myimage.jpg"/>', 'https://wordpress.org/images/myimage.jpg', ), - 'should_ignore_other_modifiers' => array( + 'with self-closing and whitespace' => array( + '<meta property=" og:image " content=" https://wordpress.org/images/myimage.jpg " />', + 'https://wordpress.org/images/myimage.jpg', + ), + 'with single quotes' => array( + "<meta property='og:image' content='https://wordpress.org/images/myimage.jpg'>", + 'https://wordpress.org/images/myimage.jpg', + ), + 'without quotes' => array( + '<meta property=og:image content="https://wordpress.org/images/myimage.jpg">', + 'https://wordpress.org/images/myimage.jpg', + ), + 'with url modifier' => array( + '<meta property="og:image:url" content="https://wordpress.org/images/url-modifier.jpg" /> + <meta property="og:image" content="https://wordpress.org/images/myimage.jpg">', + 'https://wordpress.org/images/url-modifier.jpg', + ), + 'with query string' => array( + '<meta property="og:image" content="https://wordpress.org/images/withquerystring.jpg?foo=bar&bar=foo" />', + 'https://wordpress.org/images/withquerystring.jpg?foo=bar&bar=foo', + ), + + // Happy paths with changing attributes order or adding attributes. 
+ 'with content first' => array( + '<meta content="https://wordpress.org/images/myimage.jpg" property="og:image">', + 'https://wordpress.org/images/myimage.jpg', + ), + 'with other attributes' => array( + '<meta first="first" property="og:image" third="third" content="https://wordpress.org/images/myimage.jpg" fifth="fifth">', + 'https://wordpress.org/images/myimage.jpg', + ), + 'with other og meta' => array( '<meta property="og:image:height" content="720" /> + <meta property="og:image:alt" content="Ignore this please" /> <meta property="og:image" content="https://wordpress.org/images/myimage.jpg" /> - <meta property="og:image:alt" content="Ignore this please" />', + <link rel="stylesheet" href="https://example.com/assets/style.css" />', 'https://wordpress.org/images/myimage.jpg', ), - 'with_query_string' => array( - '<meta property="og:image" content="https://wordpress.org/images/myimage.jpg?foo=bar&bar=foo" />', - 'https://wordpress.org/images/myimage.jpg?foo=bar&bar=foo', - ), - 'relative_url' => array( + + // Happy paths with relative url. + 'with relative url' => array( '<meta property="og:image" content="/images/myimage.jpg" />', 'https://wordpress.org/images/myimage.jpg', ), - 'relative_url_no_slash' => array( + 'with relative url without starting slash' => array( '<meta property="og:image" content="images/myimage.jpg" />', 'https://wordpress.org/images/myimage.jpg', ), - 'relative_url_with_path' => array( + 'with relative url and path' => array( '<meta property="og:image" content="images/myimage.jpg" />', 'https://wordpress.org/images/myimage.jpg', 'https://wordpress.org/my/path/here/', ), + + // Happy paths with multiline attributes. 
+ 'with multiline attributes' => array( + '<meta + property="og:image" + content="https://wordpress.org/images/myimage.jpg" + >', + 'https://wordpress.org/images/myimage.jpg', + ), + 'with multiline attributes in reverse order' => array( + '<meta + content="https://wordpress.org/images/myimage.jpg" + property="og:image" + >', + 'https://wordpress.org/images/myimage.jpg', + ), + 'with multiline attributes and other elements' => array( + '<meta + property="og:image:height" + content="720" + /> + <meta + property="og:image:alt" + content="Ignore this please" + /> + <meta + property="og:image" + content="https://wordpress.org/images/myimage.jpg" + > + <link rel="stylesheet" href="https://example.com/assets/style.css" />', + 'https://wordpress.org/images/myimage.jpg', + ), + 'with multiline and other attributes' => array( + '<meta + first="first" + property="og:image:url" + third="third" + content="https://wordpress.org/images/myimage.jpg" + fifth="fifth" + >', + 'https://wordpress.org/images/myimage.jpg', + ), + + // Happy paths with HTML tags in the content. + 'with other og meta' => array( + '<meta property="og:image:height" content="720" /> + <meta property="og:image:alt" content="<em>ignore this please</em>" /> + <meta property="og:image" content="https://wordpress.org/images/myimage.jpg" /> + <link rel="stylesheet" href="https://example.com/assets/style.css" />', + 'https://wordpress.org/images/myimage.jpg', + ), + + // Unhappy paths. 
+ 'with empty content' => array( + '<meta property="og:image" content="">', + '', + ), + 'without a property attribute' => array( + '<meta content="https://wordpress.org/images/myimage.jpg">', + '', + ), + 'without a content attribute empty property' => array( + '<meta property="og:image" href="https://wordpress.org/images/myimage.jpg">', + '', + ), ); } From 411291d5b7fb26c93b6d777a3f478a97797d0eee Mon Sep 17 00:00:00 2001 From: Tonya Mork <hello@hellofromtonya.com> Date: Tue, 25 May 2021 05:16:55 -0500 Subject: [PATCH 26/34] Improves PR 31763 for the URL Details Controller (#32162) * Code standards and consistency. * Removed unused data provider. * More formatting and standards. * Title: converts entities. * Fixes asserts: removes deprecated array subset, uses assertSame, and makes consistent. * Fixes method return signatures. * Remove HTML and convert non-HTML entities. * Removes type check from set_cache as data will be string type.. --- lib/class-wp-rest-url-details-controller.php | 69 +++-- ...ss-wp-rest-url-details-controller-test.php | 274 ++++++------------ 2 files changed, 131 insertions(+), 212 deletions(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index b221d391856288..0a345d7586fc60 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -57,7 +57,6 @@ public function register_routes() { * @return array the schema. */ public function get_item_schema() { - if ( $this->schema ) { return $this->add_additional_fields_schema( $this->schema ); } @@ -86,10 +85,9 @@ public function get_item_schema() { * response. * * @param WP_REST_REQUEST $request Full details about the request. - * @return WP_REST_Response|WP_Error The parsed details as a response object or an error. + * @return WP_REST_Response|WP_Error The parsed details as a response object, or an error. 
*/ public function parse_url_details( $request ) { - $url = untrailingslashit( $request['url'] ); if ( empty( $url ) ) { @@ -143,12 +141,10 @@ public function parse_url_details( $request ) { return apply_filters( 'rest_prepare_url_details', $response, $url, $request, $remote_url_response ); } - - /** * Checks whether a given request has permission to read remote urls. * - * @return WP_Error|bool True if the request has access, WP_Error object otherwise. + * @return WP_Error|bool True if the request has access, or WP_Error object. */ public function permissions_check() { if ( current_user_can( 'edit_posts' ) ) { @@ -168,16 +164,13 @@ public function permissions_check() { ); } - - /** * Retrieves the document title from a remote URL. * * @param string $url The website url whose HTML we want to access. - * @return array|WP_Error the HTTP response from the remote URL or error. + * @return array|WP_Error the HTTP response from the remote URL, or an error. */ private function get_remote_url( $url ) { - $args = array( 'limit_response_size' => 150 * KB_IN_BYTES, 'user-agent' => $this->get_random_user_agent(), @@ -216,14 +209,19 @@ private function get_remote_url( $url ) { * Parses the <title> contents from the provided HTML * * @param string $html The HTML from the remote website at URL. - * @return string The title tag contents on success; else empty string. + * @return string The title tag contents on success, or an empty string. */ private function get_title( $html ) { - preg_match( '|<title[^>]*>(.*?)<\s*/\s*title>|is', $html, $match_title ); + $pattern = '#<title[^>]*>(.*?)<\s*/\s*title>#is'; + preg_match( $pattern, $html, $match_title ); - $title = isset( $match_title[1] ) && is_string( $match_title[1] ) ? trim( $match_title[1] ) : ''; + $title = ! empty( $match_title[1] ) && is_string( $match_title[1] ) ? 
trim( $match_title[1] ) : ''; - return $title; + if ( empty( $title ) ) { + return ''; + } + + return $this->prepare_metadata_for_output( $title ); } /** @@ -231,7 +229,7 @@ private function get_title( $html ) { * * @param string $html The HTML from the remote website at URL. * @param string $url The target website URL. - * @return string The icon URI on success; else empty string. + * @return string The icon URI on success, or an empty string. */ private function get_icon( $html, $url ) { // Grab the icon's link element. @@ -268,7 +266,7 @@ private function get_icon( $html, $url ) { * @type string[] 1 Content attribute's opening quotation mark. * @type string[] 2 Content attribute's value for each meta element. * } - * @return string The meta description contents on success, else empty string. + * @return string The meta description contents on success, or an empty string. */ private function get_description( $meta_elements ) { // Bail out if there are no meta elements. @@ -283,10 +281,7 @@ private function get_description( $meta_elements ) { return ''; } - // Convert any entities to HTML for use downstream. - $description = html_entity_decode( $description, ENT_QUOTES, get_bloginfo( 'charset' ) ); - - return $description; + return $this->prepare_metadata_for_output( $description ); } /** @@ -320,11 +315,26 @@ private function get_image( $meta_elements, $url ) { return $image; } + /** + * Prepare the metadata by: + * + * - stripping all HTML tags and tag entities + * - converting non-tag entities into characters. + * + * @param string $metadata The metadata content to prepare. + * @return string The prepared metadata. + */ + private function prepare_metadata_for_output( $metadata ) { + $metadata = html_entity_decode( $metadata, ENT_QUOTES, get_bloginfo( 'charset' ) ); + $metadata = wp_strip_all_tags( $metadata ); + return $metadata; + } + /** * Utility function to build cache key for a given URL. * - * @param string $url the URL for which to build a cache key. 
- * @return string the cache key. + * @param string $url The URL for which to build a cache key. + * @return string The cache key. */ private function build_cache_key_for_url( $url ) { return 'g_url_details_response_' . md5( $url ); @@ -333,8 +343,8 @@ private function build_cache_key_for_url( $url ) { /** * Utility function to retrieve a value from the cache at a given key. * - * @param string $key the cache key. - * @return string the value from the cache. + * @param string $key The cache key. + * @return mixed The value from the cache. */ private function get_cache( $key ) { return get_transient( $key ); @@ -343,15 +353,11 @@ private function get_cache( $key ) { /** * Utility function to cache a given data set at a given cache key. * - * @param string $key the cache key under which to store the value. - * @param string $data the data to be stored at the given cache key. - * @return void + * @param string $key The cache key under which to store the value. + * @param string $data The data to be stored at the given cache key. + * @return bool True when transient set, or false. */ private function set_cache( $key, $data = '' ) { - if ( ! is_array( $data ) ) { - return; - } - $ttl = HOUR_IN_SECONDS; /** @@ -377,7 +383,6 @@ private function set_cache( $key, $data = '' ) { * @return string the user agent string. */ private function get_random_user_agent() { - $agents = array( 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246', // Windows 10-based PC using Edge browser. 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9', // Mac OS X-based computer using a Safari browser. 
diff --git a/phpunit/class-wp-rest-url-details-controller-test.php b/phpunit/class-wp-rest-url-details-controller-test.php index 02ba92e7b80ce6..5e08ca02a9e138 100644 --- a/phpunit/class-wp-rest-url-details-controller-test.php +++ b/phpunit/class-wp-rest-url-details-controller-test.php @@ -18,40 +18,13 @@ */ class WP_REST_URL_Details_Controller_Test extends WP_Test_REST_Controller_Testcase { - /** - * Admin user ID. - * - * @since x.x.0 - * - * @var int $subscriber_id - */ protected static $admin_id; - - /** - * Subscriber user ID. - * - * @since x.x.0 - * - * @var int $subscriber_id - */ protected static $subscriber_id; - - - protected static $route = '/__experimental/url-details'; - - + protected static $route = '/__experimental/url-details'; protected static $url_placeholder = 'https://placeholder-site.com'; + protected static $request_args = array(); - protected static $request_args = array(); - - /** - * Create fake data before our tests run. - * - * @since x.x.0 - * - * @param WP_UnitTest_Factory $factory Helper that lets us create fake data. - */ - public static function wpSetUpBeforeClass( $factory ) { + public static function wpSetUpBeforeClass( WP_UnitTest_Factory $factory ) { self::$admin_id = $factory->user->create( array( 'role' => 'administrator', @@ -69,11 +42,6 @@ public static function wpTearDownAfterClass() { self::delete_user( self::$subscriber_id ); } - - - /** - * Setup. - */ public function setUp() { parent::setUp(); @@ -81,29 +49,18 @@ public function setUp() { // Disables usage of cache during major of tests. $transient_name = 'g_url_details_response_' . md5( static::$url_placeholder ); - add_filter( - "pre_transient_$transient_name", - '__return_null' - ); + add_filter( "pre_transient_{$transient_name}", '__return_null' ); } - /** - * Tear down. - */ public function tearDown() { remove_filter( 'pre_http_request', array( $this, 'mock_success_request_to_remote_url' ), 10 ); $transient_name = 'g_url_details_response_' . 
md5( static::$url_placeholder ); - remove_filter( - "pre_transient_$transient_name", - '__return_null' - ); + remove_filter( "pre_transient_{$transient_name}", '__return_null' ); static::$request_args = array(); parent::tearDown(); } - - public function test_register_routes() { $routes = rest_get_server()->get_routes(); $this->assertArrayHasKey( static::$route, $routes ); @@ -124,11 +81,13 @@ public function test_get_items() { $response = rest_get_server()->dispatch( $request ); $data = $response->get_data(); - // Note the data in the subset comes from the fixture HTML returned by - // the filter `pre_http_request` (see this class's `setUp` method). - $this->assertArraySubset( + /* + * Note the data in the subset comes from the fixture HTML returned by + * the filter `pre_http_request` (see this class's `setUp` method). + */ + $this->assertSame( array( - 'title' => 'Example Website — - with encoded content.', + 'title' => 'Example Website — - with encoded content.', 'icon' => 'https://placeholder-site.com/favicon.ico?querystringaddedfortesting', 'description' => 'Example description text here. 
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore.', 'image' => 'https://placeholder-site.com/images/home/screen-themes.png?3', @@ -137,7 +96,6 @@ public function test_get_items() { ); } - public function test_get_items_fails_for_unauthenticated_user() { wp_set_current_user( 0 ); @@ -150,12 +108,9 @@ public function test_get_items_fails_for_unauthenticated_user() { $response = rest_get_server()->dispatch( $request ); $data = $response->get_data(); - $this->assertEquals( WP_Http::UNAUTHORIZED, $response->get_status() ); + $this->assertSame( WP_Http::UNAUTHORIZED, $response->get_status() ); - $this->assertEquals( - 'rest_cannot_view_url_details', - $data['code'] - ); + $this->assertSame( 'rest_cannot_view_url_details', $data['code'] ); $this->assertContains( strtolower( 'you are not allowed to process remote urls' ), @@ -175,12 +130,9 @@ public function test_get_items_fails_for_user_with_insufficient_permissions() { $response = rest_get_server()->dispatch( $request ); $data = $response->get_data(); - $this->assertEquals( WP_Http::FORBIDDEN, $response->get_status() ); + $this->assertSame( WP_Http::FORBIDDEN, $response->get_status() ); - $this->assertEquals( - 'rest_cannot_view_url_details', - $data['code'] - ); + $this->assertSame( 'rest_cannot_view_url_details', $data['code'] ); $this->assertContains( strtolower( 'you are not allowed to process remote urls' ), @@ -189,10 +141,9 @@ public function test_get_items_fails_for_user_with_insufficient_permissions() { } /** - * @dataProvider provide_invalid_url_data + * @dataProvider data_invalid_url */ public function test_get_items_fails_for_invalid_url( $expected, $invalid_url ) { - wp_set_current_user( self::$admin_id ); $request = new WP_REST_Request( 'GET', static::$route ); @@ -204,12 +155,9 @@ public function test_get_items_fails_for_invalid_url( $expected, $invalid_url ) $response = rest_get_server()->dispatch( $request ); $data = $response->get_data(); - 
$this->assertEquals( WP_Http::BAD_REQUEST, $response->get_status() ); + $this->assertSame( WP_Http::BAD_REQUEST, $response->get_status() ); - $this->assertEquals( - 'rest_invalid_param', - $data['code'] - ); + $this->assertSame( 'rest_invalid_param', $data['code'] ); $this->assertContains( strtolower( 'Invalid parameter(s): url' ), @@ -217,6 +165,23 @@ public function test_get_items_fails_for_invalid_url( $expected, $invalid_url ) ); } + public function data_invalid_url() { + return array( + 'empty_url' => array( + null, + '', + ), // empty! + 'not_a_string' => array( + null, + 1234456, + ), + 'string_but_invalid' => array( + null, + 'invalid.proto://wordpress.org', + ), + ); + } + public function test_get_items_fails_for_url_which_returns_a_non_200_status_code() { // Force HTTP request to remote site to fail. remove_filter( 'pre_http_request', array( $this, 'mock_success_request_to_remote_url' ), 10 ); @@ -233,12 +198,9 @@ public function test_get_items_fails_for_url_which_returns_a_non_200_status_code $response = rest_get_server()->dispatch( $request ); $data = $response->get_data(); - $this->assertEquals( 404, $response->get_status() ); + $this->assertSame( 404, $response->get_status() ); - $this->assertEquals( - 'no_response', - $data['code'] - ); + $this->assertSame( 'no_response', $data['code'] ); $this->assertContains( strtolower( 'Not found' ), @@ -262,12 +224,9 @@ public function test_get_items_fails_for_url_which_returns_empty_body_for_succes $response = rest_get_server()->dispatch( $request ); $data = $response->get_data(); - $this->assertEquals( 404, $response->get_status() ); + $this->assertSame( 404, $response->get_status() ); - $this->assertEquals( - 'no_content', - $data['code'] - ); + $this->assertSame( 'no_content', $data['code'] ); $this->assertContains( strtolower( 'Unable to retrieve body from response at this URL' ), @@ -303,7 +262,7 @@ function( $args, $url ) { rest_get_server()->dispatch( $request ); // Check the args were filtered as 
expected. - $this->assertArraySubset( + $this->assertContains( array( 'timeout' => 27, 'limit_response_size' => 153600, @@ -318,14 +277,11 @@ function( $args, $url ) { public function test_will_return_from_cache_if_populated() { $transient_name = 'g_url_details_response_' . md5( static::$url_placeholder ); - remove_filter( - "pre_transient_$transient_name", - '__return_null' - ); + remove_filter( "pre_transient_{$transient_name}", '__return_null' ); // Force cache to return a known value as the remote URL http response body. add_filter( - "pre_transient_$transient_name", + "pre_transient_{$transient_name}", function() { return '<html><head><title>This value from cache.'; } @@ -343,18 +299,12 @@ function() { $data = $response->get_data(); // Data should be that from cache not from mocked network response. - $this->assertContains( - 'This value from cache', - $data['title'] - ); + $this->assertContains( 'This value from cache', $data['title'] ); - remove_all_filters( - "pre_transient_$transient_name" - ); + remove_all_filters( "pre_transient_{$transient_name}" ); } public function test_allows_filtering_data_retrieved_for_a_given_url() { - add_filter( 'rest_prepare_url_details', function( $response ) { @@ -386,28 +336,21 @@ function( $response ) { $response = rest_get_server()->dispatch( $request ); $data = $response->get_data(); - // Instead of the default data retrieved we expect to see the modified - // data we provided via the filter. - $this->assertArraySubset( - array( - 'title' => 'Example Website — - with encoded content.', - 'og_title' => 'This was manually added to the data via filter', - ), - $data - ); + /* + * Instead of the default data retrieved we expect to see the modified + * data we provided via the filter. 
+ */ + $this->assertSame( 'Example Website — - with encoded content.', $data['title'] ); + $this->assertSame( 'This was manually added to the data via filter', $data['og_title'] ); - remove_all_filters( - 'rest_prepare_url_details' - ); + remove_all_filters( 'rest_prepare_url_details' ); } - - - public function test_allows_filtering_response() { - - // Filter the response to known set of values changing only - // based on whether the response came from the cache or not. + /* + * Filter the response to known set of values changing only + * based on whether the response came from the cache or not. + */ add_filter( 'rest_prepare_url_details', function( $response, $url ) { @@ -435,23 +378,16 @@ function( $response, $url ) { $data = $response->get_data(); - $this->assertEquals( - '418', - $data['status'] - ); + $this->assertSame( 418, $data['status'] ); - $this->assertEquals( + $this->assertSame( 'Response for URL https://placeholder-site.com altered via rest_prepare_url_details filter', $data['response'] ); - remove_all_filters( - 'rest_prepare_url_details' - ); + remove_all_filters( 'rest_prepare_url_details' ); } - - public function test_get_item() { } @@ -481,7 +417,7 @@ public function test_get_item_schema() { $endpoint = $data['endpoints'][0]; $this->assertArrayHasKey( 'url', $endpoint['args'] ); - $this->assertArraySubset( + $this->assertContains( array( 'type' => 'string', 'required' => true, @@ -492,7 +428,7 @@ public function test_get_item_schema() { } /** - * @dataProvider provide_get_title_data + * @dataProvider data_get_title */ public function test_get_title( $html, $expected ) { $controller = new WP_REST_URL_Details_Controller(); @@ -505,32 +441,47 @@ public function test_get_title( $html, $expected ) { $this->assertSame( $expected, $actual ); } - - public function provide_get_title_data() { + public function data_get_title() { return array( - 'no attributes' => array( - 'Testing <title>:', - 'Testing <title>:', + + // Happy path for default. 
+ 'default' => array( + 'Testing <title>', + 'Testing', ), 'with attributes' => array( - 'Testing <title>:', - 'Testing <title>:', + 'Testing <title>', + 'Testing', ), 'with text whitespace' => array( - ' Testing <title>: ', - 'Testing <title>:', - ), - 'when opening tag is malformed' => array( - '< title>Testing <title>: when opening tag is invalid', - '', + ' Testing <title> ', + 'Testing', ), 'with whitespace in opening tag' => array( 'Testing <title>: with whitespace in opening tag', - 'Testing <title>: with whitespace in opening tag', + 'Testing : with whitespace in opening tag', ), 'when whitepace in closing tag' => array( 'Testing <title>: with whitespace in closing tag</ title>', - 'Testing <title>: with whitespace in closing tag', + 'Testing : with whitespace in closing tag', + ), + 'with other elements' => array( + '<meta name="viewport" content="width=device-width"> + <title>Testing <title> + ', + 'Testing', + ), + 'multiline' => array( + ' + Testing <title> + ', + 'Testing', + ), + + // Unhappy paths. + 'when opening tag is malformed' => array( + '< title>Testing <title>: when opening tag is invalid', + '', ), ); } @@ -788,19 +739,19 @@ public function data_get_description() { // Happy paths with HTML tags or entities in the description. 'with HTML tags' => array( '', - 'Description: has HTML tags', + 'Description: has HTML tags', ), 'with content first and HTML tags' => array( '', - 'Description: has HTML tags', + 'Description: has HTML tags', ), 'with HTML tags and other attributes' => array( 'description meta & its attribute value', + 'The description meta & its attribute value', ), // Unhappy paths. @@ -978,42 +929,6 @@ public function data_get_image() { ); } - - - - - public function provide_invalid_url_data() { - return array( - 'empty_url' => array( - null, - '', - ), // empty! 
- 'not_a_string' => array( - null, - 1234456, - ), - 'string_but_invalid' => array( - null, - 'invalid.proto://wordpress.org', - ), - ); - } - - public function provide_response_is_from_cache() { - return array( - 'uncached_response' => array( - null, - false, - ), // empty! - 'cached_response' => array( - null, - true, - ), - ); - } - - - /** * Mocks the HTTP response for the the `wp_safe_remote_get()` which * would otherwise make a call to a real website. @@ -1033,7 +948,6 @@ public function mock_request_to_remote_url_with_empty_body_response( $response, } private function mock_request_to_remote_url( $result_type = 'success', $args ) { - static::$request_args = $args; $types = array( From 1a0f16d5d64907d63a37524738e95261592dfe4c Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Wed, 26 May 2021 15:32:02 +0100 Subject: [PATCH 27/34] Update lib/class-wp-rest-url-details-controller.php Co-authored-by: Tonya Mork --- lib/class-wp-rest-url-details-controller.php | 21 -------------------- 1 file changed, 21 deletions(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index 0a345d7586fc60..4ee4f748988a62 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -373,27 +373,6 @@ private function set_cache( $key, $data = '' ) { return set_transient( $key, $data, $cache_expiration ); } - /** - * Picks a random user agent string from a list of common defaults. - * By default WordPress HTTP functions uses a semi-static string and - * this maybe rejected by many websites. - * - * See: https://core.trac.wordpress.org/browser/tags/5.7.1/src/wp-includes/class-http.php#L191. - * - * @return string the user agent string. - */ - private function get_random_user_agent() { - $agents = array( - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246', // Windows 10-based PC using Edge browser. 
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9', // Mac OS X-based computer using a Safari browser. - 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1', // Linux-based PC using a Firefox browser. - ); - - $chose = rand( 0, count( $agents ) - 1 ); - - return $agents[ $chose ]; - } - /** * Retrieves the `` section. * From b20b011dfbf400dd03d91059e7361ae086362ef8 Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Wed, 26 May 2021 15:32:14 +0100 Subject: [PATCH 28/34] Update lib/class-wp-rest-url-details-controller.php Co-authored-by: Tonya Mork --- lib/class-wp-rest-url-details-controller.php | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index 4ee4f748988a62..a6b771edcd2961 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -173,7 +173,6 @@ public function permissions_check() { private function get_remote_url( $url ) { $args = array( 'limit_response_size' => 150 * KB_IN_BYTES, - 'user-agent' => $this->get_random_user_agent(), ); /** From df7dd5a67813fead147ae4f41281a3d404b10f55 Mon Sep 17 00:00:00 2001 From: Dave Smith Date: Wed, 26 May 2021 15:32:23 +0100 Subject: [PATCH 29/34] Update lib/class-wp-rest-url-details-controller.php Co-authored-by: Tonya Mork --- lib/class-wp-rest-url-details-controller.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index a6b771edcd2961..afe97e0eca9e30 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -168,7 +168,7 @@ public function permissions_check() { * Retrieves the document title from a remote URL. * * @param string $url The website url whose HTML we want to access. 
- * @return array|WP_Error the HTTP response from the remote URL, or an error. + * @return string|WP_Error The HTTP response from the remote URL, or an error. */ private function get_remote_url( $url ) { $args = array( From 1618f199ed6355ded263752b30f77b6111e14db9 Mon Sep 17 00:00:00 2001 From: Tonya Mork Date: Thu, 27 May 2021 09:49:48 -0500 Subject: [PATCH 30/34] Icon: if data url, skip relative-to-absolute conversion (#32276) --- lib/class-wp-rest-url-details-controller.php | 6 ++++++ phpunit/class-wp-rest-url-details-controller-test.php | 8 ++++++++ 2 files changed, 14 insertions(+) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index afe97e0eca9e30..ee9c0410f3b671 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -247,6 +247,12 @@ private function get_icon( $html, $url ) { return ''; } + // If the icon is a data URL, return it. + $parsed_icon = parse_url( $icon ); + if ( 'data' === $parsed_icon['scheme'] ) { + return $icon; + } + // Attempt to convert relative URLs to absolute. $parsed_url = parse_url( $url ); $root_url = $parsed_url['scheme'] . '://' . $parsed_url['host'] . '/'; diff --git a/phpunit/class-wp-rest-url-details-controller-test.php b/phpunit/class-wp-rest-url-details-controller-test.php index 5e08ca02a9e138..8b9ba9ccde04de 100644 --- a/phpunit/class-wp-rest-url-details-controller-test.php +++ b/phpunit/class-wp-rest-url-details-controller-test.php @@ -604,6 +604,14 @@ public function data_get_icon() { />', 'https://wordpress.org/favicon.ico', ), + 'with data URL x-icon type' => array( + '', + 'data:image/x-icon;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQEAYAAABPYyMiAAAABmJLR0T///////8JWPfcAAAACXBIWXMAAABIAAAASABGyWs+AAAAF0lEQVRIx2NgGAWjYBSMglEwCkbBSAcACBAAAeaR9cIAAAAASUVORK5CYII="', + ), + 'with data URL png type' => array( + '', + 'data:image/png;base64,iVBORw0KGgo=', + ), // Unhappy paths. 
'empty rel' => array( From 4d33f54015ece4a80ee31030a51fc536061f3087 Mon Sep 17 00:00:00 2001 From: Tonya Mork Date: Thu, 27 May 2021 11:01:00 -0500 Subject: [PATCH 31/34] Fix failing test due to extra character in expected string. --- phpunit/class-wp-rest-url-details-controller-test.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phpunit/class-wp-rest-url-details-controller-test.php b/phpunit/class-wp-rest-url-details-controller-test.php index 8b9ba9ccde04de..c5d7fbbe40ad23 100644 --- a/phpunit/class-wp-rest-url-details-controller-test.php +++ b/phpunit/class-wp-rest-url-details-controller-test.php @@ -606,7 +606,7 @@ public function data_get_icon() { ), 'with data URL x-icon type' => array( '', - 'data:image/x-icon;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQEAYAAABPYyMiAAAABmJLR0T///////8JWPfcAAAACXBIWXMAAABIAAAASABGyWs+AAAAF0lEQVRIx2NgGAWjYBSMglEwCkbBSAcACBAAAeaR9cIAAAAASUVORK5CYII="', + 'data:image/x-icon;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQEAYAAABPYyMiAAAABmJLR0T///////8JWPfcAAAACXBIWXMAAABIAAAASABGyWs+AAAAF0lEQVRIx2NgGAWjYBSMglEwCkbBSAcACBAAAeaR9cIAAAAASUVORK5CYII=', ), 'with data URL png type' => array( '', From a0107f13124a760d0d276cf61b711986660a3a17 Mon Sep 17 00:00:00 2001 From: Tonya Mork Date: Fri, 28 May 2021 09:31:49 -0500 Subject: [PATCH 32/34] Updates schema for new data items. 
--- lib/class-wp-rest-url-details-controller.php | 22 ++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index ee9c0410f3b671..0a9dc74efd78dc 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -66,8 +66,26 @@ public function get_item_schema() { 'title' => 'url-details', 'type' => 'object', 'properties' => array( - 'title' => array( - 'description' => __( 'The contents of the tag from the URL.', 'gutenberg' ), + 'title' => array( + 'description' => __( 'The contents of the <title> element from the URL.', 'gutenberg' ), + 'type' => 'string', + 'context' => array( 'view', 'edit', 'embed' ), + 'readonly' => true, + ), + 'icon' => array( + 'description' => __( 'The favicon image link of the <link rel="icon"> element from the URL.', 'gutenberg' ), + 'type' => 'string', + 'context' => array( 'view', 'edit', 'embed' ), + 'readonly' => true, + ), + 'description' => array( + 'description' => __( 'The content of the <meta name="description"> element from the URL.', 'gutenberg' ), + 'type' => 'string', + 'context' => array( 'view', 'edit', 'embed' ), + 'readonly' => true, + ), + 'image' => array( + 'description' => __( 'The OG image link of the <meta property="og:image"> or <meta property="og:image:url"> element from the URL.', 'gutenberg' ), 'type' => 'string', 'context' => array( 'view', 'edit', 'embed' ), 'readonly' => true, From b013a82031c326bffe0b2676d2f21798b0eb2cfb Mon Sep 17 00:00:00 2001 From: Tonya Mork <hello@hellofromtonya.com> Date: Fri, 28 May 2021 10:37:25 -0500 Subject: [PATCH 33/34] Changes icon and image type to uri. 
--- lib/class-wp-rest-url-details-controller.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index 0a9dc74efd78dc..707b6c82558071 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -74,7 +74,7 @@ public function get_item_schema() { ), 'icon' => array( 'description' => __( 'The favicon image link of the <link rel="icon"> element from the URL.', 'gutenberg' ), - 'type' => 'string', + 'type' => 'uri', 'context' => array( 'view', 'edit', 'embed' ), 'readonly' => true, ), @@ -86,7 +86,7 @@ public function get_item_schema() { ), 'image' => array( 'description' => __( 'The OG image link of the <meta property="og:image"> or <meta property="og:image:url"> element from the URL.', 'gutenberg' ), - 'type' => 'string', + 'type' => 'uri', 'context' => array( 'view', 'edit', 'embed' ), 'readonly' => true, ), From 0097a7407c4cf3dfec7d94f476b0a6c36844714c Mon Sep 17 00:00:00 2001 From: Tonya Mork <hello@hellofromtonya.com> Date: Fri, 28 May 2021 11:31:20 -0500 Subject: [PATCH 34/34] Schema: icon & image: reverts type back to string and adds format of uri. 
--- lib/class-wp-rest-url-details-controller.php | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/class-wp-rest-url-details-controller.php b/lib/class-wp-rest-url-details-controller.php index 707b6c82558071..ee7c1b23a05c71 100644 --- a/lib/class-wp-rest-url-details-controller.php +++ b/lib/class-wp-rest-url-details-controller.php @@ -74,7 +74,8 @@ public function get_item_schema() { ), 'icon' => array( 'description' => __( 'The favicon image link of the <link rel="icon"> element from the URL.', 'gutenberg' ), - 'type' => 'uri', + 'type' => 'string', + 'format' => 'uri', 'context' => array( 'view', 'edit', 'embed' ), 'readonly' => true, ), @@ -86,7 +87,8 @@ public function get_item_schema() { ), 'image' => array( 'description' => __( 'The OG image link of the <meta property="og:image"> or <meta property="og:image:url"> element from the URL.', 'gutenberg' ), - 'type' => 'uri', + 'type' => 'string', + 'format' => 'uri', 'context' => array( 'view', 'edit', 'embed' ), 'readonly' => true, ),