From c1826bd541df5975a50c8ec13e0213a15482a3f3 Mon Sep 17 00:00:00 2001 From: Filip Hracek Date: Fri, 16 Nov 2018 09:43:54 -0800 Subject: [PATCH] Fix checking of anchors with non-ASCII chars --- lib/src/destination.dart | 4 +++- lib/src/parsers/html.dart | 21 ++++++++++++++++++- test/case13/index.html | 10 +++++++++ .../with-non-percent-encoded-anchor.html | 10 +++++++++ test/e2e_test.dart | 10 +++++++++ 5 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 test/case13/index.html create mode 100644 test/case13/with-non-percent-encoded-anchor.html diff --git a/lib/src/destination.dart b/lib/src/destination.dart index 0a0e16d..bba5cc8 100644 --- a/lib/src/destination.dart +++ b/lib/src/destination.dart @@ -2,6 +2,8 @@ library linkcheck.destination; import 'dart:io' show ContentType, HttpClientResponse, RedirectInfo; +import 'package:linkcheck/src/parsers/html.dart'; + /// RegExp for detecting URI scheme, such as `http:`, `mailto:`, etc. final _scheme = new RegExp(r"$(\w[\w\-]*\w):"); @@ -222,7 +224,7 @@ class Destination { bool satisfiesFragment(String fragment) { if (fragment == null || fragment == '') return true; if (anchors == null) return false; - return anchors.contains(Uri.decodeComponent(fragment)); + return anchors.contains(normalizeAnchor(fragment)); } Map toMap() => { diff --git a/lib/src/parsers/html.dart b/lib/src/parsers/html.dart index d353706..7709b89 100644 --- a/lib/src/parsers/html.dart +++ b/lib/src/parsers/html.dart @@ -67,6 +67,25 @@ Link extractLink( return new Link(origin, destination, destinationUri.fragment); } +/// Takes an anchor (`id` or `name` attribute of an HTML element, or +/// a fragment of a link) and normalizes it. +/// +/// Anchors that can be percent-decoded, will. ("Hr%C3%A1%C4%8Dek" will +/// become "Hráček".) Others will be kept the same. ("Hráček" will stay +/// "Hráček".) 
+String normalizeAnchor(String anchor) { + String decoded; + try { + decoded = Uri.decodeComponent(anchor); + } on ArgumentError { + // TODO: Report or handle ids and attributes that are not + // percent-decodable (they were not percent-encoded and they + // contain an invalid character). + decoded = anchor; + } + return decoded; +} + FetchResults parseHtml(String content, Uri uri, Destination current, DestinationResult checked, bool ignoreLinks) { var doc = parse(content, generateSpans: true, sourceUrl: uri.toString()); @@ -79,7 +98,7 @@ FetchResults parseHtml(String content, Uri uri, Destination current, var anchors = doc .querySelectorAll("body [id], body [name]") .map((element) => element.attributes["id"] ?? element.attributes["name"]) - .map((fragment) => Uri.decodeComponent(fragment)) + .map(normalizeAnchor) .toList(); checked.anchors = anchors; diff --git a/test/case13/index.html b/test/case13/index.html new file mode 100644 index 0000000..ffe6e56 --- /dev/null +++ b/test/case13/index.html @@ -0,0 +1,10 @@ + + + + + + + + APL + + diff --git a/test/case13/with-non-percent-encoded-anchor.html b/test/case13/with-non-percent-encoded-anchor.html new file mode 100644 index 0000000..5be9ba4 --- /dev/null +++ b/test/case13/with-non-percent-encoded-anchor.html @@ -0,0 +1,10 @@ + + + + + + + +

RMS Berättar

+ + diff --git a/test/e2e_test.dart b/test/e2e_test.dart index 9d0354b..4916b9c 100644 --- a/test/e2e_test.dart +++ b/test/e2e_test.dart @@ -186,6 +186,16 @@ void main() { await server.destroy(); } }); + + test("fragment checking works with non-percent-encoded anchors", () async { + var server = await Dhttpd.start(path: getServingPath(13), port: port); + try { + int result = await run([":$port"], out); + expect(result, 0); + } finally { + await server.destroy(); + } + }); }, tags: ["integration"]); }