From 4b2868c3ef19817201c782f621041fb6e3abf8f5 Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Sun, 26 Apr 2020 16:21:50 +0100 Subject: [PATCH] test: link-check: misc fixes (#1169) * test: fix link-check diffs Fixes #1148 * test: link-check: fix file paths, add comments * test: link-check: use git pathspec exclusions * test: link-check: more path safety and comments * test: link-check: misc tidy * test: link-check: fix git diff multi-errors * test: link-check: fix mac sed * test: link-check: diff: whitelist rather than blacklist As with `link-check-git-all.sh`, only include `md` & `js` rather than include all except specified files * link-check: add more file extensions * link-check: add and sort exclusions * link-check: add *.css * link-check: re-exclude redirects-list.json --- scripts/exclude-links.txt | 16 +++++++++++----- scripts/link-check-git-all.sh | 11 +++++++++-- scripts/link-check-git-diff.sh | 15 +++++++++++---- scripts/link-check.sh | 13 +++++++------ 4 files changed, 38 insertions(+), 17 deletions(-) diff --git a/scripts/exclude-links.txt b/scripts/exclude-links.txt index 4581fff123..4943c165bf 100644 --- a/scripts/exclude-links.txt +++ b/scripts/exclude-links.txt @@ -1,11 +1,10 @@ http://127.0.0.1:10000/devstoreaccount1; http://localhost:3000/ +http://localhost:8000/ http://millionsongdataset.com/pages/getting-dataset/ http://ogp.me/ns -http://s3-external-1.amazonaws.com/bucket/path -http://user@example.com/path -http://www.reddit.com/r/MachineLearning https://$ +http://s3-external-1.amazonaws.com/bucket/path https://accounts.google.com/o/oauth2/auth https://api.cloudflare.com/client/v4/zones/$ https://api.github.com/repos/$ @@ -16,11 +15,14 @@ https://code.dvc.org/foo/bar https://data.dvc.org/foo/bar https://dataversioncontrol.com/some-random https://discuss.$ -https://discuss.dvc.org/some-random https://discuss.dataversioncontrol.com/some-random +https://discuss.dvc.org/some-random https://drive.google.com/drive/folders/0AIac4JZqHhKmUk9PDA 
https://dvc.org$ +https://dvc.org/blog/$1 +https://dvc.org/blog$1 https://dvc.org/blog/some-random +https://dvc.org/doc/command-reference$1 https://dvc.org/doc/command-reference/foo https://dvc.org/foo https://dvc.org/foo/bar?baz @@ -49,13 +51,15 @@ https://remote.dvc.org/dataset-registry/a3/04af... https://remote.dvc.org/dataset-registry/a3/04afb96060aad90176268345e10355 https://remote.dvc.org/foo/bar https://remote.dvc.org/get-started +https://s3.eu.cloud-object-storage.appdomain.cloud +https://s3-us-east-2.amazonaws.com/dvc-public/$1/$2 https://s3-us-east-2.amazonaws.com/dvc-public/code/foo/bar https://s3-us-east-2.amazonaws.com/dvc-public/data/foo/bar https://s3-us-east-2.amazonaws.com/dvc-public/remote/foo/bar https://s3-us-east-2.amazonaws.com/dvc-s3-repo/ +https://s3-us-east-2.amazonaws.com/dvc-s3-repo/$1 https://s3-us-east-2.amazonaws.com/dvc-s3-repo/deb/foo https://s3-us-east-2.amazonaws.com/dvc-s3-repo/rpm/foo -https://s3.eu.cloud-object-storage.appdomain.cloud https://sweedom.us10.list-manage.com/subscribe/post?u=a08bf93caae4063c4e6a351f6&id=24c0ecc49a https://www.dataversioncontrol.com/some-random https://www.dvc.org/foo @@ -63,3 +67,5 @@ https://www.kaggle.com/rtatman/kerneld4769833fe https://www.meetup.com/San-Francisco-Machine-Learning-Meetup/events/264846847/ https://www.reddit.com/r/MachineLearning/comments/bx0apm/d_how_do_you_manage_your_machine_learning/ https://www.youtube.com/embed/$ +http://user@example.com/path +http://www.reddit.com/r/MachineLearning diff --git a/scripts/link-check-git-all.sh b/scripts/link-check-git-all.sh index 27c78fea74..3505ac51d7 100755 --- a/scripts/link-check-git-all.sh +++ b/scripts/link-check-git-all.sh @@ -1,6 +1,13 @@ #!/usr/bin/env bash +set -euo pipefail repo="$(dirname "$(realpath "$(dirname "$0")")")" +pushd "$repo" -(find "$repo"/.github/ "$repo"/content/docs/ "$repo"/src/ -name '*.md' -o -name '*.js' && ls "$repo"/*.md "$repo"/*.js) \ - | xargs -n1 -P8 $(dirname "$0")/link-check.sh +# can't do git ls-files 
since some may be untracked +(find .github/ content/docs/ src/ \ + -name '*.css' -o -name '*.js' -o -name '*.jsx' -o -name '*.md' -o -name '*.tsx' -o \ + -name '*.ts' -o -name '*.json' && ls *.js *.jsx *.md *.tsx *.ts *.json) \ + | xargs -n1 -P8 "$(dirname "$0")"/link-check.sh + +popd diff --git a/scripts/link-check-git-diff.sh b/scripts/link-check-git-diff.sh index 1356a762ca..bbbef2960f 100755 --- a/scripts/link-check-git-diff.sh +++ b/scripts/link-check-git-diff.sh @@ -1,13 +1,20 @@ #!/usr/bin/env bash set -euo pipefail -exclude="${CHECK_LINKS_EXCLUDE_LIST:-$(dirname $0)/exclude-links.txt}" +repo="$(dirname "$(realpath "$(dirname "$0")")")" +pushd "$repo" + differ="git diff $(git merge-base HEAD origin/master)" -changed="$differ --name-only :^redirects-list.json" +changed="$($differ --name-only -- '*.css' '*.js' '*.jsx' '*.md' '*.tsx' '*.ts' '*.json' ':!redirects-list.json')" [ -z "$changed" ] && exit 0 -echo "$changed" | grep -v "$(basename "$exclude")" | while read -r file ; do +echo "$changed" | while read -r file ; do + # check whole file + # "$(dirname "$0")"/link-check.sh "$file" + # check just changed lines echo -n "$file:" - $(dirname "$0")/link-check.sh <($differ -U0 -- "$file" | grep '^\+') + "$(dirname "$0")"/link-check.sh <($differ -U0 -- "$file" | grep '^\+') done + +popd diff --git a/scripts/link-check.sh b/scripts/link-check.sh index 6c4665e5e7..f10c20793f 100755 --- a/scripts/link-check.sh +++ b/scripts/link-check.sh @@ -7,20 +7,21 @@ set -euo pipefail base_url="${CHECK_LINKS_RELATIVE_URL:-https://dvc.org}" -exclude="${CHECK_LINKS_EXCLUDE_LIST:-$(dirname $0)/exclude-links.txt}" -[ -f "$exclude" ] && exclude="$(cat $exclude)" +exclude="${CHECK_LINKS_EXCLUDE_LIST:-$(dirname "$0")/exclude-links.txt}" +[ -f "$exclude" ] && exclude="$(cat "$exclude")" user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:74.0) Gecko/20100101 Firefox/74.0" finder(){ # expects list of files + content="$(cat "$@")" # read once (could be file descriptors) # 
explicit links not in markdown - pcregrep -o '(?<!\]\()https?://[^\s<>{}"'"'"'`]+' "$@" + echo "$content" | pcregrep -o '(?<!\]\()https?://[^\s<>{}"'"'"'`]+' # explicit links in markdown - pcregrep -o '(?<=\])\(https?://[^[\]\s]+\)' "$@" | pcregrep -o '\((?:[^)(]*(?R)?)*+\)' | pcregrep -o '(?<=\().*(?=\))' + echo "$content" | pcregrep -o '(?<=\])\(https?://[^[\]\s]+\)' | pcregrep -o '\((?:[^)(]*(?R)?)*+\)' | pcregrep -o '(?<=\().*(?=\))' # relative links in markdown - sed -nE 's/.*]\((\/[^)[:space:]]+).*/\1/p' "$@" | xargs -n1 -II echo ${base_url}I + echo "$content" | sed -nE 's/.*]\((\/[^)[:space:]]+).*/\1/p' | xargs -n1 -II echo ${base_url}I # relative links in html - sed -nE 's/.*href=["'"'"'](\/[^"'"'"']+?)["'"'"'].*/\1/p' "$@" | xargs -n1 -II echo ${base_url}I + echo "$content" | sed -nE 's/.*href=["'"'"'](\/[^"'"'"']+)["'"'"'].*/\1/p' | xargs -n1 -II echo ${base_url}I } checker(){ # expects list of urls