Skip to content

Commit

Permalink
test: link-check: misc fixes (#1169)
Browse files Browse the repository at this point in the history
* test: fix link-check diffs

Fixes #1148

* test: link-check: fix file paths, add comments

* test: link-check: use git pathspec exclusions

* test: link-check: more path safety and comments

* test: link-check: misc tidy

* test: link-check: fix git diff multi-errors

* test: link-check: fix mac sed

* test: link-check: diff: whitelist rather than blacklist

As with `link-check-git-all.sh`, only include `md` & `js`
rather than include all except specified files

* link-check: add more file extensions

* link-check: add and sort exclusions

* link-check: add *.css

* link-check: re-exclude redirects-list.json
  • Loading branch information
casperdcl authored Apr 26, 2020
1 parent dbd4017 commit 4b2868c
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 17 deletions.
16 changes: 11 additions & 5 deletions scripts/exclude-links.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
http://127.0.0.1:10000/devstoreaccount1;
http://localhost:3000/
http://localhost:8000/
http://millionsongdataset.com/pages/getting-dataset/
http://ogp.me/ns
http://s3-external-1.amazonaws.com/bucket/path
http://[email protected]/path
http://www.reddit.com/r/MachineLearning
https://$
http://s3-external-1.amazonaws.com/bucket/path
https://accounts.google.com/o/oauth2/auth
https://api.cloudflare.com/client/v4/zones/$
https://api.github.com/repos/$
Expand All @@ -16,11 +15,14 @@ https://code.dvc.org/foo/bar
https://data.dvc.org/foo/bar
https://dataversioncontrol.com/some-random
https://discuss.$
https://discuss.dvc.org/some-random
https://discuss.dataversioncontrol.com/some-random
https://discuss.dvc.org/some-random
https://drive.google.com/drive/folders/0AIac4JZqHhKmUk9PDA
https://dvc.org$
https://dvc.org/blog/$1
https://dvc.org/blog$1
https://dvc.org/blog/some-random
https://dvc.org/doc/command-reference$1
https://dvc.org/doc/command-reference/foo
https://dvc.org/foo
https://dvc.org/foo/bar?baz
Expand Down Expand Up @@ -49,17 +51,21 @@ https://remote.dvc.org/dataset-registry/a3/04af...
https://remote.dvc.org/dataset-registry/a3/04afb96060aad90176268345e10355
https://remote.dvc.org/foo/bar
https://remote.dvc.org/get-started
https://s3.eu.cloud-object-storage.appdomain.cloud
https://s3-us-east-2.amazonaws.com/dvc-public/$1/$2
https://s3-us-east-2.amazonaws.com/dvc-public/code/foo/bar
https://s3-us-east-2.amazonaws.com/dvc-public/data/foo/bar
https://s3-us-east-2.amazonaws.com/dvc-public/remote/foo/bar
https://s3-us-east-2.amazonaws.com/dvc-s3-repo/
https://s3-us-east-2.amazonaws.com/dvc-s3-repo/$1
https://s3-us-east-2.amazonaws.com/dvc-s3-repo/deb/foo
https://s3-us-east-2.amazonaws.com/dvc-s3-repo/rpm/foo
https://s3.eu.cloud-object-storage.appdomain.cloud
https://sweedom.us10.list-manage.com/subscribe/post?u=a08bf93caae4063c4e6a351f6&id=24c0ecc49a
https://www.dataversioncontrol.com/some-random
https://www.dvc.org/foo
https://www.kaggle.com/rtatman/kerneld4769833fe
https://www.meetup.com/San-Francisco-Machine-Learning-Meetup/events/264846847/
https://www.reddit.com/r/MachineLearning/comments/bx0apm/d_how_do_you_manage_your_machine_learning/
https://www.youtube.com/embed/$
http://[email protected]/path
http://www.reddit.com/r/MachineLearning
11 changes: 9 additions & 2 deletions scripts/link-check-git-all.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
#!/usr/bin/env bash
# Runs link-check.sh over source files found in the repository, one file per
# invocation with up to 8 invocations in parallel (xargs -n1 -P8).
# NOTE(review): this listing is a diff rendering without +/- markers, so two
# versions of the find | xargs pipeline appear one after the other below.
set -euo pipefail

# Repository root: the parent of the directory that contains this script.
repo="$(dirname "$(realpath "$(dirname "$0")")")"
# Work from the repository root so the relative paths further down resolve.
pushd "$repo"

# Variant 1: absolute "$repo"/ paths, *.md and *.js only, and an unquoted
# $(dirname "$0") in the xargs command. Presumably the pre-change side of the
# hunk (its header reads -1,6 +1,13) -- confirm against the repository.
(find "$repo"/.github/ "$repo"/content/docs/ "$repo"/src/ -name '*.md' -o -name '*.js' && ls "$repo"/*.md "$repo"/*.js) \
| xargs -n1 -P8 $(dirname "$0")/link-check.sh
# can't do git ls-files since some may be untracked
# Variant 2: paths relative to the repository root, a wider extension list and
# a quoted script directory.
# NOTE(review): `ls` exits non-zero when one of its globs matches nothing; with
# `set -o pipefail` that presumably fails this pipeline -- verify.
# NOTE(review): $(dirname "$0") is evaluated after pushd, so a relative $0 is
# resolved against the repository root -- confirm how the script is invoked.
(find .github/ content/docs/ src/ \
-name '*.css' -o -name '*.js' -o -name '*.jsx' -o -name '*.md' -o -name '*.tsx' -o \
-name '*.ts' -o -name '*.json' && ls *.js *.jsx *.md *.tsx *.ts *.json) \
| xargs -n1 -P8 "$(dirname "$0")"/link-check.sh

# Restore the caller's working directory.
popd
15 changes: 11 additions & 4 deletions scripts/link-check-git-diff.sh
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
#!/usr/bin/env bash
# Runs link-check.sh on the changes between the merge base of HEAD and
# origin/master and the current working tree.
# NOTE(review): this listing is a diff rendering without +/- markers; several
# statements appear in two versions back to back, and the two `while` headers
# below share a single `done`, so the text is not runnable exactly as shown.
set -euo pipefail

# Variant 1 only: exclusion-list path, referenced solely by the `grep -v` loop
# header further down.
exclude="${CHECK_LINKS_EXCLUDE_LIST:-$(dirname $0)/exclude-links.txt}"
# Repository root: the parent of the directory that contains this script.
repo="$(dirname "$(realpath "$(dirname "$0")")")"
pushd "$repo"

# Diff command kept as a string; it is expanded unquoted later so that it
# word-splits into a command plus its arguments.
differ="git diff $(git merge-base HEAD origin/master)"
# Variant 1: only builds a command string; nothing is executed on this line.
changed="$differ --name-only :^redirects-list.json"
# Variant 2: executes the diff and stores the changed file names, limited to
# the listed extensions and excluding redirects-list.json via ':!'.
changed="$($differ --name-only -- '*.css' '*.js' '*.jsx' '*.md' '*.tsx' '*.ts' '*.json' ':!redirects-list.json')"

# Nothing to check: leave with success.
[ -z "$changed" ] && exit 0

# Variant 1 loop header: drops names matching the exclusion list's basename.
echo "$changed" | grep -v "$(basename "$exclude")" | while read -r file ; do
# Variant 2 loop header: iterates over every changed file name.
echo "$changed" | while read -r file ; do
# check whole file
# "$(dirname "$0")"/link-check.sh "$file"
# check just changed lines
echo -n "$file:"
# Both invocations hand link-check.sh a process substitution holding the
# zero-context diff of "$file" filtered through grep '^\+' (presumably to keep
# only added lines); they differ only in the quoting of the script directory.
$(dirname "$0")/link-check.sh <($differ -U0 -- "$file" | grep '^\+')
"$(dirname "$0")"/link-check.sh <($differ -U0 -- "$file" | grep '^\+')
done

popd
13 changes: 7 additions & 6 deletions scripts/link-check.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,21 @@
set -euo pipefail

# Prefix put in front of root-relative links; override with
# CHECK_LINKS_RELATIVE_URL.
base_url="${CHECK_LINKS_RELATIVE_URL:-https://dvc.org}"
# Exclusion list: taken from CHECK_LINKS_EXCLUDE_LIST, defaulting to
# exclude-links.txt next to this script. When the value names an existing file
# it is replaced by that file's contents; otherwise it is kept as given.
# NOTE(review): diff rendering without +/- markers -- the next four lines are
# two versions of the same two statements, differing only in quoting.
exclude="${CHECK_LINKS_EXCLUDE_LIST:-$(dirname $0)/exclude-links.txt}"
[ -f "$exclude" ] && exclude="$(cat $exclude)"
exclude="${CHECK_LINKS_EXCLUDE_LIST:-$(dirname "$0")/exclude-links.txt}"
[ -f "$exclude" ] && exclude="$(cat "$exclude")"
# User-agent string; not referenced in the visible part of this file --
# presumably sent by the HTTP client in checker(), confirm.
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:74.0) Gecko/20100101 Firefox/74.0"


finder(){ # expects list of files
# Prints the links found in the given files, one per line: absolute http(s)
# URLs as written, and root-relative links prefixed with ${base_url}.
# NOTE(review): diff rendering without +/- markers -- each extraction step
# below appears twice: first reading "$@" directly, then reading "$content".
content="$(cat "$@")" # read once (could be file descriptors)
# explicit links not in markdown
# (http/https URLs not directly preceded by "](", running up to whitespace or
# one of < > { } " ' `)
pcregrep -o '(?<!\]\()https?://[^\s<>{}"'"'"'`]+' "$@"
echo "$content" | pcregrep -o '(?<!\]\()https?://[^\s<>{}"'"'"'`]+'
# explicit links in markdown
# (stage 1 matches "(http...)" right after "]"; stage 2 uses a recursive
# pattern to keep a balanced-parenthesis group; stage 3 strips the outer
# parentheses)
pcregrep -o '(?<=\])\(https?://[^[\]\s]+\)' "$@" | pcregrep -o '\((?:[^)(]*(?R)?)*+\)' | pcregrep -o '(?<=\().*(?=\))'
echo "$content" | pcregrep -o '(?<=\])\(https?://[^[\]\s]+\)' | pcregrep -o '\((?:[^)(]*(?R)?)*+\)' | pcregrep -o '(?<=\().*(?=\))'
# relative links in markdown
# (path starting with "/" right after "](", up to ")" or whitespace; the
# substitution replaces the whole line, so at most one link per line is
# printed -- presumably the last one, given the greedy leading ".*")
# xargs -II substitutes each extracted path for the letter "I" in
# ${base_url}I; any capital "I" inside ${base_url} itself would presumably be
# replaced as well -- verify before changing the default.
sed -nE 's/.*]\((\/[^)[:space:]]+).*/\1/p' "$@" | xargs -n1 -II echo ${base_url}I
echo "$content" | sed -nE 's/.*]\((\/[^)[:space:]]+).*/\1/p' | xargs -n1 -II echo ${base_url}I
# relative links in html
# (href value starting with "/", quoted with " or '; the two versions differ
# in "+?" versus "+" after the bracket expression as well as in input source)
sed -nE 's/.*href=["'"'"'](\/[^"'"'"']+?)["'"'"'].*/\1/p' "$@" | xargs -n1 -II echo ${base_url}I
echo "$content" | sed -nE 's/.*href=["'"'"'](\/[^"'"'"']+)["'"'"'].*/\1/p' | xargs -n1 -II echo ${base_url}I
}

checker(){ # expects list of urls
Expand Down

0 comments on commit 4b2868c

Please sign in to comment.