diff --git a/DESCRIPTION b/DESCRIPTION index 7281c28..4eb66bd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -37,7 +37,7 @@ Suggests: knitr, rmarkdown, dplyr, - testthat, + testthat (>= 3.0.0), covr, curl Depends: @@ -45,3 +45,4 @@ Depends: VignetteBuilder: knitr RoxygenNote: 7.2.3 Encoding: UTF-8 +Config/testthat/edition: 3 diff --git a/tests/testthat.R b/tests/testthat.R index f7b791a..7c368b8 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,4 +1,13 @@ +# This file is part of the standard setup for testthat. +# It is recommended that you do not modify it. +# +# Where should you do additional test configuration? +# Learn more about the roles of various files in: +# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview +# * https://testthat.r-lib.org/articles/special-files.html + library(testthat) -if(curl::has_internet()){ +library(robotstxt) +if (curl::has_internet()) { test_check("robotstxt") } diff --git a/tests/testthat/_snaps/http_event_handling.md b/tests/testthat/_snaps/http_event_handling.md new file mode 100644 index 0000000..3ac3ba5 --- /dev/null +++ b/tests/testthat/_snaps/http_event_handling.md @@ -0,0 +1,132 @@ +# non www redirects are handled non silently + + Code + domain_change <- readRDS(system.file("http_requests/http_domain_change.rds", + package = "robotstxt")) + suppressMessages(get_robotstxt("http://google.com", rt_robotstxt_http_getter = function( + ...) { + domain_change + }, warn = TRUE)) + Condition + Warning in `request_handler_handler()`: + Event: on_file_type_mismatch + Warning in `request_handler_handler()`: + Event: on_suspect_content + Output + [robots.txt] + -------------------------------------- + + # robots.txt overwrite by: on_suspect_content + + User-agent: * + Allow: / + + + + [events] + -------------------------------------- + + requested: www.petermeissner.de + downloaded: https://petermeissner.de/ + + $on_redirect + $on_redirect[[1]] + $on_redirect[[1]]$status + [1] 301 + + $on_redirect[[1]]$location + [1] "https://www.petermeissner.de/" + + + $on_redirect[[2]] + $on_redirect[[2]]$status + [1] 301 + + $on_redirect[[2]]$location + [1] "https://petermeissner.de/" + + + $on_redirect[[3]] + $on_redirect[[3]]$status + [1] 200 + + $on_redirect[[3]]$location + NULL + + + + $on_file_type_mismatch + $on_file_type_mismatch$content_type + [1] "text/html" + + + $on_suspect_content + $on_suspect_content$parsable + [1] FALSE + + $on_suspect_content$content_suspect + [1] TRUE + + + [attributes] + -------------------------------------- + + problems, cached, request, class + +# client error + + Code + http_client_error <- readRDS(system.file("http_requests/http_client_error.rds", + package = "robotstxt")) + suppressMessages(get_robotstxt("httpbin.org", rt_robotstxt_http_getter = function( + ...) { + http_client_error + })) + Condition + Warning in `request_handler_handler()`: + Event: on_client_error + Warning in `request_handler_handler()`: + Event: on_file_type_mismatch + Output + [robots.txt] + -------------------------------------- + + # robots.txt overwrite by: on_client_error + + User-agent: * + Allow: / + + + + [events] + -------------------------------------- + + requested: https://httpbin.org/status/400 + downloaded: https://httpbin.org/status/400 + + $on_client_error + $on_client_error$status_code + [1] 400 + + + $on_file_type_mismatch + $on_file_type_mismatch$content_type + [1] "text/html; charset=utf-8" + + + [attributes] + -------------------------------------- + + problems, cached, request, class + +# server error + + Code + res <- suppressMessages(get_robotstxt("httpbin.org", rt_robotstxt_http_getter = f, + on_server_error = list(signal = "warning"), force = TRUE)) + Condition + Warning in `request_handler_handler()`: + Event: on_server_error + Warning in `request_handler_handler()`: + Event: on_file_type_mismatch + diff --git a/tests/testthat/_snaps/paths_allowed.md b/tests/testthat/_snaps/paths_allowed.md new file mode 100644 index 0000000..cfe2fd7 --- /dev/null +++ b/tests/testthat/_snaps/paths_allowed.md @@ -0,0 +1,31 @@ +# paths_allowed() works also with 'downloaded' robots.txt files + + Code + domain_change <- readRDS(system.file("http_requests/http_domain_change.rds", + package = "robotstxt")) + suppressMessages(paths_allowed(paths = "https://github.io/index.html", + rt_robotstxt_http_getter = function(...) { + domain_change + }, warn = FALSE)) + Output + [1] TRUE + +--- + + Code + domain_change <- readRDS(system.file("http_requests/http_domain_change.rds", + package = "robotstxt")) + suppressMessages(paths_allowed(paths = "https://github.io/index.html", + rt_robotstxt_http_getter = function(...) { + domain_change + })) + Condition + Warning in `request_handler_handler()`: + Event: on_domain_change + Warning in `request_handler_handler()`: + Event: on_file_type_mismatch + Warning in `request_handler_handler()`: + Event: on_suspect_content + Output + [1] TRUE + diff --git a/tests/testthat/test_attribute_handling.R b/tests/testthat/test_attribute_handling.R index 087cc98..db6d4f8 100644 --- a/tests/testthat/test_attribute_handling.R +++ b/tests/testthat/test_attribute_handling.R @@ -1,15 +1,15 @@ -context("attribute handling") - test_that("get_robotstxt produces attributes", { expect_true({ www_redirect <- readRDS(system.file("http_requests/http_redirect_www.rds", package = "robotstxt")) - suppressWarnings( - rtxt <- - get_robotstxt( - "http://google.com", - rt_robotstxt_http_getter = function(...){www_redirect} + suppressMessages( + suppressWarnings( + rtxt <- + get_robotstxt( + "http://google.com", + rt_robotstxt_http_getter = function(...){www_redirect} + ) ) ) @@ -19,12 +19,14 @@ test_that("get_robotstxt produces attributes", { expect_true({ http_404 <- readRDS(system.file("http_requests/http_404.rds", package = "robotstxt")) - suppressWarnings( - rtxt <- - get_robotstxt( - "http://google.com", - rt_robotstxt_http_getter = function(...){http_404} - ) + suppressMessages( + suppressWarnings( + rtxt <- + get_robotstxt( + "http://google.com", + rt_robotstxt_http_getter = function(...){http_404} + ) + ) ) "problems" %in% names(attributes(rtxt)) @@ -34,12 +36,14 @@ test_that("get_robotstxt produces attributes", { expect_true({ http_ok <- readRDS(system.file("http_requests/http_ok_1.rds", package = "robotstxt")) - suppressWarnings( - rtxt <- - get_robotstxt( - "http://google.com", - rt_robotstxt_http_getter = function(...){http_404} - ) + suppressMessages( + suppressWarnings( + rtxt <- + get_robotstxt( + "http://google.com", + rt_robotstxt_http_getter = function(...){http_404} + ) + ) ) "problems" %in% names(attributes(rtxt)) @@ -49,28 +53,31 @@ test_that("get_robotstxt produces attributes", { expect_true({ http_ok <- readRDS(system.file("http_requests/http_ok_2.rds", package = "robotstxt")) - suppressWarnings( - rtxt <- - get_robotstxt( - "http://google.com", - rt_robotstxt_http_getter = function(...){http_404} - ) + suppressMessages( + suppressWarnings( + rtxt <- + get_robotstxt( + "http://google.com", + rt_robotstxt_http_getter = function(...){http_404} + ) + ) ) "problems" %in% names(attributes(rtxt)) }) - expect_true({ http_ok <- readRDS(system.file("http_requests/http_ok_3.rds", package = "robotstxt")) - suppressWarnings( - rtxt <- - get_robotstxt( - "http://google.com", - rt_robotstxt_http_getter = function(...){http_404} - ) + suppressMessages( + suppressWarnings( + rtxt <- + get_robotstxt( + "http://google.com", + rt_robotstxt_http_getter = function(...){http_404} + ) + ) ) "problems" %in% names(attributes(rtxt)) @@ -80,20 +87,17 @@ test_that("get_robotstxt produces attributes", { expect_true({ http_ok <- readRDS(system.file("http_requests/http_ok_4.rds", package = "robotstxt")) - suppressWarnings( - rtxt <- - get_robotstxt( - "http://google.com", - rt_robotstxt_http_getter = function(...){http_404} - ) + suppressMessages( + suppressWarnings( + rtxt <- + get_robotstxt( + "http://google.com", + rt_robotstxt_http_getter = function(...){http_404} + ) + ) ) "problems" %in% names(attributes(rtxt)) }) - - }) - - - diff --git a/tests/testthat/test_get_robotstxt.R b/tests/testthat/test_get_robotstxt.R index 2534751..854b825 100644 --- a/tests/testthat/test_get_robotstxt.R +++ b/tests/testthat/test_get_robotstxt.R @@ -1,8 +1,3 @@ -# testing the workings of get_robotstxt function - -context("get_robotstxt()") - - test_that( "NA in NA out", { expect_true({ @@ -12,7 +7,7 @@ test_that( expect_true({ all( is.na( - get_robotstxts(domain = c(NA, NA)) + suppressMessages(get_robotstxts(domain = c(NA, NA))) ) ) }) @@ -29,9 +24,13 @@ test_that( }) expect_true({ - suppressWarnings(get_robotstxts(domain = c("example.com", "example.com"))) + suppressMessages( + suppressWarnings( + get_robotstxts(domain = c("example.com", "example.com")) + ) + ) TRUE }) } } -) \ No newline at end of file +) diff --git a/tests/testthat/test_http_event_handling.R b/tests/testthat/test_http_event_handling.R index 1ebf99f..c47820a 100644 --- a/tests/testthat/test_http_event_handling.R +++ b/tests/testthat/test_http_event_handling.R @@ -1,5 +1,3 @@ -context("HTTP evenet handling") - test_that("www redirects are handled silently", { expect_true({ request <- readRDS(system.file("http_requests/http_redirect_www.rds", package = "robotstxt")) @@ -20,11 +18,12 @@ test_that("on_redirect detected", { expect_true({ domain_change <- readRDS(system.file("http_requests/http_domain_change.rds", package = "robotstxt")) rt <- - get_robotstxt( - "http://google.com", - rt_robotstxt_http_getter = function(...){domain_change}, - warn = FALSE - ) + suppressMessages( + get_robotstxt( + "http://google.com", + rt_robotstxt_http_getter = function(...){domain_change}, + warn = FALSE + )) "on_redirect" %in% names(attr(rt, "problems")) }) }) @@ -33,36 +32,39 @@ test_that("on_domain_change_detected", { expect_true({ domain_change <- readRDS(system.file("http_requests/http_domain_change.rds", package = "robotstxt")) rt <- - get_robotstxt( + suppressMessages( + get_robotstxt( "github.io", rt_robotstxt_http_getter = function(...){domain_change}, warn = FALSE - ) + )) "on_domain_change" %in% names(attr(rt, "problems")) }) }) - test_that("non www redirects are handled non silently", { - expect_warning({ + expect_snapshot({ domain_change <- readRDS(system.file("http_requests/http_domain_change.rds", package = "robotstxt")) - get_robotstxt( + suppressMessages( + get_robotstxt( "http://google.com", rt_robotstxt_http_getter = function(...){domain_change}, warn = TRUE - ) + )) }) }) + test_that("warn = FALSE does silences warnings", { expect_silent({ domain_change <- readRDS(system.file("http_requests/http_domain_change.rds", package = "robotstxt")) - get_robotstxt( + suppressMessages( + get_robotstxt( "github.io", rt_robotstxt_http_getter = function(...){domain_change}, warn = FALSE - ) + )) }) }) @@ -72,10 +74,11 @@ test_that("suspect content", { suppressWarnings({ suspect_content <- readRDS(system.file("http_requests/http_html_content.rds", package = "robotstxt")) rtxt <- - get_robotstxt( + suppressMessages( + get_robotstxt( "pages.github.com", rt_robotstxt_http_getter = function(...){suspect_content} - ) + )) problems <- attr(rtxt, "problems") }) @@ -84,73 +87,75 @@ test_that("suspect content", { }) - - - test_that("all ok", { expect_silent({ http_ok <- readRDS(system.file("http_requests/http_ok_1.rds", package = "robotstxt")) - get_robotstxt( + suppressMessages(get_robotstxt( "google.com", rt_robotstxt_http_getter = function(...){http_ok} - ) + )) }) expect_silent({ http_ok <- readRDS(system.file("http_requests/http_ok_2.rds", package = "robotstxt")) - get_robotstxt( + suppressMessages( + get_robotstxt( "google.com", rt_robotstxt_http_getter = function(...){http_ok} - ) + )) }) expect_silent({ http_ok <- readRDS(system.file("http_requests/http_ok_3.rds", package = "robotstxt")) - get_robotstxt( + suppressMessages( + get_robotstxt( "google.com", rt_robotstxt_http_getter = function(...){http_ok} - ) + )) }) expect_silent({ if ( Sys.getenv("rpkg_use_internet_for_testing") == "TRUE" ){ - get_robotstxt( + suppressMessages( + get_robotstxt( "google.com" - ) + )) } }) expect_silent({ if ( Sys.getenv("rpkg_use_internet_for_testing") == "TRUE" ){ - get_robotstxt( + suppressMessages( + get_robotstxt( "google.com", force = TRUE - ) + )) } }) }) - - test_that("client error", { - expect_warning({ + expect_snapshot({ http_client_error <- readRDS(system.file("http_requests/http_client_error.rds", package = "robotstxt")) - get_robotstxt( - "httpbin.org", - rt_robotstxt_http_getter = function(...){http_client_error} + suppressMessages( + get_robotstxt( + "httpbin.org", + rt_robotstxt_http_getter = function(...){http_client_error} + ) ) }) expect_true({ http_client_error <- readRDS(system.file("http_requests/http_client_error.rds", package = "robotstxt")) res <- - get_robotstxt( + suppressMessages( + get_robotstxt( "httpbin.org", rt_robotstxt_http_getter = function(...){http_client_error}, warn = FALSE - ) + )) problems <- attr(res, "problems") problems$on_client_error$status_code == 400 }) @@ -158,12 +163,13 @@ test_that("client error", { expect_true({ http_client_error <- readRDS(system.file("http_requests/http_client_error.rds", package = "robotstxt")) res <- - paths_allowed( + suppressMessages( + paths_allowed( paths = c("", "/", "here/I/stand/chopping/lops"), domain = "httpbin.org", rt_robotstxt_http_getter = function(...){http_client_error}, warn = FALSE - ) + )) all(res) }) }) @@ -175,42 +181,37 @@ test_that("server error", { expect_error({ rt <- - get_robotstxt( + suppressMessages( + get_robotstxt( "httpbin.org", rt_robotstxt_http_getter = f, warn = FALSE, force = TRUE - ) + )) }) - expect_warning({ + expect_snapshot({ res <- - get_robotstxt( + suppressMessages( + get_robotstxt( "httpbin.org", rt_robotstxt_http_getter = f, on_server_error = list(signal = "warning"), force = TRUE - ) + )) }) expect_true({ res <- - paths_allowed( + suppressMessages( + paths_allowed( paths = c("", "/", "here/I/stand/chopping/lops"), domain = "httpbin.org", rt_robotstxt_http_getter = f, on_server_error = list(signal = "nothing"), warn = FALSE, force = TRUE - ) + )) all(!res) }) }) - - - - - - - - diff --git a/tests/testthat/test_issue50.R b/tests/testthat/test_issue50.R index bceb0b4..84f39ba 100644 --- a/tests/testthat/test_issue50.R +++ b/tests/testthat/test_issue50.R @@ -1,12 +1,8 @@ - -context("robotstxt missing scheme") - - test_that( "robotstxt no scheme works", { expect_true({ - if ( Sys.getenv("rpkg_use_internet_for_testing") == "TRUE" ){ - paths_allowed("www.google.com") + if ( Sys.getenv("rpkg_use_internet_for_testing") == "TRUE" ){ + suppressMessages(paths_allowed("www.google.com")) } else { TRUE } @@ -14,7 +10,7 @@ test_that( expect_true({ if ( Sys.getenv("rpkg_use_internet_for_testing") == "TRUE" ){ - paths_allowed("google.com") + suppressMessages(paths_allowed("google.com")) } else { TRUE } @@ -24,12 +20,11 @@ test_that( ) - test_that( "robotstxt scheme works", { expect_true({ if ( Sys.getenv("rpkg_use_internet_for_testing") == "TRUE" ){ - paths_allowed("https://google.com") + suppressMessages(paths_allowed("https://google.com")) } else { TRUE } @@ -37,7 +32,7 @@ test_that( expect_true({ if ( Sys.getenv("rpkg_use_internet_for_testing") == "TRUE" ){ - paths_allowed("https://www.google.com") + suppressMessages(paths_allowed("https://www.google.com")) } else { TRUE } @@ -45,7 +40,7 @@ test_that( expect_true({ if ( Sys.getenv("rpkg_use_internet_for_testing") == "TRUE" ){ - paths_allowed("http://google.com") + suppressMessages(paths_allowed("http://google.com")) } else { TRUE } @@ -53,13 +48,10 @@ test_that( expect_true({ if ( Sys.getenv("rpkg_use_internet_for_testing") == "TRUE" ){ - paths_allowed("http://www.google.com") + suppressMessages(paths_allowed("http://www.google.com")) } else { TRUE } }) } ) - - - diff --git a/tests/testthat/test_parser.R b/tests/testthat/test_parser.R index c7b8224..294526d 100644 --- a/tests/testthat/test_parser.R +++ b/tests/testthat/test_parser.R @@ -1,7 +1,3 @@ -# tests for functions responsible for data gathering and transformation - - - rtxt_asb <- rt_get_rtxt("allow_single_bot.txt") rtxt_dafa <- rt_get_rtxt("disallow_all_for_all.txt") rtxt_dafbb <- rt_get_rtxt("disallow_all_for_BadBot.txt") @@ -27,116 +23,43 @@ rtxt_rbloggers <- rt_get_rtxt("rbloggers.txt") rtxt_ct <- rt_get_rtxt("robots_commented_token.txt") +valid_rtxt_files <- c( + rtxt_asb, rtxt_dafa, rtxt_dafbb, rtxt_dsfa, rtxt_empty, + rtxt_datao, rtxt_tcom, rtxt_amzn, rtxt_bt, rtxt_ggl, + rtxt_nyt, rtxt_spgl, rtxt_yh, rtxt_she, rtxt_pm, + rtxt_wp, rtxt_cd, rtxt_host, rtxt_cdc, rtxt_ct, + "\n\n\n" +) +test_that("all robots.txt files are valid with check_strickt_ascii = F", { + expect_true(is_valid_robotstxt(valid_rtxt_files)) +}) -context("is_valid_robotstxt()") - - -test_that( - "all robots.txt files are valid", { - expect_true( - is_valid_robotstxt( rtxt_asb ) - ) - - expect_true( - is_valid_robotstxt( rtxt_dafa ) - ) - - expect_true( - is_valid_robotstxt( rtxt_dafbb ) - ) - - expect_true( - is_valid_robotstxt( rtxt_dsfa ) - ) - expect_true( - is_valid_robotstxt( rtxt_empty ) - ) - - expect_true( - is_valid_robotstxt( rtxt_datao ) - ) - - expect_true( - is_valid_robotstxt( rtxt_tcom ) - ) - - expect_true( - is_valid_robotstxt( rtxt_amzn ) - ) - - expect_true( - is_valid_robotstxt( rtxt_bt ) - ) - - expect_true( - is_valid_robotstxt( rtxt_ggl ) - ) - - expect_true( - is_valid_robotstxt( rtxt_nyt ) - ) - - expect_true( - is_valid_robotstxt( rtxt_spgl ) - ) - - expect_true( - is_valid_robotstxt( rtxt_yh ) - ) - - expect_true( - is_valid_robotstxt( rtxt_she ) - ) - - expect_true( - is_valid_robotstxt( rtxt_pm ) - ) - - expect_true( - is_valid_robotstxt( rtxt_wp ) - ) - - expect_true( - is_valid_robotstxt( rtxt_cd ) - ) - - expect_true( - is_valid_robotstxt( rtxt_host ) - ) - expect_true( - is_valid_robotstxt( - "\n\n\n" - ) - ) +valid_rtxt_files_ascii <- c( + rtxt_asb, rtxt_dafa, rtxt_dafbb, rtxt_dsfa, rtxt_empty, + rtxt_datao, rtxt_tcom, rtxt_amzn, rtxt_bt, rtxt_ggl, + rtxt_nyt, rtxt_spgl, rtxt_yh, rtxt_she, rtxt_pm, + rtxt_cd, rtxt_host, rtxt_cdc, rtxt_ct, + "\n\n\n" +) - expect_false( - is_valid_robotstxt( - " # dings\nbums\n dings" - ) - ) +test_that("all robots.txt files are valid with check_strickt_ascii = T", { + expect_true( + is_valid_robotstxt(valid_rtxt_files_ascii, check_strickt_ascii = TRUE) + ) +}) - expect_false( - is_valid_robotstxt( rtxt_fb_nsp ) - ) - expect_true( - is_valid_robotstxt( rtxt_cdc ) - ) +test_that("broken robots.txt files are invalid", { + expect_false(is_valid_robotstxt(rtxt_fb_nsp)) - expect_true( - is_valid_robotstxt( rtxt_ct ) + expect_false( + is_valid_robotstxt( + " # dings\nbums\n dings" ) - }) - - -test_that( - "broken robots.txt files are invalid", { - expect_false( is_valid_robotstxt( rtxt_fb_nsp )) - }) - - + ) +}) for (char in c(" ", "\t", "(", ")", "<", ">", "@", ",", ";", "<", ">", "/", "[", "]", "?", "=", "{", "}") ) { @@ -148,143 +71,40 @@ for (char in c(" ", "\t", "(", ")", "<", ">", "@", ",", ";", "<", ">", "/", "[", replacement = char ) - if ( is_valid_robotstxt(txt) ){ + if (is_valid_robotstxt(txt)) { cat("CHAR: ", "'", char,"'; ", sep = "") } - test_that( - "field name has no special character", - expect_false( is_valid_robotstxt(txt) ) - ) - + test_that("field name has no special character", { + expect_false(is_valid_robotstxt(txt)) + }) } - -test_that( - "field name has no special character", +test_that("field name has no special character", { expect_false( is_valid_robotstxt("extension\\field: some value", check_strickt_ascii = TRUE) ) -) +}) -test_that( - "field name has no special character", +test_that("field name has no special character", { expect_false( is_valid_robotstxt("Error in curl::curl_fetch_memory(url, handle = handle) : Could not resolve host: domain.tld", check_strickt_ascii = TRUE) ) -) +}) +test_that("broken robots.txt files are invalid", { + expect_false(is_valid_robotstxt(rtxt_fb_nsp, check_strickt_ascii = TRUE)) - - -test_that( - "all robots.txt files are valid", { - expect_true( - is_valid_robotstxt( rtxt_asb , check_strickt_ascii = TRUE) - ) - - expect_true( - is_valid_robotstxt( rtxt_dafa , check_strickt_ascii = TRUE) - ) - - expect_true( - is_valid_robotstxt( rtxt_dafbb , check_strickt_ascii = TRUE) - ) - - expect_true( - is_valid_robotstxt( rtxt_dsfa , check_strickt_ascii = TRUE) - ) - expect_true( - is_valid_robotstxt( rtxt_empty , check_strickt_ascii = TRUE) - ) - - expect_true( - is_valid_robotstxt( rtxt_datao , check_strickt_ascii = TRUE) - ) - - expect_true( - is_valid_robotstxt( rtxt_tcom , check_strickt_ascii = TRUE) - ) - - expect_true( - is_valid_robotstxt( rtxt_amzn , check_strickt_ascii = TRUE) - ) - - expect_true( - is_valid_robotstxt( rtxt_bt , check_strickt_ascii = TRUE) - ) - - expect_true( - is_valid_robotstxt( rtxt_ggl , check_strickt_ascii = TRUE) - ) - - expect_true( - is_valid_robotstxt( rtxt_nyt , check_strickt_ascii = TRUE) - ) - - expect_true( - is_valid_robotstxt( rtxt_spgl , check_strickt_ascii = TRUE) - ) - - expect_true( - is_valid_robotstxt( rtxt_yh , check_strickt_ascii = TRUE) - ) - - expect_true( - is_valid_robotstxt( rtxt_she , check_strickt_ascii = TRUE) - ) - - expect_true( - is_valid_robotstxt( rtxt_pm , check_strickt_ascii = TRUE) - ) - - # expect_true( - # is_valid_robotstxt( rtxt_wp , check_strickt_ascii = TRUE) - # ) - - expect_true( - is_valid_robotstxt( rtxt_cd , check_strickt_ascii = TRUE) - ) - - expect_true( - is_valid_robotstxt( rtxt_host , check_strickt_ascii = TRUE) - ) - - expect_true( - is_valid_robotstxt( - "\n\n\n", check_strickt_ascii = TRUE - ) - ) - - expect_false( - is_valid_robotstxt( - " # dings\nbums\n dings", check_strickt_ascii = TRUE - ) - ) - - expect_false( - is_valid_robotstxt( rtxt_fb_nsp , check_strickt_ascii = TRUE) - ) - - expect_true( - is_valid_robotstxt( rtxt_cdc , check_strickt_ascii = TRUE) + expect_false( + is_valid_robotstxt( + " # dings\nbums\n dings", check_strickt_ascii = TRUE ) - }) - - -test_that( - "broken robots.txt files are invalid", { - expect_false( is_valid_robotstxt( rtxt_fb_nsp , check_strickt_ascii = TRUE)) - }) - - - - + ) +}) -context("useragent extraction") test_that( "all user agents are extracted", { @@ -304,34 +124,32 @@ test_that( } ) -context("permission extraction") test_that( "specification of more than one user agent gets interpreted right", { - expect_true( dim(parse_robotstxt(rtxt_datao )$permissions)[1]==2 ) - expect_true( all(parse_robotstxt(rtxt_datao )$permissions$value=="/private/") ) + expect_true( dim(parse_robotstxt(rtxt_datao )$permissions)[1]==2) + expect_true( all(parse_robotstxt(rtxt_datao )$permissions$value=="/private/")) } ) -context("non-useragent extraction") - test_that( "comments get extracted right", { - expect_true( dim(parse_robotstxt(rtxt_tcom )$comments)[1]==3 ) + expect_true(dim(parse_robotstxt(rtxt_tcom )$comments)[1]==3) } ) test_that( "craw-delay gets extracted", { - expect_true( parse_robotstxt(rtxt_host)$host$value=="www.whatever.com" ) + expect_true(parse_robotstxt(rtxt_host)$host$value=="www.whatever.com") } ) + test_that( "craw-delay gets extracted", { - expect_true( parse_robotstxt(rtxt_cd)$crawl_delay$value==10 ) + expect_true(parse_robotstxt(rtxt_cd)$crawl_delay$value==10) } ) @@ -340,6 +158,7 @@ classes <- function(x){ unlist(lapply(x, class)) } + test_that( "data.frames contain no factors", { expect_false( any( classes( parse_robotstxt(rtxt_datao)$useragents ) %in% "factor") ) @@ -357,8 +176,6 @@ test_that( ) -context("cdc gets parsed correctly") - test_that( "cdc gets parsed correctly", { expect_true( @@ -372,8 +189,6 @@ test_that( ) -context("can handle varIOUs cases for robots.txt fields") - test_that( "can handle varIOUs cases for robots.txt fields - issue #55", { expect_true({ @@ -384,14 +199,6 @@ test_that( ) - - - - - - -context("Commented-out tokens get parsed correctly") - test_that( "Commented-out tokens get ignored", { expect_true( @@ -399,18 +206,3 @@ test_that( ) } ) - - - - - - - - - - - - - - - diff --git a/tests/testthat/test_path_examples_from_rfc.R b/tests/testthat/test_path_examples_from_rfc.R index 2e8c8d4..cd2c6bc 100644 --- a/tests/testthat/test_path_examples_from_rfc.R +++ b/tests/testthat/test_path_examples_from_rfc.R @@ -1,8 +1,5 @@ # tests for functions responsible for data gathering and transformation - - - # This table illustrates some examples: # # Record Path URL path Matches @@ -27,8 +24,6 @@ # /~joe/index.html /%7Ejoe/index.html yes -context("paths_allowed()") - test_that( "simple check", { expect_true( @@ -45,8 +40,8 @@ test_that( # A fictional site may have the following URLs: # -# http://www.fict.org/ -# http://www.fict.org/index.html +# http://www.fict.org/ +# http://www.fict.org/index.html # http://www.fict.org/robots.txt # http://www.fict.org/server.html # http://www.fict.org/services/fast.html @@ -99,10 +94,3 @@ test_that( # http://www.fict.org/org/plans.html No Yes No # http://www.fict.org/%7Ejim/jim.html No Yes No # http://www.fict.org/%7Emak/mak.html No Yes Yes - - - - - - - diff --git a/tests/testthat/test_paths_allowed.R b/tests/testthat/test_paths_allowed.R index 539944b..5cafa19 100644 --- a/tests/testthat/test_paths_allowed.R +++ b/tests/testthat/test_paths_allowed.R @@ -1,7 +1,3 @@ -# tests for functions responsible for data gathering and transformation - - -# note: get rt_get_rtxt() with devtools::load_all() rtxt_asb <- rt_get_rtxt("allow_single_bot.txt") rtxt_dafa <- rt_get_rtxt("disallow_all_for_all.txt") rtxt_dafbb <- rt_get_rtxt("disallow_all_for_BadBot.txt") @@ -29,9 +25,6 @@ options_grid <- ) -#### context("checking works") ================================================= -context("paths_allowed()") - ## fails because of spiderbar for ( i in seq_len(nrow(options_grid)) ) { @@ -159,6 +152,7 @@ test_that( } ) + test_that( "check 'only single bot allowed'", { @@ -179,7 +173,6 @@ test_that( ) } - # expect_false(paths_allowed(permissions_asb, path="images")) for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -196,7 +189,7 @@ test_that( } ) } - # expect_false(paths_allowed(permissions_asb, path="/images")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -213,7 +206,7 @@ test_that( } ) } - # expect_false(paths_allowed(permissions_asb, path="/images/")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -230,7 +223,7 @@ test_that( } ) } - # expect_false(paths_allowed(permissions_asb, path="images/")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -247,7 +240,7 @@ test_that( } ) } - # expect_false(paths_allowed(permissions_asb, path="images/dings")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -265,7 +258,6 @@ test_that( ) } - # expect_false(paths_allowed(permissions_asb, path="*")) for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -282,8 +274,6 @@ test_that( ) } - # - # expect_false(paths_allowed(permissions_asb, path="images", bot="harald")) for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -300,7 +290,7 @@ test_that( } ) } - # expect_false(paths_allowed(permissions_asb, path="/images", bot="*")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -317,7 +307,7 @@ test_that( } ) } - # expect_false(paths_allowed(permissions_asb, path="/images/", "*er")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -334,7 +324,7 @@ test_that( } ) } - # expect_false(paths_allowed(permissions_asb, path="*", bot="erwin")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -351,8 +341,7 @@ test_that( } ) } - # - # expect_true(paths_allowed(permissions_asb, path="images", bot="Google")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -369,7 +358,7 @@ test_that( } ) } - # expect_true(paths_allowed(permissions_asb, path="/images", bot="Google")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -387,7 +376,6 @@ test_that( ) } - # expect_true(paths_allowed(permissions_asb, path="/images/", bot="Google")) for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -405,8 +393,6 @@ test_that( ) } - - # expect_true(paths_allowed(permissions_asb, path="images/", bot="Google")) for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -423,7 +409,7 @@ test_that( } ) } - # expect_true(paths_allowed(permissions_asb, path="images/dings", bot="Google")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -440,7 +426,7 @@ test_that( } ) } - # expect_true(paths_allowed(permissions_asb, path="*", bot="Google")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -460,10 +446,6 @@ test_that( } ) - -# test_that( -# "dissallow all for all", { -# expect_false(paths_allowed(permissions_dafa, path="", bot="mybot")) for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -480,7 +462,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_false(paths_allowed(permissions_dafa, path="/imgages", bot="mybot")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -497,7 +479,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_false(paths_allowed(permissions_dafa, path="index.html", bot="mybot")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -514,7 +496,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_false(paths_allowed(permissions_dafa, path="*", bot="mybot")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -531,8 +513,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# -# expect_false(paths_allowed(permissions_dafa, path="")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -549,7 +530,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_false(paths_allowed(permissions_dafa, path="/imgages")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -565,7 +546,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_false(paths_allowed(permissions_dafa, path="index.html")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -582,7 +563,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_false(paths_allowed(permissions_dafa, path="*")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -598,13 +579,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# } -# ) -# -# -# test_that( -# "dissallow all for BadBot", { -# expect_false(paths_allowed(permissions_dafbb, path="", bot="BadBot")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -621,7 +596,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_false(paths_allowed(permissions_dafbb, path="/imgages", bot="BadBot")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -638,7 +613,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_false(paths_allowed(permissions_dafbb, path="index.html", bot="BadBot")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -655,7 +630,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_false(paths_allowed(permissions_dafbb, path="*", bot="BadBot")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -672,8 +647,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# -# expect_true(paths_allowed(permissions_dafbb, path="")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -689,7 +663,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_true(paths_allowed(permissions_dafbb, path="/imgages")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -705,7 +679,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_true(paths_allowed(permissions_dafbb, path="index.html")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -721,7 +695,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_true(paths_allowed(permissions_dafbb, path="*")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -737,13 +711,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# } -# ) -# -# -# test_that( -# "case of Bot naME dOeS not matter", { -# expect_false(paths_allowed(permissions_dafbb, path="", bot="badbot")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -760,7 +728,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_false(paths_allowed(permissions_dafbb, path="/imgages", bot="badbot")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -777,7 +745,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_false(paths_allowed(permissions_dafbb, path="index.html", bot="badbot")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -794,7 +762,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_false(paths_allowed(permissions_dafbb, path="*", bot="badbot")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -811,8 +779,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# -# expect_false(paths_allowed(permissions_dafbb, path="", bot="Badbot")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -829,7 +796,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_false(paths_allowed(permissions_dafbb, path="/imgages", bot="Badbot")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -846,7 +813,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_false(paths_allowed(permissions_dafbb, path="index.html", bot="Badbot")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -863,7 +830,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_false(paths_allowed(permissions_dafbb, path="*", bot="Badbot")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -880,13 +847,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# } -# ) -# -# -# test_that( -# "empty file leads to all allowed for all", { -# expect_true(paths_allowed(permissions_empty, path="")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -902,7 +863,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_true(paths_allowed(permissions_empty, path="/")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -918,7 +879,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_true(paths_allowed(permissions_empty, path="/imgages")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -934,7 +895,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_true(paths_allowed(permissions_empty, path="index.html")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -951,8 +912,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# -# expect_true(paths_allowed(permissions_empty, path="", bot = "BadBot")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -969,7 +929,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_true(paths_allowed(permissions_empty, path="/", bot = "BadBot")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -986,7 +946,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_true(paths_allowed(permissions_empty, path="/imgages", bot = "BadBot")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -1003,7 +963,7 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# expect_true(paths_allowed(permissions_empty, path="index.html", bot = "BadBot")) + for ( i in seq_len(nrow(options_grid)) ) { test_that( "simple check", { @@ -1020,97 +980,77 @@ for ( i in seq_len(nrow(options_grid)) ) { } ) } -# } -# ) -# -# - - - - - - test_that("paths_allowed() works also with 'downloaded' robots.txt files",{ - - expect_message({ + expect_snapshot({ domain_change <- readRDS(system.file("http_requests/http_domain_change.rds", package = "robotstxt")) - paths_allowed( - paths = "https://github.io/index.html", - rt_robotstxt_http_getter = function(...){domain_change}, - warn = FALSE + suppressMessages( + paths_allowed( + paths = "https://github.io/index.html", + rt_robotstxt_http_getter = function(...){domain_change}, + warn = FALSE + ) ) }) - expect_warning({ + expect_snapshot({ domain_change <- readRDS(system.file("http_requests/http_domain_change.rds", package = "robotstxt")) - paths_allowed( - paths = "https://github.io/index.html", - rt_robotstxt_http_getter = function(...){domain_change} + suppressMessages( + paths_allowed( + paths = "https://github.io/index.html", + rt_robotstxt_http_getter = function(...){domain_change} + ) ) }) expect_true({ domain_change <- readRDS(system.file("http_requests/http_domain_change.rds", package = "robotstxt")) - paths_allowed( - paths = "https://github.io/index.html", - rt_robotstxt_http_getter = function(...){domain_change}, - warn = FALSE + suppressMessages( + paths_allowed( + paths = "https://github.io/index.html", + rt_robotstxt_http_getter = function(...){domain_change}, + warn = FALSE + ) ) }) expect_true({ domain_change <- readRDS(system.file("http_requests/http_domain_change.rds", package = "robotstxt")) res <- - paths_allowed( - paths = c("index.html", "dings/bums/trallalla"), - domain = "github.io", - rt_robotstxt_http_getter = function(...){domain_change}, - warn = FALSE - ) + suppressMessages( + paths_allowed( + paths = c("index.html", "dings/bums/trallalla"), + domain = "github.io", + rt_robotstxt_http_getter = function(...){domain_change}, + warn = FALSE + ) + ) all(res) }) expect_true({ domain_change <- readRDS(system.file("http_requests/http_domain_change.rds", package = "robotstxt")) res <- - paths_allowed( - paths = c("https://github.io/index.html", "https://github.io/index.html"), - rt_robotstxt_http_getter = function(...){domain_change}, - warn = FALSE - ) + suppressMessages( + paths_allowed( + paths = c("https://github.io/index.html", "https://github.io/index.html"), + rt_robotstxt_http_getter = function(...){domain_change}, + warn = FALSE + ) + ) all(res) }) - expect_true({ http_ok <- readRDS(system.file("http_requests/http_ok_1.rds", package = "robotstxt")) res <- - paths_allowed( - paths = c("https://google.com/?", "https://google.com/search/about"), - rt_robotstxt_http_getter = function(...){http_ok} + suppressMessages( + paths_allowed( + paths = c("https://google.com/?", "https://google.com/search/about"), + rt_robotstxt_http_getter = function(...){http_ok} + ) ) all(res == c(FALSE, TRUE)) }) }) - - - - - - - - - - - - - - - - - - - - diff --git a/tests/testthat/test_robotstxt.R b/tests/testthat/test_robotstxt.R index 243f906..35b1ebd 100644 --- a/tests/testthat/test_robotstxt.R +++ b/tests/testthat/test_robotstxt.R @@ -1,6 +1,3 @@ -# testing the workings of robotstxt objects - - rtxt_asb <- rt_get_rtxt("allow_single_bot.txt") rtxt_dafa <- rt_get_rtxt("disallow_all_for_all.txt") rtxt_dafbb <- rt_get_rtxt("disallow_all_for_BadBot.txt") @@ -18,8 +15,6 @@ rtxt_she <- rt_get_rtxt("selfhtml_Example.txt") rtxt_pm <- rt_get_rtxt("robots_pmeissner.txt") rtxt_wp <- rt_get_rtxt("robots_wikipedia.txt") -context("robotstxt creation") - # test_that( # "get_robotstxt() can fetch a file", { # expect_true( @@ -39,6 +34,7 @@ test_that( } ) + test_that( "robotstxt check method works well", { expect_true( robotstxt(text=rtxt_she)$check() ) @@ -47,8 +43,6 @@ test_that( ) -context("robotstxt checking") - test_that( "robotstxt check method works well", { expect_true( robotstxt(text=rtxt_she)$check() ) @@ -57,8 +51,6 @@ test_that( ) -context("robotstxt parsing multi agent records without newline") - test_that( "robotstxt parsing multi agent records without newline", { expect_true({ @@ -115,4 +107,3 @@ Disallow: / }) } ) - diff --git a/tests/testthat/test_tools.R b/tests/testthat/test_tools.R index abb3d11..9133f08 100644 --- a/tests/testthat/test_tools.R +++ b/tests/testthat/test_tools.R @@ -1,6 +1,3 @@ -# testing the workings of robotstxt objects - - rtxt_asb <- rt_get_rtxt("allow_single_bot.txt") rtxt_dafa <- rt_get_rtxt("disallow_all_for_all.txt") rtxt_dafbb <- rt_get_rtxt("disallow_all_for_BadBot.txt") @@ -25,8 +22,6 @@ rtxt_list <- rtxt_yh, rtxt_she, rtxt_pm, rtxt_wp ) -context("robotstxt print") - test_that( "robotstxt print works", { @@ -59,10 +54,6 @@ test_that( ) - -context("robotstxt tools") - - test_that( "robotstxt tools work", { @@ -77,9 +68,6 @@ test_that( rt_get_rtxt("robots_wikipedia.txt") rt_get_rtxt() }) - - - } ) @@ -109,5 +97,3 @@ test_that( } ) - -