Added washingtonpost.com scraper (#1)

JBGruber · Aug 26, 2021 · 3d97232 · 3d97232
1 parent 8383a67
commit 3d97232
Show file tree

Hide file tree

Showing 6 changed files with 99 additions and 8 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: paperboy
 Title: Comprehensive collection of news media scrapers
 Version: 0.0.1.9000
-Date: 2021-08-23
+Date: 2021-08-26
 Authors@R: person("Johannes", "Gruber", email = "[email protected]",
   role = c("aut", "cre"))
 Description: A comprehensive collection of webscraping scripts for news media sites.

diff --git a/R/collect.R b/R/collect.R
@@ -94,6 +94,14 @@ pb_collect <- function(urls,
       dplyr::rename(url = urls)
   }
 
+  # see issue #3
+  if (any(out$domain == "www.washingtonpost.com")) {
+    if (any(grepl("gdpr-consent", out$expanded_url, fixed = TRUE))) {
+      warning("www.washingtonpost.com requests GDPR consent instead of showing the article.",
+              " See https://github.com/JBGruber/paperboy/issues/3")
+    }
+  }
+
   if (verbose) {
     if (any(out$status != 200L)) {
       msg <- paste0(

diff --git a/R/deliver_washingtonpost_com.R b/R/deliver_washingtonpost_com.R
@@ -0,0 +1,84 @@
+
+# Scrape datetime, author, headline and text from washingtonpost.com pages.
+# x: tibble ("tbl_df") with columns url, expanded_url, domain, content_raw.
+# verbose: logical; NULL falls back to getOption("paperboy_verbose").
+# Returns the scraped columns cbind()-ed to x, passed through normalise_df().
+pb_deliver_paper.www_washingtonpost_com <- function(x, verbose = NULL, ...) {
+  if (is.null(verbose)) verbose <- getOption("paperboy_verbose")
+  # an unset option leaves verbose NULL, which would make `if (verbose)` fail
+  verbose <- isTRUE(as.logical(verbose))
+
+  if (!inherits(x, "tbl_df"))
+    stop("Wrong object passed to internal deliver function: ",
+         toString(class(x)))
+
+  if (verbose) message("\t...", nrow(x), " articles from ", x$domain[1])
+
+  pb <- make_pb(x)
+
+  # first non-missing element, or NA: keeps every field at length 1 so each
+  # url yields exactly one row and cbind() with x lines up
+  first_valid <- function(v) {
+    v <- v[!is.na(v)]
+    if (length(v) > 0) v[1] else NA
+  }
+
+  purrr::map_df(seq_along(x$url), function(i) {
+    # url expanded to the bare domain: there is no article to parse
+    if (basename(x$expanded_url[i]) == x$domain[i]) {
+      return(tibble::tibble(
+        datetime  = NA,
+        author    = NA,
+        headline  = NA,
+        text      = NA
+      ))
+    }
+
+    cont <- x$content_raw[i]
+    if (verbose) pb$tick()
+
+    html <- rvest::read_html(cont)
+
+    # datetime from meta tags; values that fail to parse become NA
+    datetime <- suppressWarnings(
+      html %>%
+        rvest::html_elements("[property=\"article:published_time\"],[itemprop*=\"datePublished\"],[name=\"ga-publishDate\"]") %>%
+        rvest::html_attr("content") %>%
+        lubridate::as_datetime()
+    )
+
+    # otherwise try visible date text such as "August 26, 2021 | 9:00 AM"
+    if (all(is.na(datetime))) {
+      datetime <- html %>%
+        rvest::html_elements("[class*=\"date\"]") %>%
+        rvest::html_text() %>%
+        strptime(format = "%B %d, %Y | %I:%M %p") %>%
+        as.POSIXct()
+    }
+    datetime <- first_valid(datetime)
+
+    # headline
+    headline <- html %>%
+      rvest::html_elements("[property=\"og:title\"]") %>%
+      rvest::html_attr("content") %>%
+      first_valid()
+
+    # author
+    author <- html %>%
+      rvest::html_elements("[data-qa=\"author-name\"],[class*=\"author-name \"]") %>%
+      rvest::html_text2() %>%
+      toString()
+
+    # text: paragraphs of the article body, or of the whole page without one
+    body <- rvest::html_elements(html, "[class=\"article-body\"]")
+    if (length(body) < 1) body <- html
+    text <- body %>%
+      rvest::html_elements("p") %>%
+      rvest::html_text2() %>%
+      paste(collapse = "\n")
+
+    tibble::tibble(datetime, author, headline, text)
+  }) %>%
+    cbind(x) %>%
+    normalise_df()
+}
diff --git a/README.Rmd b/README.Rmd
@@ -125,7 +125,7 @@ tibble::tribble(
   "nytimes.com",          "![](https://img.shields.io/badge/status-gold-%23ffd700.svg)",  "Johannes B. Gruber", "",
   "nytimes.com",          "![](https://img.shields.io/badge/status-gold-%23ffd700.svg)",  "Johannes B. Gruber", "",
   "edition.cnn.com",      "![](https://img.shields.io/badge/status-gold-%23ffd700.svg)",  "Johannes B. Gruber", "",
-  "washingtonpost.com",   "![](https://img.shields.io/badge/status-broken-%23D8634C)",  "Johannes B. Gruber", "[#1](https://github.com/JBGruber/paperboy/issues/1)",
+  "washingtonpost.com",   "![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg)",  "Johannes B. Gruber", "[#3](https://github.com/JBGruber/paperboy/issues/3)",
 ) %>% 
   arrange(domain) %>% 
   knitr::kable(escape = FALSE)

diff --git a/README.md b/README.md
@@ -59,9 +59,8 @@ therefore often encounter this warning:
 
 ``` r
 pb_deliver("google.com")
-#> Warning in pb_deliver_paper.default(u,
-#> verbose = verbose, ...): No method for
-#> www.google.com yet. Url ignored.
+#> Warning in pb_deliver_paper.default(u, verbose = verbose, ...): No method for www.google.com
+#> yet. Url ignored.
 ```
 
 If you enter a vector of multiple URLs, the unsupported ones will be
@@ -98,7 +97,7 @@ it via a pull request.
 | nytimes.com          | ![](https://img.shields.io/badge/status-gold-%23ffd700.svg)   | Johannes B. Gruber |                                                      |
 | nytimes.com          | ![](https://img.shields.io/badge/status-gold-%23ffd700.svg)   | Johannes B. Gruber |                                                      |
 | theguardian.com      | ![](https://img.shields.io/badge/status-gold-%23ffd700.svg)   | Johannes B. Gruber |                                                      |
-| washingtonpost.com   | ![](https://img.shields.io/badge/status-broken-%23D8634C)     | Johannes B. Gruber | [#1](https://github.com/JBGruber/paperboy/issues/1) |
+| washingtonpost.com   | ![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg) | Johannes B. Gruber | [#3](https://github.com/JBGruber/paperboy/issues/3) |
 | wsj.com              | ![](https://img.shields.io/badge/status-gold-%23ffd700.svg)   | Johannes B. Gruber |                                                      |
 
 -   ![](https://img.shields.io/badge/status-gold-%23ffd700.svg): Runs

diff --git a/tests/testthat/test-expand.R b/tests/testthat/test-expand.R
@@ -1,10 +1,10 @@
 test_that("expandurls", {
   expect_equal({
-      res <- pb_collect(url = "https://httpbin.org/")
+      res <- pb_collect(urls = "https://httpbin.org/")
       c(nrow(res), ncol(res))
     }, c(1, 5))
   expect_warning(
-    pb_collect(url = "https://httpbin.org/delay/10", timeout = 1, ignore_fails = TRUE),
+    pb_collect(urls = "https://httpbin.org/delay/10", timeout = 1, ignore_fails = TRUE),
     "1 job(s) did not finish before timeout. Think about increasing the timeout parameter. Enter ?pb_collect for help.",
     fixed = TRUE
   )