Skip to content

Commit

Permalink
Added washingtonpost.com scraper (#1)
Browse files Browse the repository at this point in the history
  • Loading branch information
JBGruber committed Aug 26, 2021
1 parent 8383a67 commit 3d97232
Show file tree
Hide file tree
Showing 6 changed files with 99 additions and 8 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: paperboy
Title: Comprehensive collection of news media scrapers
Version: 0.0.1.9000
Date: 2021-08-23
Date: 2021-08-26
Authors@R: person("Johannes", "Gruber", email = "[email protected]",
role = c("aut", "cre"))
Description: A comprehensive collection of webscraping scripts for news media sites.
Expand Down
8 changes: 8 additions & 0 deletions R/collect.R
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,14 @@ pb_collect <- function(urls,
dplyr::rename(url = urls)
}

# see issue #3
if (any(out$domain == "www.washingtonpost.com")) {
if (any(grepl("gdpr-consent", out$expanded_url, fixed = TRUE))) {
warning("www.washingtonpost.com requests GDPR consent instead of showing the article.",
" See https://github.com/JBGruber/paperboy/issues/3")
}
}

if (verbose) {
if (any(out$status != 200L)) {
msg <- paste0(
Expand Down
84 changes: 84 additions & 0 deletions R/deliver_washingtonpost_com.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@

# Parse articles collected from www.washingtonpost.com.
#
# Arguments:
#   x        tibble (class "tbl_df") with one row per collected URL. The
#            columns read here are `url`, `expanded_url`, `domain` and
#            `content_raw` (the raw HTML of the page as a string).
#   verbose  logical; if NULL, the option "paperboy_verbose" is used
#            (treated as FALSE when the option is unset).
#   ...      not used by this method.
#
# Returns:
#   The result of `normalise_df()` applied to the parsed columns
#   (`datetime`, `author`, `headline`, `text`) column-bound to `x`.
#   Exactly one parsed row is produced per row of `x`, so the bind cannot
#   fail on a row-count mismatch.
#
# Errors:
#   Stops if `x` is not a tibble.
pb_deliver_paper.www_washingtonpost_com <- function(x, verbose = NULL, ...) {

  # An unset option would yield NULL and make `if (verbose)` fail.
  if (is.null(verbose)) {
    verbose <- getOption("paperboy_verbose", default = FALSE)
  }

  if (!inherits(x, "tbl_df")) {
    stop(
      "Wrong object passed to internal deliver function: ",
      toString(class(x))
    )
  }

  if (verbose) message("\t...", nrow(x), " articles from ", x$domain[1])

  pb <- make_pb(x)

  # First non-missing element of `v`, or `fallback` when there is none.
  # Guarantees a length-one value for every tibble column below.
  first_valid <- function(v, fallback) {
    v <- v[!is.na(v)]
    if (length(v) > 0) v[1] else fallback
  }

  na_datetime <- as.POSIXct(NA, tz = "UTC")

  # Typed all-NA row used when there is no article to parse.
  empty_row <- tibble::tibble(
    datetime = na_datetime,
    author = NA_character_,
    headline = NA_character_,
    text = NA_character_
  )

  parsed <- purrr::map_df(seq_along(x$url), function(i) {

    # The URL resolved to the bare domain (no article path).
    is_front_page <- basename(x$expanded_url[i]) == x$domain[i]
    cont <- x$content_raw[i]

    # A missing URL or missing content cannot be parsed either.
    if (is.na(is_front_page) || is_front_page || is.na(cont)) {
      return(empty_row)
    }

    if (verbose) pb$tick()

    html <- rvest::read_html(cont)

    # datetime: prefer machine-readable meta tags; unparsable values
    # become NA (parse warnings suppressed) and are dropped.
    datetime <- suppressWarnings(
      html %>%
        rvest::html_elements("[property=\"article:published_time\"],[itemprop*=\"datePublished\"],[name=\"ga-publishDate\"]") %>%
        rvest::html_attr("content") %>%
        lubridate::as_datetime()
    ) %>%
      first_valid(na_datetime)

    # Fallback: human-readable date shown on the page. strptime() returns
    # POSIXlt, which is converted to POSIXct before it goes into a tibble.
    if (is.na(datetime)) {
      datetime <- html %>%
        rvest::html_elements("[class*=\"date\"]") %>%
        rvest::html_text() %>%
        strptime(format = "%B %d, %Y | %I:%M %p") %>%
        as.POSIXct() %>%
        first_valid(na_datetime)
    }

    # headline
    headline <- html %>%
      rvest::html_elements("[property=\"og:title\"]") %>%
      rvest::html_attr("content") %>%
      first_valid(NA_character_)

    # author: all matches collapsed into one comma-separated string
    author <- html %>%
      rvest::html_elements("[data-qa=\"author-name\"],[class*=\"author-name \"]") %>%
      rvest::html_text2() %>%
      toString()

    # text: paragraphs of the article body if present, otherwise every
    # paragraph on the page
    body_nodes <- rvest::html_elements(html, "[class=\"article-body\"]")
    text_root <- if (length(body_nodes) > 0) body_nodes else html

    text <- text_root %>%
      rvest::html_elements("p") %>%
      rvest::html_text2() %>%
      paste(collapse = "\n")

    tibble::tibble(
      datetime = datetime,
      author = author,
      headline = headline,
      text = text
    )
  })

  normalise_df(cbind(parsed, x))
}
2 changes: 1 addition & 1 deletion README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ tibble::tribble(
"nytimes.com", "![](https://img.shields.io/badge/status-gold-%23ffd700.svg)", "Johannes B. Gruber", "",
"nytimes.com", "![](https://img.shields.io/badge/status-gold-%23ffd700.svg)", "Johannes B. Gruber", "",
"edition.cnn.com", "![](https://img.shields.io/badge/status-gold-%23ffd700.svg)", "Johannes B. Gruber", "",
"washingtonpost.com", "![](https://img.shields.io/badge/status-broken-%23D8634C)", "Johannes B. Gruber", "[#1](https://github.com/JBGruber/paperboy/issues/1)",
"washingtonpost.com", "![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg)", "Johannes B. Gruber", "[#2](https://github.com/JBGruber/paperboy/issues/3)",
) %>%
arrange(domain) %>%
knitr::kable(escape = FALSE)
Expand Down
7 changes: 3 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,8 @@ therefore often encounter this warning:

``` r
pb_deliver("google.com")
#> Warning in pb_deliver_paper.default(u,
#> verbose = verbose, ...): No method for
#> www.google.com yet. Url ignored.
#> Warning in pb_deliver_paper.default(u, verbose = verbose, ...): No method for www.google.com
#> yet. Url ignored.
```

If you enter a vector of multiple URLs, the unsupported ones will be
Expand Down Expand Up @@ -98,7 +97,7 @@ it via a pull request.
| nytimes.com | ![](https://img.shields.io/badge/status-gold-%23ffd700.svg) | Johannes B. Gruber | |
| nytimes.com | ![](https://img.shields.io/badge/status-gold-%23ffd700.svg) | Johannes B. Gruber | |
| theguardian.com | ![](https://img.shields.io/badge/status-gold-%23ffd700.svg) | Johannes B. Gruber | |
| washingtonpost.com | ![](https://img.shields.io/badge/status-broken-%23D8634C) | Johannes B. Gruber | [#1](https://github.com/JBGruber/paperboy/issues/1) |
| washingtonpost.com | ![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg) | Johannes B. Gruber | [#3](https://github.com/JBGruber/paperboy/issues/3) |
| wsj.com | ![](https://img.shields.io/badge/status-gold-%23ffd700.svg) | Johannes B. Gruber | |

- ![](https://img.shields.io/badge/status-gold-%23ffd700.svg): Runs
Expand Down
4 changes: 2 additions & 2 deletions tests/testthat/test-expand.R
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
test_that("expandurls", {
expect_equal({
res <- pb_collect(url = "https://httpbin.org/")
res <- pb_collect(urls = "https://httpbin.org/")
c(nrow(res), ncol(res))
}, c(1, 5))
expect_warning(
pb_collect(url = "https://httpbin.org/delay/10", timeout = 1, ignore_fails = TRUE),
pb_collect(urls = "https://httpbin.org/delay/10", timeout = 1, ignore_fails = TRUE),
"1 job(s) did not finish before timeout. Think about increasing the timeout parameter. Enter ?pb_collect for help.",
fixed = TRUE
)
Expand Down

0 comments on commit 3d97232

Please sign in to comment.