Skip to content

Commit

Permalink
Added wsj.com scraper (#1)
Browse files Browse the repository at this point in the history
  • Loading branch information
JBGruber committed Aug 21, 2021
1 parent 3108a59 commit 15a0bbf
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 2 deletions.
54 changes: 54 additions & 0 deletions R/deliver_wsj_com.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@

# Parse raw wsj.com article HTML into datetime, author, headline and text.
#
# NOTE(review): judging by its name this is the S3 method of
# `pb_deliver_paper()` for class "www_wsj_com"; the generic is not visible
# from here -- confirm.
#
# Args:
#   x: a tibble (class "tbl_df") with one row per article. The columns
#     `content_raw` (HTML source of the article) and `domain` are read here.
#   verbose: truthy to print a status message and advance the progress bar.
#     NULL (the default) falls back to the "paperboy_verbose" option.
#   ...: not used in this function.
#
# Returns:
#   The value of `normalise_df()` applied to the parsed columns (`datetime`,
#   `author`, `headline`, `text`) column-bound to `x`.
#
# Raises:
#   An error if `x` is not a tibble.
pb_deliver_paper.www_wsj_com <- function(x, verbose = NULL, ...) {

  if (is.null(verbose)) verbose <- getOption("paperboy_verbose")
  # An unset option leaves `verbose` as NULL, which would make `if (verbose)`
  # fail; treat anything that is not a single TRUE-like value as "quiet".
  verbose <- isTRUE(as.logical(verbose))

  if (!inherits(x, "tbl_df")) {
    stop(
      "Wrong object passed to internal deliver function: ",
      paste(class(x), collapse = ", ")
    )
  }

  if (verbose) message("\t...", nrow(x), " articles from ", x$domain[1])

  pb <- make_pb(x)

  parsed <- purrr::map_df(x$content_raw, function(cont) {

    if (verbose) pb$tick()

    html <- rvest::read_html(cont)

    # datetime
    # html_element() (singular) yields the first match or a missing node, so
    # exactly one value per article comes back even when the meta tag is
    # absent or duplicated.
    published <- html %>%
      rvest::html_element("[name=\"article.published\"]") %>%
      rvest::html_attr("content")
    datetime <- if (is.na(published)) {
      # same class and time zone that lubridate::as_datetime() produces
      as.POSIXct(NA, tz = "UTC")
    } else {
      lubridate::as_datetime(published)
    }

    # headline
    # Only the first <title> in document order is used: inline SVG icons may
    # carry their own <title> nodes, which would otherwise yield several rows
    # for a single article.
    headline <- html %>%
      rvest::html_element("title") %>%
      rvest::html_text()

    # author
    # toString() collapses zero, one or many author tags into one string.
    author <- html %>%
      rvest::html_elements("[name=\"author\"]") %>%
      rvest::html_attr("content") %>%
      toString()

    # text
    # every paragraph except those whose id is "footer" or starts with
    # "footer-"
    text <- html %>%
      rvest::html_elements("p:not([id|=\"footer\"])") %>%
      rvest::html_text2() %>%
      paste(collapse = "\n")

    # Every column has length one here, so each article contributes exactly
    # one row and the result lines up with `x` in the cbind() below.
    tibble::tibble(
      datetime,
      author,
      headline,
      text
    )
  })

  normalise_df(cbind(parsed, x))
}
2 changes: 1 addition & 1 deletion README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ tibble::tribble(
"dailymail.co.uk", "![](https://img.shields.io/badge/status-gold-%23ffd700.svg)", "Johannes B. Gruber", "",
"newsweek.com", "![](https://img.shields.io/badge/status-gold-%23ffd700.svg)", "Johannes B. Gruber", "",
"marketwatch.com", "![](https://img.shields.io/badge/status-gold-%23ffd700.svg)", "Johannes B. Gruber", "",
"wsj.com", "![](https://img.shields.io/badge/status-broken-%23D8634C)", "Johannes B. Gruber", "[#1](https://github.com/JBGruber/paperboy/issues/1)",
"wsj.com", "![](https://img.shields.io/badge/status-gold-%23ffd700.svg)", "Johannes B. Gruber", "",
"nytimes.com", "![](https://img.shields.io/badge/status-broken-%23D8634C)", "Johannes B. Gruber", "[#1](https://github.com/JBGruber/paperboy/issues/1)",
"nypost.com", "![](https://img.shields.io/badge/status-broken-%23D8634C)", "Johannes B. Gruber", "[#1](https://github.com/JBGruber/paperboy/issues/1)",
"edition.cnn.com", "![](https://img.shields.io/badge/status-broken-%23D8634C)", "Johannes B. Gruber", "[#1](https://github.com/JBGruber/paperboy/issues/1)",
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ it via a pull request.
| nytimes.com | ![](https://img.shields.io/badge/status-broken-%23D8634C) | Johannes B. Gruber | [#1](https://github.com/JBGruber/paperboy/issues/1) |
| theguardian.com | ![](https://img.shields.io/badge/status-gold-%23ffd700.svg) | Johannes B. Gruber | |
| washingtonpost.com | ![](https://img.shields.io/badge/status-broken-%23D8634C) | Johannes B. Gruber | [#1](https://github.com/JBGruber/paperboy/issues/1) |
| wsj.com | ![](https://img.shields.io/badge/status-broken-%23D8634C) | Johannes B. Gruber | [#1](https://github.com/JBGruber/paperboy/issues/1) |
| wsj.com | ![](https://img.shields.io/badge/status-gold-%23ffd700.svg) | Johannes B. Gruber | |

- ![](https://img.shields.io/badge/status-gold-%23ffd700.svg): Runs
without known issues
Expand Down

0 comments on commit 15a0bbf

Please sign in to comment.