# Parse raw wsj.com article pages into one row of metadata and text each.
#
# Internal deliver function for objects of class "www_wsj_com" (judging by
# its name this is an S3 method of `pb_deliver_paper()` -- confirm against
# the generic, which is not defined in this file).
#
# Args:
#   x        A tibble ("tbl_df") with one row per article. The columns read
#            here are `content_raw` (the raw HTML of each page) and `domain`
#            (only its first value is shown in the status message).
#   verbose  Logical; print a status message and advance the progress bar.
#            `NULL` (the default) reads the option "paperboy_verbose"; an
#            unset option is treated as `FALSE`.
#   ...      Not used by this function.
#
# Returns:
#   The result of `normalise_df()` applied to the parsed columns
#   (`datetime`, `author`, `headline`, `text`) column-bound to `x`.
#
# Raises:
#   An error if `x` is not a tibble.
pb_deliver_paper.www_wsj_com <- function(x, verbose = NULL, ...) {

  if (is.null(verbose)) verbose <- getOption("paperboy_verbose")
  # `if (NULL)` is an error, so an unset option has to collapse to FALSE.
  verbose <- isTRUE(as.logical(verbose))

  if (!inherits(x, "tbl_df")) {
    stop(
      "Wrong object passed to internal deliver function: ",
      toString(class(x))
    )
  }

  if (verbose) message("\t...", nrow(x), " articles from ", x$domain[1])

  pb <- make_pb(x)

  # Turn the raw HTML of a single article into a one-row tibble.
  parse_article <- function(cont) {

    if (verbose) pb$tick()

    html <- rvest::read_html(cont)

    # datetime: `html_element()` (singular) yields exactly one value per
    # page -- the first match, or NA when the tag is absent -- so every
    # article contributes exactly one row and stays aligned with `x`.
    datetime <- html %>%
      rvest::html_element("[name=\"article.published\"]") %>%
      rvest::html_attr("content") %>%
      lubridate::as_datetime()

    # headline: first <title> only; a page can carry further <title>
    # nodes, which would otherwise produce extra rows.
    headline <- html %>%
      rvest::html_element("title") %>%
      rvest::html_text()

    # author: all author meta tags, collapsed into one comma-separated string
    author <- html %>%
      rvest::html_elements("[name=\"author\"]") %>%
      rvest::html_attr("content") %>%
      toString()

    # text: every paragraph except those whose id is "footer" or starts
    # with "footer-", joined by newlines
    text <- html %>%
      rvest::html_elements("p:not([id|=\"footer\"])") %>%
      rvest::html_text2() %>%
      paste(collapse = "\n")

    tibble::tibble(
      datetime = datetime,
      author = author,
      headline = headline,
      text = text
    )
  }

  parsed <- purrr::map_df(x$content_raw, parse_article)

  # Parsed columns come first, followed by the original columns of `x`.
  normalise_df(cbind(parsed, x))
}
Gruber", "", "marketwatch.com", "![](https://img.shields.io/badge/status-gold-%23ffd700.svg)", "Johannes B. Gruber", "", - "wsj.com", "![](https://img.shields.io/badge/status-broken-%23D8634C)", "Johannes B. Gruber", "[#1](https://github.com/JBGruber/paperboy/issues/1)", + "wsj.com", "![](https://img.shields.io/badge/status-gold-%23ffd700.svg)", "Johannes B. Gruber", "", "nytimes.com", "![](https://img.shields.io/badge/status-broken-%23D8634C)", "Johannes B. Gruber", "[#1](https://github.com/JBGruber/paperboy/issues/1)", "nypost.com", "![](https://img.shields.io/badge/status-broken-%23D8634C)", "Johannes B. Gruber", "[#1](https://github.com/JBGruber/paperboy/issues/1)", "edition.cnn.com", "![](https://img.shields.io/badge/status-broken-%23D8634C)", "Johannes B. Gruber", "[#1](https://github.com/JBGruber/paperboy/issues/1)", diff --git a/README.md b/README.md index ec0b8ba..ae9487e 100644 --- a/README.md +++ b/README.md @@ -99,7 +99,7 @@ it via a pull request. | nytimes.com | ![](https://img.shields.io/badge/status-broken-%23D8634C) | Johannes B. Gruber | [#1](https://github.com/JBGruber/paperboy/issues/1) | | theguardian.com | ![](https://img.shields.io/badge/status-gold-%23ffd700.svg) | Johannes B. Gruber | | | washingtonpost.com | ![](https://img.shields.io/badge/status-broken-%23D8634C) | Johannes B. Gruber | [#1](https://github.com/JBGruber/paperboy/issues/1) | -| wsj.com | ![](https://img.shields.io/badge/status-broken-%23D8634C) | Johannes B. Gruber | [#1](https://github.com/JBGruber/paperboy/issues/1) | +| wsj.com | ![](https://img.shields.io/badge/status-gold-%23ffd700.svg) | Johannes B. Gruber | | - ![](https://img.shields.io/badge/status-gold-%23ffd700.svg): Runs without known issues