From 15a0bbfac403af69bec8b4fbd7e335003127c4e1 Mon Sep 17 00:00:00 2001
From: JBGruber <j.gruber.1@research.gla.ac.uk>
Date: Sat, 21 Aug 2021 22:07:41 +0200
Subject: [PATCH] Added wsj.com scraper (#1)

---
 R/deliver_wsj_com.R | 54 +++++++++++++++++++++++++++++++++++++++++++++
 README.Rmd          |  2 +-
 README.md           |  2 +-
 3 files changed, 56 insertions(+), 2 deletions(-)
 create mode 100644 R/deliver_wsj_com.R

diff --git a/R/deliver_wsj_com.R b/R/deliver_wsj_com.R
new file mode 100644
index 0000000..70ee5e6
--- /dev/null
+++ b/R/deliver_wsj_com.R
@@ -0,0 +1,54 @@
+
+# Parse raw wsj.com article HTML into datetime, author, headline and text.
+# x: a tibble with a `content_raw` column holding one HTML document per row
+#   (and a `domain` column, used only for the progress message).
+# verbose: print progress? NULL falls back to option "paperboy_verbose".
+# Returns the parsed columns bound to x, passed through normalise_df().
+pb_deliver_paper.www_wsj_com <- function(x, verbose = NULL, ...) {
+
+  . <- NULL
+
+  if (is.null(verbose)) verbose <- getOption("paperboy_verbose", FALSE)
+
+  if (!inherits(x, "tbl_df"))
+    stop("Wrong object passed to internal deliver function: ", class(x))
+
+  if (verbose) message("\t...", nrow(x), " articles from ", x$domain[1])
+
+  pb <- make_pb(x)
+
+  purrr::map_df(x$content_raw, function(cont) {
+
+    if (verbose) pb$tick()
+
+    html <- rvest::read_html(cont)
+
+    # html_element() (singular) yields exactly one value, NA when the tag is
+    # missing, so every article contributes one row and cbind(x) stays aligned
+    datetime <- html %>%
+      rvest::html_element("[name=\"article.published\"]") %>%
+      rvest::html_attr("content") %>%
+      lubridate::as_datetime()
+
+    # headline: first <title> only; inline SVGs may carry further <title> tags
+    headline <- html %>%
+      rvest::html_element("title") %>%
+      rvest::html_text()
+
+    # author: all author meta tags collapsed into one comma-separated string
+    author <- html %>%
+      rvest::html_elements("[name=\"author\"]") %>%
+      rvest::html_attr("content") %>%
+      toString()
+
+    # text: every paragraph except those whose id is "footer" or "footer-*"
+    text <- html %>%
+      rvest::html_elements("p:not([id|=\"footer\"])") %>%
+      rvest::html_text2() %>%
+      paste(collapse = "\n")
+
+    tibble::tibble(datetime, author, headline, text)
+  }) %>%
+    cbind(x) %>%
+    normalise_df()
+}
diff --git a/README.Rmd b/README.Rmd
index 995472f..249ecf3 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -121,7 +121,7 @@ tibble::tribble(
   "dailymail.co.uk",           "![](https://img.shields.io/badge/status-gold-%23ffd700.svg)",  "Johannes B. Gruber", "",
   "newsweek.com",           "![](https://img.shields.io/badge/status-gold-%23ffd700.svg)",  "Johannes B. Gruber", "",
   "marketwatch.com",         "![](https://img.shields.io/badge/status-gold-%23ffd700.svg)",  "Johannes B. Gruber", "",
-  "wsj.com",         "![](https://img.shields.io/badge/status-broken-%23D8634C)",  "Johannes B. Gruber", "[#1](https://github.com/JBGruber/paperboy/issues/1)",
+  "wsj.com",         "![](https://img.shields.io/badge/status-gold-%23ffd700.svg)",  "Johannes B. Gruber", "",
   "nytimes.com",         "![](https://img.shields.io/badge/status-broken-%23D8634C)",  "Johannes B. Gruber", "[#1](https://github.com/JBGruber/paperboy/issues/1)",
   "nypost.com",         "![](https://img.shields.io/badge/status-broken-%23D8634C)",  "Johannes B. Gruber", "[#1](https://github.com/JBGruber/paperboy/issues/1)",
   "edition.cnn.com",         "![](https://img.shields.io/badge/status-broken-%23D8634C)",  "Johannes B. Gruber", "[#1](https://github.com/JBGruber/paperboy/issues/1)",
diff --git a/README.md b/README.md
index ec0b8ba..ae9487e 100644
--- a/README.md
+++ b/README.md
@@ -99,7 +99,7 @@ it via a pull request.
 | nytimes.com          | ![](https://img.shields.io/badge/status-broken-%23D8634C)     | Johannes B. Gruber | [#1](https://github.com/JBGruber/paperboy/issues/1) |
 | theguardian.com      | ![](https://img.shields.io/badge/status-gold-%23ffd700.svg)   | Johannes B. Gruber |                                                      |
 | washingtonpost.com   | ![](https://img.shields.io/badge/status-broken-%23D8634C)     | Johannes B. Gruber | [#1](https://github.com/JBGruber/paperboy/issues/1) |
-| wsj.com              | ![](https://img.shields.io/badge/status-broken-%23D8634C)     | Johannes B. Gruber | [#1](https://github.com/JBGruber/paperboy/issues/1) |
+| wsj.com              | ![](https://img.shields.io/badge/status-gold-%23ffd700.svg)   | Johannes B. Gruber |                                                      |
 
 -   ![](https://img.shields.io/badge/status-gold-%23ffd700.svg): Runs
     without known issues