Added theguardian scraper (#1)
JBGruber committed Jul 14, 2021
1 parent 8556014 commit 0efc2fd
Showing 20 changed files with 192 additions and 111 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
@@ -4,3 +4,4 @@
/tests/local-files
^\.github$
^codecov\.yml$
Update_package.R
5 changes: 5 additions & 0 deletions .github/workflows/R-CMD-check.yaml
@@ -67,6 +67,7 @@ jobs:
run: |
remotes::install_deps(dependencies = TRUE)
remotes::install_cran("rcmdcheck")
remotes::install_cran("covr")
shell: Rscript {0}

- name: Check
@@ -77,6 +78,10 @@
rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check")
shell: Rscript {0}

- name: Test coverage
run: covr::codecov()
shell: Rscript {0}

- name: Upload check results
if: failure()
uses: actions/upload-artifact@main
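The new workflow step uploads coverage with `covr::codecov()`. A minimal sketch of running the same check locally, assuming the package root as the working directory:

``` r
# Same covr package the workflow installs; run from the package root
library(covr)
cov <- package_coverage()  # runs the test suite and records line coverage
percent_coverage(cov)      # the overall percentage codecov() would report
```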
48 changes: 0 additions & 48 deletions .github/workflows/test-coverage.yaml

This file was deleted.

3 changes: 3 additions & 0 deletions .gitignore
@@ -3,3 +3,6 @@
.Rdata
.httr-oauth
.DS_Store
tests/spelling.Rout.save
tests/local-files
Update_package.R
10 changes: 8 additions & 2 deletions DESCRIPTION
@@ -1,7 +1,7 @@
Package: paperboy
Title: Comprehensive collection of news media scrapers
Version: 0.0.1.9000
Date: 2021-07-11
Date: 2021-07-14
Authors@R: person("Johannes", "Gruber", email = "[email protected]",
role = c("aut", "cre"))
Description: A comprehensive collection of webscraping scripts for news media sites.
@@ -11,11 +11,17 @@ License: GPL-3
Imports:
curl,
dplyr,
magrittr,
progress,
lubridate,
purrr,
rvest,
rlang,
tibble,
tidyr,
tidyselect,
urltools
urltools,
utils
Suggests:
knitr,
testthat,
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -6,6 +6,8 @@ S3method(deliver,www_buzzfeed_com)
S3method(deliver,www_forbes_com)
S3method(deliver,www_huffingtonpost_co_uk)
S3method(deliver,www_theguardian_com)
export("%>%")
export(deliver)
export(expandurls)
importFrom(magrittr,"%>%")
importFrom(rlang,":=")
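The added `export("%>%")` re-exports the magrittr pipe, so users get `%>%` as soon as paperboy is attached, without loading magrittr themselves:

``` r
# illustration: the pipe works directly after attaching the package
library(paperboy)
c(1, 2, 3) %>% sum()
#> [1] 6
```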
33 changes: 13 additions & 20 deletions R/deliver.R
@@ -4,17 +4,19 @@
#' and call the appropriate webscraper.
#'
#' @param url The URL of the web article.
#' @param verbose A logical flag indicating whether information should be
#' printed to the screen.
#' @param ... Passed on to respective scraper.
#'
#' @return A data.frame (tibble) with media data and full text.
#' @export
deliver <- function(url, ...) {
deliver <- function(url, verbose = TRUE, ...) {
UseMethod("deliver")
}

#' @rdname deliver
#' @export
deliver.default <- function(url, ...) {
deliver.default <- function(url, verbose = TRUE, ...) {
if ("domain" %in% names(url)) {
warning("No method for ", url$domain[1], " yet. Url ignored.")
NULL
@@ -25,40 +27,31 @@ deliver.default <- function(url, ...) {

#' @rdname deliver
#' @export
deliver.character <- function(url, ...) {
deliver.character <- function(url, verbose = TRUE, ...) {

pages <- expandurls(url)
pages <- expandurls(url, verbose = verbose)

pages <- split(pages, pages$domain, drop = TRUE)

out <- lapply(pages, function(u) {
class(u) <- c(gsub(".", "_", u$domain, fixed = TRUE), class(u))
deliver(u, ...)
class(u) <- c(
gsub(".", "_", utils::head(u$domain, 1), fixed = TRUE),
class(u)
)
deliver(u, verbose = verbose, ...)
})

return(dplyr::bind_rows(out))
}

#' @rdname deliver
#' @export
deliver.www_theguardian_com <- function(url, ...) {
deliver.www_buzzfeed_com <- function(url, verbose = TRUE, ...) {
return(normalise_df(url))
}

#' @rdname deliver
#' @export
deliver.www_huffingtonpost_co_uk <- function(url, ...) {
return(normalise_df(url))
}

#' @rdname deliver
#' @export
deliver.www_buzzfeed_com <- function(url, ...) {
return(normalise_df(url))
}

#' @rdname deliver
#' @export
deliver.www_forbes_com <- function(url, ...) {
deliver.www_forbes_com <- function(url, verbose = TRUE, ...) {
return(normalise_df(url))
}
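`deliver.character()` turns each URL's domain into an S3 class so the matching scraper is picked by ordinary method dispatch. A small illustration of the pattern with hypothetical data (not package code):

``` r
library(tibble)

pages <- tibble(
  expanded_url = "https://www.theguardian.com/example",
  domain       = "www.theguardian.com"
)

# dots in the domain become underscores to form a valid S3 class name
class(pages) <- c(gsub(".", "_", pages$domain[1], fixed = TRUE), class(pages))
class(pages)[1]
#> [1] "www_theguardian_com"
# deliver(pages) now dispatches to deliver.www_theguardian_com()
```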
5 changes: 5 additions & 0 deletions R/deliver.huffingtonpost.co.uk.R
@@ -0,0 +1,5 @@
#' @rdname deliver
#' @export
deliver.www_huffingtonpost_co_uk <- function(url, ...) {
return(normalise_df(url))
}
58 changes: 58 additions & 0 deletions R/deliver.theguardian.com.R
@@ -0,0 +1,58 @@
#' @rdname deliver
#' @export
deliver.www_theguardian_com <- function(url, verbose = TRUE, ...) {

if (!"tbl_df" %in% class(url))
stop("Wrong object passed to internal deliver function: ", class(url))

if (verbose) message("\t...fetching theguardian.com articles")

pb <- make_pb(url)

purrr::map_df(url$expanded_url, function(u) {

if (verbose) pb$tick()

html <- rvest::read_html(u)

# datetime
datetime <- html %>%
rvest::html_elements("[property=\"article:published_time\"]") %>%
rvest::html_attr("content") %>%
lubridate::as_datetime()

# headline
headline <- html %>%
rvest::html_elements("[property=\"og:title\"]") %>%
rvest::html_attr("content")

# author
author <- html %>%
rvest::html_elements("[property=\"article:author\"]") %>%
rvest::html_attr("content")

if (length(author) == 0) {
author <- html %>%
rvest::html_elements("[name=\"author\"]") %>%
rvest::html_attr("content")
}

if (length(author) > 1) author <- toString(author)

# text
text <- html %>%
rvest::html_elements("p") %>%
rvest::html_text() %>%
paste(collapse = "\n\n")

tibble::tibble(
datetime,
author,
headline,
text
)
}) %>%
cbind(url) %>%
normalise_df() %>%
return()
}
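The scraper pulls metadata from standard Open Graph and `article:*` `<meta>` tags, a pattern that transfers to most news pages. A sketch of the same extraction in isolation (placeholder URL; assumes the page exposes these tags):

``` r
library(rvest)
library(magrittr)
library(lubridate)

html <- read_html("https://www.theguardian.com/example")  # placeholder URL

# publication datetime from the article:published_time meta tag
html %>%
  html_elements("[property=\"article:published_time\"]") %>%
  html_attr("content") %>%
  as_datetime()

# headline from the og:title meta tag
html %>%
  html_elements("[property=\"og:title\"]") %>%
  html_attr("content")
```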
10 changes: 9 additions & 1 deletion R/expandurls.R
@@ -5,6 +5,8 @@
#' seconds). If the query finishes earlier, results are returned immediately.
#' @param ignore_fails Normally the function throws an error when a URL cannot
#' be reached due to connection issues. Setting this to TRUE ignores such failures.
#' @param verbose A logical flag indicating whether information should be
#' printed to the screen.
#' @param ... Currently not used
#'
#' @return Character object with full (i.e., unshortened) URLs.
@@ -14,6 +16,7 @@
expandurls <- function(url,
timeout = 15,
ignore_fails = FALSE,
verbose = FALSE,
...) {

# prevent duplicates
@@ -24,7 +27,7 @@ expandurls <- function(url,
pages <- list()

# create different parser function for each request to identify results
parse_response <- function(url){
parse_response <- function(url) {
function(req) {
pages[[url]] <<- tibble::tibble(
expanded_url = req$url,
Expand Down Expand Up @@ -76,5 +79,10 @@ expandurls <- function(url,
)
}

if (verbose) message(length(url),
" links from ",
length(unique(out$domain)),
" domains unshortened.")

return(out)
}
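`expandurls()` resolves shortened URLs asynchronously; each request gets its own `parse_response()` closure so the result can be matched back to the original URL via `<<-`. A stripped-down sketch of the same pattern using curl's multi interface:

``` r
library(curl)

pages <- list()
parse_response <- function(url) {
  function(req) {
    # write into the enclosing environment, keyed by the original url
    pages[[url]] <<- req$url
  }
}

u <- "https://tinyurl.com/386e98k5"
multi_add(new_handle(url = u), done = parse_response(u))
multi_run(timeout = 15)
pages
```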
14 changes: 13 additions & 1 deletion R/utils.R
@@ -1,3 +1,15 @@
#' @importFrom magrittr %>%
#' @export
magrittr::`%>%`

#
make_pb <- function(df) {
progress::progress_bar$new(
format = "[:bar] :percent eta: :eta",
total = nrow(df)
)
}

#
normalise_df <- function(df) {
df <- tibble::as_tibble(df)
@@ -7,8 +19,8 @@ normalise_df <- function(df) {
"domain",
"status",
"datetime",
"headline",
"author",
"headline",
"text"
)
missing_cols <- setdiff(expected_cols, colnames(df))
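`make_pb()` is an internal helper that sizes a progress bar to the number of rows a scraper will process. Hypothetical usage from inside the package:

``` r
df <- tibble::tibble(expanded_url = c("https://a.example", "https://b.example"))
pb <- make_pb(df)                        # one tick per row
for (i in seq_len(nrow(df))) pb$tick()   # advances [:bar] :percent eta: :eta
```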
7 changes: 7 additions & 0 deletions README.Rmd
@@ -13,6 +13,13 @@ knitr::opts_chunk$set(
)
knit_print.tbl_df = function(x, ...) {
x <- as.data.frame(lapply(x, function(c) {
if (is.character(c)) {
ifelse(nchar(c) > 25, paste0(substr(c, 1, 25), "..."), c)
} else {
c
}
}))
res = paste(c("", "", knitr::kable(x)), collapse = "\n")
knitr::asis_output(res)
}
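The added `knit_print` logic shortens long character cells so README tables stay readable. The core expression, run on its own:

``` r
x <- c("short", "a string that is definitely longer than twenty-five characters")
ifelse(nchar(x) > 25, paste0(substr(x, 1, 25), "..."), x)
#> [1] "short"                        "a string that is definite..."
```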
33 changes: 18 additions & 15 deletions README.md
@@ -43,12 +43,14 @@ links to a media article to the main function, `deliver`:
``` r
library(paperboy)
df <- deliver("https://tinyurl.com/386e98k5")
#> 1 links from 1 domains unshortened.
#> ...fetching theguardian.com articles
df
```

| url | expanded\_url | domain | status | datetime | headline | author | text | misc |
|:-------------------------------|:----------------------------------------------------------------------------------|:--------------------|-------:|:---------|:---------|:-------|:-----|:-----|
| <https://tinyurl.com/386e98k5> | <https://www.theguardian.com/tv-and-radio/2021/jul/12/should-marge-divorce-homer> | www.theguardian.com | 200 | NA | NA | NA | NA | NULL |
| url | expanded\_url | domain | status | datetime | author | headline | text |
|:-----------------------------|:-----------------------------|:--------------------|-------:|:--------------------|:-----------------------------|:---------------------------|:---------------------------|
| <https://tinyurl.com/386e9> | <https://www.theguardian.c> | www.theguardian.com | 200 | 2021-07-12 12:00:13 | <https://www.theguardian.c> | ‘A woman trapped in an im… | The Simpson couple have e… |

The returned `data.frame` contains important meta information about the
news items and their full text. Notice that the function had no problem
@@ -58,8 +60,9 @@ therefore often encounter this warning:

``` r
deliver(url = "google.com")
#> Warning in deliver.default(u, ...): No method for www.google.com yet. Url
#> ignored.
#> 1 links from 1 domains unshortened.
#> Warning in deliver.default(u, verbose = verbose, ...): No method for
#> www.google.com yet. Url ignored.
```

If you enter a vector of multiple URLs, the unsupported ones will be
@@ -71,10 +74,10 @@ column will be different from `200` and contain `NA`s.

Every webscraper should retrieve a `tibble` with the following format:

| url | expanded\_url | domain | status | datetime | headline | author | text | misc |
|:------------------------------------|:--------------|:-----------|:-----------------|:---------------------|:-------------|:-----------|:--------------|:--------------------------------------------------------------------------|
| character | character | character | integer | as.POSIXct | character | character | character | list |
| the original url fed to the scraper | the full url | the domain | http status code | publication datetime | the headline | the author | the full text | all other information that can be consistently found on a specific outlet |
| url | expanded\_url | domain | status | datetime | headline | author | text | misc |
|:---------------------------|:--------------|:-----------|:-----------------|:---------------------|:-------------|:-----------|:--------------|:---------------------------|
| character | character | character | integer | as.POSIXct | character | character | character | list |
| the original url fed to t… | the full url | the domain | http status code | publication datetime | the headline | the author | the full text | all other information tha… |
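A conforming scraper return value, sketched as a skeleton with illustrative values:

``` r
tibble::tibble(
  url          = "https://tinyurl.com/386e98k5",
  expanded_url = "https://www.theguardian.com/example",
  domain       = "www.theguardian.com",
  status       = 200L,
  datetime     = as.POSIXct("2021-07-12 12:00:13", tz = "UTC"),
  headline     = "example headline",
  author       = "example author",
  text         = "the full article text",
  misc         = list(NULL)   # list column for outlet-specific extras
)
```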

Since some outlets will give you additional information, the `misc`
column was included so these can be retained. If you have a scraper you
@@ -84,12 +87,12 @@ it via a pull request.

## Available Scrapers

| domain | status | author | note |
|:---------------------|:-------|:-------------------|:-----------------------------------------------------|
| theguardian.com | Broken | Johannes B. Gruber | [\#1](https://github.com/JBGruber/paperboy/issues/1) |
| huffingtonpost.co.uk | Broken | Johannes B. Gruber | [\#1](https://github.com/JBGruber/paperboy/issues/1) |
| buzzfeed.com | Broken | Johannes B. Gruber | [\#1](https://github.com/JBGruber/paperboy/issues/1) |
| forbes.com | Broken | Johannes B. Gruber | [\#1](https://github.com/JBGruber/paperboy/issues/1) |
| domain | status | author | note |
|:---------------------|:-------|:-------------------|:--------------------------------|
| theguardian.com | Broken | Johannes B. Gruber | \[\#1\](<https://github.com/J> |
| huffingtonpost.co.uk | Broken | Johannes B. Gruber | \[\#1\](<https://github.com/J> |
| buzzfeed.com | Broken | Johannes B. Gruber | \[\#1\](<https://github.com/J> |
| forbes.com | Broken | Johannes B. Gruber | \[\#1\](<https://github.com/J> |

- **Gold**: Runs without any issues
- **Silver**: Runs with some issues