Added theguardian scraper (#1)
JBGruber committed Jul 14, 2021
1 parent 8556014 commit 0efc2fd
Showing 20 changed files with 192 additions and 111 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
@@ -4,3 +4,4 @@
/tests/local-files
^\.github$
^codecov\.yml$
Update_package.R
5 changes: 5 additions & 0 deletions .github/workflows/R-CMD-check.yaml
@@ -67,6 +67,7 @@ jobs:
run: |
remotes::install_deps(dependencies = TRUE)
remotes::install_cran("rcmdcheck")
remotes::install_cran("covr")
shell: Rscript {0}

- name: Check
@@ -77,6 +78,10 @@
rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check")
shell: Rscript {0}

- name: Test coverage
run: covr::codecov()
shell: Rscript {0}

- name: Upload check results
if: failure()
uses: actions/upload-artifact@main
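The new workflow step uploads coverage with `covr::codecov()`. A minimal sketch of running the same check locally, assuming the package root as the working directory:

``` r
# Same covr package the workflow installs; run from the package root
library(covr)
cov <- package_coverage()  # runs the test suite and records line coverage
percent_coverage(cov)      # the overall percentage codecov() would report
```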
48 changes: 0 additions & 48 deletions .github/workflows/test-coverage.yaml

This file was deleted.

3 changes: 3 additions & 0 deletions .gitignore
@@ -3,3 +3,6 @@
.Rdata
.httr-oauth
.DS_Store
tests/spelling.Rout.save
tests/local-files
Update_package.R
10 changes: 8 additions & 2 deletions DESCRIPTION
@@ -1,7 +1,7 @@
Package: paperboy
Title: Comprehensive collection of news media scrapers
Version: 0.0.1.9000
Date: 2021-07-11
Date: 2021-07-14
Authors@R: person("Johannes", "Gruber", email = "[email protected]",
role = c("aut", "cre"))
Description: A comprehensive collection of webscraping scripts for news media sites.
@@ -11,11 +11,17 @@ License: GPL-3
Imports:
curl,
dplyr,
magrittr,
progress,
lubridate,
purrr,
rvest,
rlang,
tibble,
tidyr,
tidyselect,
urltools
urltools,
utils
Suggests:
knitr,
testthat,
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -6,6 +6,8 @@ S3method(deliver,www_buzzfeed_com)
S3method(deliver,www_forbes_com)
S3method(deliver,www_huffingtonpost_co_uk)
S3method(deliver,www_theguardian_com)
export("%>%")
export(deliver)
export(expandurls)
importFrom(magrittr,"%>%")
importFrom(rlang,":=")
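The added `export("%>%")` re-exports the magrittr pipe, so users get `%>%` as soon as paperboy is attached, without loading magrittr themselves:

``` r
# illustration: the pipe works directly after attaching the package
library(paperboy)
c(1, 2, 3) %>% sum()
#> [1] 6
```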
33 changes: 13 additions & 20 deletions R/deliver.R
@@ -4,17 +4,19 @@
#' and call the appropriate webscraper.
#'
#' @param url The URL of the web article.
#' @param verbose A logical flag indicating whether information should be
#' printed to the screen.
#' @param ... Passed on to respective scraper.
#'
#' @return A data.frame (tibble) with media data and full text.
#' @export
deliver <- function(url, ...) {
deliver <- function(url, verbose = TRUE, ...) {
UseMethod("deliver")
}

#' @rdname deliver
#' @export
deliver.default <- function(url, ...) {
deliver.default <- function(url, verbose = TRUE, ...) {
if ("domain" %in% names(url)) {
warning("No method for ", url$domain[1], " yet. Url ignored.")
NULL
@@ -25,40 +27,31 @@ deliver.default <- function(url, ...) {

#' @rdname deliver
#' @export
deliver.character <- function(url, ...) {
deliver.character <- function(url, verbose = TRUE, ...) {

pages <- expandurls(url)
pages <- expandurls(url, verbose = verbose)

pages <- split(pages, pages$domain, drop = TRUE)

out <- lapply(pages, function(u) {
class(u) <- c(gsub(".", "_", u$domain, fixed = TRUE), class(u))
deliver(u, ...)
class(u) <- c(
gsub(".", "_", utils::head(u$domain, 1), fixed = TRUE),
class(u)
)
deliver(u, verbose = verbose, ...)
})

return(dplyr::bind_rows(out))
}

#' @rdname deliver
#' @export
deliver.www_theguardian_com <- function(url, ...) {
deliver.www_buzzfeed_com <- function(url, verbose = TRUE, ...) {
return(normalise_df(url))
}

#' @rdname deliver
#' @export
deliver.www_huffingtonpost_co_uk <- function(url, ...) {
return(normalise_df(url))
}

#' @rdname deliver
#' @export
deliver.www_buzzfeed_com <- function(url, ...) {
return(normalise_df(url))
}

#' @rdname deliver
#' @export
deliver.www_forbes_com <- function(url, ...) {
deliver.www_forbes_com <- function(url, verbose = TRUE, ...) {
return(normalise_df(url))
}
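`deliver.character()` turns each URL's domain into an S3 class so the matching scraper is picked by ordinary method dispatch. A small illustration of the pattern with hypothetical data (not package code):

``` r
library(tibble)

pages <- tibble(
  expanded_url = "https://www.theguardian.com/example",
  domain       = "www.theguardian.com"
)

# dots in the domain become underscores to form a valid S3 class name
class(pages) <- c(gsub(".", "_", pages$domain[1], fixed = TRUE), class(pages))
class(pages)[1]
#> [1] "www_theguardian_com"
# deliver(pages) now dispatches to deliver.www_theguardian_com()
```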
5 changes: 5 additions & 0 deletions R/deliver.huffingtonpost.co.uk.R
@@ -0,0 +1,5 @@
#' @rdname deliver
#' @export
deliver.www_huffingtonpost_co_uk <- function(url, ...) {
return(normalise_df(url))
}
58 changes: 58 additions & 0 deletions R/deliver.theguardian.com.R
@@ -0,0 +1,58 @@
#' @rdname deliver
#' @export
deliver.www_theguardian_com <- function(url, verbose = TRUE, ...) {

if (!"tbl_df" %in% class(url))
stop("Wrong object passed to internal deliver function: ", class(url))

if (verbose) message("\t...fetching theguardian.com articles")

pb <- make_pb(url)

purrr::map_df(url$expanded_url, function(u) {

if (verbose) pb$tick()

html <- rvest::read_html(u)

# datetime
datetime <- html %>%
rvest::html_elements("[property=\"article:published_time\"]") %>%
rvest::html_attr("content") %>%
lubridate::as_datetime()

# headline
headline <- html %>%
rvest::html_elements("[property=\"og:title\"]") %>%
rvest::html_attr("content")

# author
author <- html %>%
rvest::html_elements("[property=\"article:author\"]") %>%
rvest::html_attr("content")

if (length(author) == 0) {
author <- html %>%
rvest::html_elements("[name=\"author\"]") %>%
rvest::html_attr("content")
}

if (length(author) > 1) author <- toString(author)

# text
text <- html %>%
rvest::html_elements("p") %>%
rvest::html_text() %>%
paste(collapse = "\n\n")

tibble::tibble(
datetime,
author,
headline,
text
)
}) %>%
cbind(url) %>%
normalise_df() %>%
return()
}
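The scraper pulls metadata from standard Open Graph and `article:*` `<meta>` tags, a pattern that transfers to most news pages. A sketch of the same extraction in isolation (placeholder URL; assumes the page exposes these tags):

``` r
library(rvest)
library(magrittr)
library(lubridate)

html <- read_html("https://www.theguardian.com/example")  # placeholder URL

# publication datetime from the article:published_time meta tag
html %>%
  html_elements("[property=\"article:published_time\"]") %>%
  html_attr("content") %>%
  as_datetime()

# headline from the og:title meta tag
html %>%
  html_elements("[property=\"og:title\"]") %>%
  html_attr("content")
```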
10 changes: 9 additions & 1 deletion R/expandurls.R
@@ -5,6 +5,8 @@
#' seconds). If the query finishes earlier, results are returned immediately.
#' @param ignore_fails Normally the function throws an error when a URL cannot
#' be reached due to connection issues. Setting this to TRUE ignores such failures.
#' @param verbose A logical flag indicating whether information should be
#' printed to the screen.
#' @param ... Currently not used
#'
#' @return Character object with full (i.e., unshortened) URLs.
@@ -14,6 +16,7 @@
expandurls <- function(url,
timeout = 15,
ignore_fails = FALSE,
verbose = FALSE,
...) {

# prevent duplicates
@@ -24,7 +27,7 @@ expandurls <- function(url,
pages <- list()

# create different parser function for each request to identify results
parse_response <- function(url){
parse_response <- function(url) {
function(req) {
pages[[url]] <<- tibble::tibble(
expanded_url = req$url,
Expand Down Expand Up @@ -76,5 +79,10 @@ expandurls <- function(url,
)
}

if (verbose) message(length(url),
" links from ",
length(unique(out$domain)),
" domains unshortened.")

return(out)
}
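`expandurls()` resolves shortened URLs asynchronously; each request gets its own `parse_response()` closure so the result can be matched back to the original URL via `<<-`. A stripped-down sketch of the same pattern using curl's multi interface:

``` r
library(curl)

pages <- list()
parse_response <- function(url) {
  function(req) {
    # write into the enclosing environment, keyed by the original url
    pages[[url]] <<- req$url
  }
}

u <- "https://tinyurl.com/386e98k5"
multi_add(new_handle(url = u), done = parse_response(u))
multi_run(timeout = 15)
pages
```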
14 changes: 13 additions & 1 deletion R/utils.R
@@ -1,3 +1,15 @@
#' @importFrom magrittr %>%
#' @export
magrittr::`%>%`

#
make_pb <- function(df) {
progress::progress_bar$new(
format = "[:bar] :percent eta: :eta",
total = nrow(df)
)
}

#
normalise_df <- function(df) {
df <- tibble::as_tibble(df)
@@ -7,8 +19,8 @@ normalise_df <- function(df) {
"domain",
"status",
"datetime",
"headline",
"author",
"headline",
"text"
)
missing_cols <- setdiff(expected_cols, colnames(df))
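`make_pb()` is an internal helper that sizes a progress bar to the number of rows a scraper will process. Hypothetical usage from inside the package:

``` r
df <- tibble::tibble(expanded_url = c("https://a.example", "https://b.example"))
pb <- make_pb(df)                        # one tick per row
for (i in seq_len(nrow(df))) pb$tick()   # advances [:bar] :percent eta: :eta
```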
7 changes: 7 additions & 0 deletions README.Rmd
@@ -13,6 +13,13 @@ knitr::opts_chunk$set(
)
knit_print.tbl_df = function(x, ...) {
x <- as.data.frame(lapply(x, function(c) {
if (is.character(c)) {
ifelse(nchar(c) > 25, paste0(substr(c, 1, 25), "..."), c)
} else {
c
}
}))
res = paste(c("", "", knitr::kable(x)), collapse = "\n")
knitr::asis_output(res)
}
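The added `knit_print` logic shortens long character cells so README tables stay readable. The core expression, run on its own:

``` r
x <- c("short", "a string that is definitely longer than twenty-five characters")
ifelse(nchar(x) > 25, paste0(substr(x, 1, 25), "..."), x)
#> [1] "short"                        "a string that is definite..."
```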
33 changes: 18 additions & 15 deletions README.md
@@ -43,12 +43,14 @@ links to a media article to the main function, `deliver`:
``` r
library(paperboy)
df <- deliver("https://tinyurl.com/386e98k5")
#> 1 links from 1 domains unshortened.
#> ...fetching theguardian.com articles
df
```

| url | expanded\_url | domain | status | datetime | headline | author | text | misc |
|:-------------------------------|:----------------------------------------------------------------------------------|:--------------------|-------:|:---------|:---------|:-------|:-----|:-----|
| <https://tinyurl.com/386e98k5> | <https://www.theguardian.com/tv-and-radio/2021/jul/12/should-marge-divorce-homer> | www.theguardian.com | 200 | NA | NA | NA | NA | NULL |
| url | expanded\_url | domain | status | datetime | author | headline | text |
|:-----------------------------|:-----------------------------|:--------------------|-------:|:--------------------|:-----------------------------|:---------------------------|:---------------------------|
| <https://tinyurl.com/386e9> | <https://www.theguardian.c> | www.theguardian.com | 200 | 2021-07-12 12:00:13 | <https://www.theguardian.c> | ‘A woman trapped in an im… | The Simpson couple have e… |

The returned `data.frame` contains important meta information about the
news items and their full text. Notice that the function had no problem
@@ -58,8 +60,9 @@ therefore often encounter this warning:

``` r
deliver(url = "google.com")
#> Warning in deliver.default(u, ...): No method for www.google.com yet. Url
#> ignored.
#> 1 links from 1 domains unshortened.
#> Warning in deliver.default(u, verbose = verbose, ...): No method for
#> www.google.com yet. Url ignored.
```

If you enter a vector of multiple URLs, the unsupported ones will be
@@ -71,10 +74,10 @@ column will be different from `200` and contain `NA`s.

Every webscraper should retrieve a `tibble` with the following format:

| url | expanded\_url | domain | status | datetime | headline | author | text | misc |
|:------------------------------------|:--------------|:-----------|:-----------------|:---------------------|:-------------|:-----------|:--------------|:--------------------------------------------------------------------------|
| character | character | character | integer | as.POSIXct | character | character | character | list |
| the original url fed to the scraper | the full url | the domain | http status code | publication datetime | the headline | the author | the full text | all other information that can be consistently found on a specific outlet |
| url | expanded\_url | domain | status | datetime | headline | author | text | misc |
|:---------------------------|:--------------|:-----------|:-----------------|:---------------------|:-------------|:-----------|:--------------|:---------------------------|
| character | character | character | integer | as.POSIXct | character | character | character | list |
| the original url fed to t… | the full url | the domain | http status code | publication datetime | the headline | the author | the full text | all other information tha… |
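A conforming scraper return value, sketched as a skeleton with illustrative values:

``` r
tibble::tibble(
  url          = "https://tinyurl.com/386e98k5",
  expanded_url = "https://www.theguardian.com/example",
  domain       = "www.theguardian.com",
  status       = 200L,
  datetime     = as.POSIXct("2021-07-12 12:00:13", tz = "UTC"),
  headline     = "example headline",
  author       = "example author",
  text         = "the full article text",
  misc         = list(NULL)   # list column for outlet-specific extras
)
```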

Since some outlets will give you additional information, the `misc`
column was included so these can be retained. If you have a scraper you
@@ -84,12 +87,12 @@ it via a pull request.

## Available Scrapers

| domain | status | author | note |
|:---------------------|:-------|:-------------------|:-----------------------------------------------------|
| theguardian.com | Broken | Johannes B. Gruber | [\#1](https://github.com/JBGruber/paperboy/issues/1) |
| huffingtonpost.co.uk | Broken | Johannes B. Gruber | [\#1](https://github.com/JBGruber/paperboy/issues/1) |
| buzzfeed.com | Broken | Johannes B. Gruber | [\#1](https://github.com/JBGruber/paperboy/issues/1) |
| forbes.com | Broken | Johannes B. Gruber | [\#1](https://github.com/JBGruber/paperboy/issues/1) |
| domain | status | author | note |
|:---------------------|:-------|:-------------------|:--------------------------------|
| theguardian.com | Broken | Johannes B. Gruber | \[\#1\](<https://github.com/J> |
| huffingtonpost.co.uk | Broken | Johannes B. Gruber | \[\#1\](<https://github.com/J> |
| buzzfeed.com | Broken | Johannes B. Gruber | \[\#1\](<https://github.com/J> |
| forbes.com | Broken | Johannes B. Gruber | \[\#1\](<https://github.com/J> |

- **Gold**: Runs without any issues
- **Silver**: Runs with some issues