From 03ba4ddd6846cf2a62386029c96206b3cebdfc8b Mon Sep 17 00:00:00 2001 From: Martin Westgate Date: Tue, 4 Jul 2023 15:54:53 +1000 Subject: [PATCH] Improve tidyverse compatibility - swap from `data.frame` to `tibble` format throughout package - decommission `class() ==` syntax in favor of `inherits()` #22 - make `merge_columns()` a shell for `dplyr::bind_rows()` --- DESCRIPTION | 3 +- NAMESPACE | 2 ++ R/bibliography_functions.R | 2 +- R/clean_functions.R | 17 +++++++++++ R/deduplication_functions.R | 2 +- R/format_citation.R | 2 +- R/generic_data_functions.R | 61 ++++++++----------------------------- R/read_refs.R | 3 +- tests/testthat/test-clean.R | 14 +++++++++ tests/testthat/test-read.R | 2 ++ 10 files changed, 55 insertions(+), 53 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 6ecee29..4bfee5e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,7 +26,8 @@ Description: A critical first step in systematic literature reviews Depends: R (>= 3.5.0) Imports: rlang, - stringdist + stringdist, + tibble Suggests: knitr, rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index d38cb28..89aafe6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -45,6 +45,8 @@ export(string_soundex) export(write_bib) export(write_refs) export(write_ris) +importFrom(dplyr,bind_rows) importFrom(rlang,abort) importFrom(rlang,warn) importFrom(stringdist,stringdist) +importFrom(tibble,tibble) diff --git a/R/bibliography_functions.R b/R/bibliography_functions.R index 1aaf876..85e61ac 100644 --- a/R/bibliography_functions.R +++ b/R/bibliography_functions.R @@ -162,7 +162,7 @@ as.data.frame.bibliography <- function(x, ...){ #' @export as.bibliography <- function(x, ...){ - if(class(x) != "data.frame"){ + if(!inherits(x, "data.frame")){ abort("as.bibliography can only be called for objects of class 'data.frame'") } diff --git a/R/clean_functions.R b/R/clean_functions.R index 64d06b6..737c701 100644 --- a/R/clean_functions.R +++ b/R/clean_functions.R @@ -12,6 +12,7 @@ clean_df <- function(data){ if(any(colnames(data) == "author")){ data$author <- clean_authors(data$author) } + data <- remove_factors(data) return(data) } @@ -50,3 +51,19 @@ clean_colnames <- function( x <- gsub(" ", "_", x) return(x) } + +#' Remove factors from an object +#' +#' Internal functions called by `clean_df()`: +#' @description This function converts factors to characters to avoid errors with +#' levels. +#' @param z A data.frame +#' @return Returns the input data.frame with all factors converted to character. +#' @noRd +#' @keywords Internal +remove_factors <- function(z){ + z[] <- lapply(z, function(x){ + if(is.factor(x)){as.character(x)}else{x} + }) + return(z) +} diff --git a/R/deduplication_functions.R b/R/deduplication_functions.R index 22ab8e7..3f41416 100644 --- a/R/deduplication_functions.R +++ b/R/deduplication_functions.R @@ -41,7 +41,7 @@ find_duplicates <- function( if(inherits(data, "data.frame")){ abort("'data' must be a character vector, not a data.frame") } - if(class(data) != "character"){ + if(!inherits(data, "character")){ data <- as.character(data) } diff --git a/R/format_citation.R b/R/format_citation.R index baffb10..332ebde 100644 --- a/R/format_citation.R +++ b/R/format_citation.R @@ -30,7 +30,7 @@ format_citation <- function( abort("format_citation expects input data to be an object of class data.frame, bibliography, or list") } - if(class(data)!="data.frame"){ + if(!inherits(data, "data.frame")){ data <- as.data.frame(data) } diff --git a/R/generic_data_functions.R b/R/generic_data_functions.R index 110e567..506c917 100644 --- a/R/generic_data_functions.R +++ b/R/generic_data_functions.R @@ -32,13 +32,15 @@ match_columns <- function(df){ #' Bind two or more data frames with different columns #' -#' @description Takes two or more data.frames with different column names or -#' different column orders and binds them to a single data.frame. -#' NOTE: Should be possible to replace this with `dplyr::bind_rows()` +#' @description Takes two or more `data.frames` with different column names or +#' different column orders and binds them to a single `data.frame.` This +#' function is maintained for backwards compatibility, but it is synonymous with +#' `dplyr::bind_rows()` and will be depracated in future. #' @param x Either a data.frame or a list of data.frames. #' @param y A data.frame, optional if x is a list. #' @return Returns a single data.frame with all the input data frames merged. #' @example inst/examples/merge_columns.R +#' @importFrom dplyr bind_rows #' @importFrom rlang abort #' @export merge_columns <- function( @@ -48,57 +50,20 @@ merge_columns <- function( if(missing(x)){ abort("object x is missing with no default") } - - if(!any(c("data.frame", "list") == class(x))){ + if(!(inherits(x, "data.frame") | inherits(x, "list"))){ abort("object x must be either a data.frame or a list") } - - if(class(x) == "data.frame"){ + if(inherits(x, "data.frame")){ if(missing(y)){ - abort("If x is a data.frame, then y must be supplied") + return(x) + # abort("If x is a data.frame, then y must be supplied") + }else{ + x <- list(x, y) } - x <- list(x, y) }else{ # i.e. for lists - if(!all(unlist(lapply(x, class)) == "data.frame")){ + if(!all(unlist(lapply(x, function(a){inherits(a, "data.frame")})))){ abort("x must only contain data.frames") } } - - x <- lapply(x, remove_factors) - - col_names_all <- unique(unlist(lapply(x, colnames))) - - result_list <- lapply(x, function(a, cn){ - missing_names <- !(cn %in% colnames(a)) - if(any(missing_names)){ - new_names <- cn[missing_names] - result <- data.frame( - c(a, sapply(new_names, function(b){NA})), - stringsAsFactors = FALSE) - return(result[, cn]) - }else{ - return(a[, cn]) - } - }, - cn = col_names_all - ) - - return(do.call(rbind, result_list)) - -} - -#' Remove factors from an object -#' -#' Internal functions called by merge_columns: -#' @description This function converts factors to characters to avoid errors with -#' levels. -#' @param z A data.frame -#' @return Returns the input data.frame with all factors converted to character. -#' @noRd -#' @keywords Internal -remove_factors <- function(z){ - z[] <- lapply(z, function(x){ - if(is.factor(x)){as.character(x)}else{x} - }) - return(z) + bind_rows(x) } diff --git a/R/read_refs.R b/R/read_refs.R index 47af8cd..f9f211c 100644 --- a/R/read_refs.R +++ b/R/read_refs.R @@ -95,6 +95,7 @@ read_refs <- function( #' @return Returns a data.frame or list of assembled search results. #' @importFrom rlang abort #' @importFrom rlang warn +#' @importFrom tibble tibble #' @noRd #' @keywords Internal read_ref <- function( @@ -141,7 +142,7 @@ read_ref <- function( }else{ if(return_df){df <- as.data.frame.bibliography(df)} } - if(inherits(df, "data.frame")){df <- clean_df(df)} + if(inherits(df, "data.frame")){df <- tibble(clean_df(df))} if(verbose){cat("done\n")} return(df) diff --git a/tests/testthat/test-clean.R b/tests/testthat/test-clean.R index 7b9b0dd..26bad00 100644 --- a/tests/testthat/test-clean.R +++ b/tests/testthat/test-clean.R @@ -15,3 +15,17 @@ test_that("clean_colnames() works", { expect_false(any(duplicated(cleaned))) expect_false(any(grepl("^(X|Y|Z)\\.+", cleaned))) }) + +test_that("clean_df() cleans authors, colnames and factors", { + test_df <- data.frame( + authors = c("Haddaway, N.R., A. Feirman AND M.J. Grainger", "Some authors"), + "..misc." = c("text", "text2"), + JOURNAL = as.factor(c("A journal", "Another journal")) + ) + result <- clean_df(test_df) + expect_false(any(grepl("AND", result$authors))) + test_cols <- colnames(result) + expect_false(any(grepl("[[:punct::]]", test_cols))) + expect_equal(tolower(test_cols), test_cols) + expect_false(any(unlist(lapply(result, is.factor)))) +}) diff --git a/tests/testthat/test-read.R b/tests/testthat/test-read.R index 47088db..6ccbaa1 100644 --- a/tests/testthat/test-read.R +++ b/tests/testthat/test-read.R @@ -2,6 +2,7 @@ test_that("read_ref() works for simple imports", { df <- read_refs("testdata/eviatlas.txt", return_df = TRUE, verbose = FALSE) + expect_true(inherits(df, c("tbl", "data.frame"))) expect_equal(nrow(df), 1) expect_true(any(grep("EviAtlas", df[1, ]))) }) @@ -12,6 +13,7 @@ test_that("read_refs() works for simple imports", { df <- read_refs(testfiles, return_df = TRUE, verbose = FALSE) + expect_true(inherits(df, c("tbl", "data.frame"))) expect_equal(nrow(df), 4) expect_true(any(grep("EviAtlas", df[1, ]))) expect_true(any(grep("litsearchr", df[2, ])))