From 03ba4ddd6846cf2a62386029c96206b3cebdfc8b Mon Sep 17 00:00:00 2001
From: Martin Westgate <martinjwestgate@gmail.com>
Date: Tue, 4 Jul 2023 15:54:53 +1000
Subject: [PATCH] Improve tidyverse compatibility

- swap from `data.frame` to `tibble` format throughout package
- decommission `class() ==` syntax in favor of `inherits()` #22
- make `merge_columns()` a shell for `dplyr::bind_rows()`
---
 DESCRIPTION                 |  3 +-
 NAMESPACE                   |  2 ++
 R/bibliography_functions.R  |  2 +-
 R/clean_functions.R         | 17 +++++++++++
 R/deduplication_functions.R |  2 +-
 R/format_citation.R         |  2 +-
 R/generic_data_functions.R  | 61 ++++++++-----------------------------
 R/read_refs.R               |  3 +-
 tests/testthat/test-clean.R | 14 +++++++++
 tests/testthat/test-read.R  |  2 ++
 10 files changed, 55 insertions(+), 53 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 6ecee29..4bfee5e 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -26,7 +26,8 @@ Description: A critical first step in systematic literature reviews
 Depends: R (>= 3.5.0)
 Imports:
     rlang,
-    stringdist
+    stringdist,
+    tibble
 Suggests: 
     knitr, 
     rmarkdown, 
diff --git a/NAMESPACE b/NAMESPACE
index d38cb28..89aafe6 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -45,6 +45,8 @@ export(string_soundex)
 export(write_bib)
 export(write_refs)
 export(write_ris)
+importFrom(dplyr,bind_rows)
 importFrom(rlang,abort)
 importFrom(rlang,warn)
 importFrom(stringdist,stringdist)
+importFrom(tibble,tibble)
diff --git a/R/bibliography_functions.R b/R/bibliography_functions.R
index 1aaf876..85e61ac 100644
--- a/R/bibliography_functions.R
+++ b/R/bibliography_functions.R
@@ -162,7 +162,7 @@ as.data.frame.bibliography <- function(x, ...){
 #' @export
 as.bibliography <- function(x, ...){
 
-  if(class(x) != "data.frame"){
+  if(!inherits(x, "data.frame")){
     abort("as.bibliography can only be called for objects of class 'data.frame'")
   }
 
diff --git a/R/clean_functions.R b/R/clean_functions.R
index 64d06b6..737c701 100644
--- a/R/clean_functions.R
+++ b/R/clean_functions.R
@@ -12,6 +12,7 @@ clean_df <- function(data){
   if(any(colnames(data) == "author")){
     data$author <- clean_authors(data$author)
   }
+  data <- remove_factors(data)
   return(data)
 }
 
@@ -50,3 +51,19 @@ clean_colnames <- function(
   x <- gsub(" ", "_", x)
   return(x)
 }
+
+#' Remove factors from an object
+#'
+#' Internal functions called by `clean_df()`:
+#' @description This function converts factors to characters to avoid errors with
+#' levels.
+#' @param z A data.frame
+#' @return Returns the input data.frame with all factors converted to character.
+#' @noRd
+#' @keywords Internal
+remove_factors <- function(z){
+  z[] <- lapply(z, function(x){
+    if(is.factor(x)){as.character(x)}else{x}
+  })
+  return(z)
+}
diff --git a/R/deduplication_functions.R b/R/deduplication_functions.R
index 22ab8e7..3f41416 100644
--- a/R/deduplication_functions.R
+++ b/R/deduplication_functions.R
@@ -41,7 +41,7 @@ find_duplicates <- function(
   if(inherits(data, "data.frame")){
     abort("'data' must be a character vector, not a data.frame")
   }
-  if(class(data) != "character"){
+  if(!inherits(data, "character")){
     data <- as.character(data)
   }
 
diff --git a/R/format_citation.R b/R/format_citation.R
index baffb10..332ebde 100644
--- a/R/format_citation.R
+++ b/R/format_citation.R
@@ -30,7 +30,7 @@ format_citation <- function(
     abort("format_citation expects input data to be an object of class data.frame, bibliography, or list")
   }
 
-  if(class(data)!="data.frame"){
+  if(!inherits(data, "data.frame")){
     data <- as.data.frame(data)
   }
 
diff --git a/R/generic_data_functions.R b/R/generic_data_functions.R
index 110e567..506c917 100644
--- a/R/generic_data_functions.R
+++ b/R/generic_data_functions.R
@@ -32,13 +32,15 @@ match_columns <- function(df){
 
 #' Bind two or more data frames with different columns
 #'
-#' @description Takes two or more data.frames with different column names or
-#' different column orders and binds them to a single data.frame.
-#' NOTE: Should be possible to replace this with `dplyr::bind_rows()`
+#' @description Takes two or more `data.frames` with different column names or
+#' different column orders and binds them to a single `data.frame.` This
+#' function is maintained for backwards compatibility, but it is synonymous with
+#' `dplyr::bind_rows()` and will be depracated in future.
 #' @param x Either a data.frame or a list of data.frames.
 #' @param y A data.frame, optional if x is a list.
 #' @return Returns a single data.frame with all the input data frames merged.
 #' @example inst/examples/merge_columns.R
+#' @importFrom dplyr bind_rows
 #' @importFrom rlang abort
 #' @export
 merge_columns <- function(
@@ -48,57 +50,20 @@ merge_columns <- function(
   if(missing(x)){
     abort("object x is missing with no default")
   }
-
-  if(!any(c("data.frame", "list") == class(x))){
+  if(!(inherits(x, "data.frame") | inherits(x, "list"))){
     abort("object x must be either a data.frame or a list")
   }
-
-  if(class(x) == "data.frame"){
+  if(inherits(x, "data.frame")){
     if(missing(y)){
-      abort("If x is a data.frame, then y must be supplied")
+      return(x)
+      # abort("If x is a data.frame, then y must be supplied")
+    }else{
+      x <- list(x, y)
     }
-    x <- list(x, y)
   }else{ # i.e. for lists
-    if(!all(unlist(lapply(x, class)) == "data.frame")){
+    if(!all(unlist(lapply(x, function(a){inherits(a, "data.frame")})))){
       abort("x must only contain data.frames")
     }
   }
-
-  x <- lapply(x, remove_factors)
-
-  col_names_all <- unique(unlist(lapply(x, colnames)))
-
-  result_list <- lapply(x, function(a, cn){
-    missing_names <- !(cn %in% colnames(a))
-    if(any(missing_names)){
-      new_names <- cn[missing_names]
-      result <- data.frame(
-        c(a, sapply(new_names, function(b){NA})),
-        stringsAsFactors = FALSE)
-      return(result[, cn])
-    }else{
-      return(a[, cn])
-    }
-  },
-  cn = col_names_all
-  )
-
-  return(do.call(rbind, result_list))
-
-}
-
-#' Remove factors from an object
-#'
-#' Internal functions called by merge_columns:
-#' @description This function converts factors to characters to avoid errors with
-#' levels.
-#' @param z A data.frame
-#' @return Returns the input data.frame with all factors converted to character.
-#' @noRd
-#' @keywords Internal
-remove_factors <- function(z){
-  z[] <- lapply(z, function(x){
-    if(is.factor(x)){as.character(x)}else{x}
-  })
-  return(z)
+  bind_rows(x)
 }
diff --git a/R/read_refs.R b/R/read_refs.R
index 47af8cd..f9f211c 100644
--- a/R/read_refs.R
+++ b/R/read_refs.R
@@ -95,6 +95,7 @@ read_refs <- function(
 #' @return Returns a data.frame or list of assembled search results.
 #' @importFrom rlang abort
 #' @importFrom rlang warn
+#' @importFrom tibble tibble
 #' @noRd
 #' @keywords Internal
 read_ref <- function(
@@ -141,7 +142,7 @@ read_ref <- function(
     }else{
       if(return_df){df <- as.data.frame.bibliography(df)}
     }
-    if(inherits(df, "data.frame")){df <- clean_df(df)}
+    if(inherits(df, "data.frame")){df <- tibble(clean_df(df))}
     if(verbose){cat("done\n")}
     return(df)
 
diff --git a/tests/testthat/test-clean.R b/tests/testthat/test-clean.R
index 7b9b0dd..26bad00 100644
--- a/tests/testthat/test-clean.R
+++ b/tests/testthat/test-clean.R
@@ -15,3 +15,17 @@ test_that("clean_colnames() works", {
   expect_false(any(duplicated(cleaned)))
   expect_false(any(grepl("^(X|Y|Z)\\.+", cleaned)))
 })
+
+test_that("clean_df() cleans authors, colnames and factors", {
+  test_df <- data.frame(
+    authors = c("Haddaway, N.R., A. Feirman AND M.J. Grainger", "Some authors"),
+    "..misc." = c("text", "text2"),
+    JOURNAL = as.factor(c("A journal", "Another journal"))
+  )
+  result <- clean_df(test_df)
+  expect_false(any(grepl("AND", result$authors)))
+  test_cols <- colnames(result)
+  expect_false(any(grepl("[[:punct::]]", test_cols)))
+  expect_equal(tolower(test_cols), test_cols)
+  expect_false(any(unlist(lapply(result, is.factor))))
+})
diff --git a/tests/testthat/test-read.R b/tests/testthat/test-read.R
index 47088db..6ccbaa1 100644
--- a/tests/testthat/test-read.R
+++ b/tests/testthat/test-read.R
@@ -2,6 +2,7 @@ test_that("read_ref() works for simple imports", {
   df <- read_refs("testdata/eviatlas.txt",
                   return_df = TRUE,
                   verbose = FALSE)
+  expect_true(inherits(df, c("tbl", "data.frame")))
   expect_equal(nrow(df), 1)
   expect_true(any(grep("EviAtlas", df[1, ])))
 })
@@ -12,6 +13,7 @@ test_that("read_refs() works for simple imports", {
   df <- read_refs(testfiles,
                   return_df = TRUE,
                   verbose = FALSE)
+  expect_true(inherits(df, c("tbl", "data.frame")))
   expect_equal(nrow(df), 4)
   expect_true(any(grep("EviAtlas", df[1, ])))
   expect_true(any(grep("litsearchr", df[2, ])))