Merge pull request #470 from massimoaria/develop

Merge Collections form different DBs
massimoaria · Jun 14, 2024 · f351bd4 · f351bd4
2 parents a45ad5a + 5d8abdd
commit f351bd4
Show file tree

Hide file tree

Showing 8 changed files with 316 additions and 57 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -179,6 +179,7 @@ importFrom(dplyr,any_of)
 importFrom(dplyr,arrange)
 importFrom(dplyr,as_tibble)
 importFrom(dplyr,between)
+importFrom(dplyr,bind_cols)
 importFrom(dplyr,bind_rows)
 importFrom(dplyr,count)
 importFrom(dplyr,cummean)

diff --git a/R/csvOA2df.R b/R/csvOA2df.R
@@ -1,4 +1,4 @@
-utils::globalVariables(c("all_of", "corr", "DI", "id_oa","RP","UN","AU_ID","corresponding_author_ids"))
+utils::globalVariables(c("all_of", "corr", "DI", "C1","id_oa","RP","UN","AU_ID","corresponding_author_ids"))
 
 csvOA2df <- function(file){
   options(readr.num_columns = 0)

diff --git a/R/mergeDbSources.R b/R/mergeDbSources.R
@@ -1,25 +1,27 @@
-#' Merge bibliographic data frames from SCOPUS and WoS  
+utils::globalVariables(c("num"))
+#' Merge bibliographic data frames from supported bibliogtraphic DBs  
 #'
-#' Merge bibliographic data frames from different databases (WoS and SCOPUS) into a single one.
+#' Merge bibliographic data frames from different databases (WoS,SCOPUS, Lens, Openalex, etc-) into a single one.
 #' 
 #' bibliographic data frames are obtained by the converting function \code{\link{convert2df}}. 
 #' The function merges data frames identifying common tag fields and duplicated records.
 #'
 #' @param ... are the bibliographic data frames to merge.
 #' @param remove.duplicated is logical. If TRUE duplicated documents will be deleted from the bibliographic collection.
+#' @param verbose is logical.  If TRUE, information on duplicate documents is printed on the screen.
 #' @return the value returned from \code{mergeDbSources} is a bibliographic data frame.
 #'
 #'
 #' @examples
-#'  
+#'
 #' data(isiCollection, package = "bibliometrixData")
 #' 
 #' data(scopusCollection, package = "bibliometrixData")
 #' 
 #' M <- mergeDbSources(isiCollection, scopusCollection, remove.duplicated=TRUE)
 #' 
 #' dim(M)
-#'
+#' 
 #'
 #' @seealso \code{\link{convert2df}} to import and convert an ISI or SCOPUS Export file in a bibliographic data frame.
 #' @seealso \code{\link{biblioAnalysis}} function for bibliometric analysis.
@@ -29,56 +31,72 @@
 #' @export
 
 
-mergeDbSources <- function(...,remove.duplicated=TRUE){
+mergeDbSources <- function(...,remove.duplicated=TRUE, verbose=TRUE){
+
+  index <- NULL
 
-###
-  L <- list(...)
+  mc <- match.call(expand.dots = TRUE)
 
-  n=length(L)
+  if (length(mc)>3){
+      M <- dplyr::bind_rows(list(...))
+    }else{
+      M <- dplyr::bind_rows(...)
+    }
 
-  Tags=names(L[[1]])
-
-  ## identification of common tags
-  for (j in 2:n){
-    Tags=intersect(Tags,names(L[[j]]))
-  }
-  #####
-  M=data.frame(matrix(NA,1,length(Tags)))
-  names(M)=Tags
-  for (j in 1:n){
-    L[[j]]=L[[j]][,Tags]
+   dbLabels <- data.frame(DB = toupper(c("isi","scopus","openalex","lens","dimensions","pubmed","cochrane")),
+                          num = c(1,2,3,4,5,6,7))
+   # order by db
+   M <- M %>% 
+     left_join(dbLabels, by = "DB") %>% 
+     arrange(num) %>% 
+     select(-num) %>% 
+     rename("CR_raw" = "CR") %>% 
+     mutate(CR = "NA")
+
+
+  if (isTRUE(remove.duplicated)){
+    # remove by DOI
+    if ("DI" %in% names(M)){
+      index <- which(duplicated(M$DI) & !is.na(M$DI))
+      if (length(index)>0) M <- M[-index,]
+    }
 
-    M=rbind(M,L[[j]])
+    # remove by title
+    if ("TI" %in% names(M)){
+      TI <- gsub("[^[:alnum:] ]","",M$TI)
+      TI <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", TI, perl=TRUE)
+      d <- duplicated(paste(TI," ",M$PY))
+      if (isTRUE(verbose)) cat("\n",sum(d)+length(index),"duplicated documents have been removed\n")
+      M <- M[!d,]
+    }
   }
 
-  ## author data cleaning
-  if ("AU" %in% Tags){
-    M$AU=gsub(","," ",M$AU)
-    AUlist=strsplit(M$AU,";")
-    AU=lapply(AUlist,function(l){
-      l=trim(l)
-      name=strsplit(l," ")
-      lastname=unlist(lapply(name,function(ln){ln[1]}))
-      firstname=lapply(name,function(ln){
-        f=paste(substr(ln[-1],1,1),collapse=" ")
-        })
-      AU=paste(lastname,unlist(firstname),sep=" ",collapse=";")
-      return(AU)
-    })
-    M$AU=unlist(AU)
+  if (length(unique(M$DB))>1){
+    M$DB_Original <- M$DB
+    M$DB <- "ISI"
 
+    ## author data cleaning
+    if ("AU" %in% names(M)){
+      M$AU <- gsub(","," ",M$AU)
+      AUlist <- strsplit(M$AU,";")
+      AU <- lapply(AUlist,function(l){
+        l <- trim(l)
+        name <- strsplit(l," ")
+        lastname <- unlist(lapply(name,function(ln){ln[1]}))
+        firstname <- lapply(name,function(ln){
+          f <- paste(substr(ln[-1],1,1),collapse=" ")
+        })
+        AU <- paste(lastname,unlist(firstname),sep=" ",collapse=";")
+        return(AU)
+      })
+      M$AU <- unlist(AU)
+
+    }
   }
-  M=M[-1,]
-  M$DB="ISI"
 
-  if (isTRUE(remove.duplicated)){
-    M$TI=gsub("[^[:alnum:] ]","",M$TI)
-    M$TI=gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", M$TI, perl=TRUE)
-    d=duplicated(M$TI)
-    cat("\n",sum(d),"duplicated documents have been removed\n")
-    M=M[!d,]
-  }
+  M <- metaTagExtraction(M, "SR")
+  row.names(M) <- M$SR
+
   class(M) <- c("bibliometrixDB", "data.frame")
-  M$SR <- row.names(M)
   return(M)
 }
diff --git a/R/zzz.R b/R/zzz.R
@@ -18,6 +18,7 @@
 #' @import openalexR
 #' @import ca
 #' @importFrom dplyr %>%
+#' @importFrom dplyr bind_cols
 #' @importFrom dplyr across
 #' @importFrom dplyr row_number
 #' @importFrom dplyr tibble

diff --git a/inst/biblioshiny/server.R b/inst/biblioshiny/server.R
@@ -56,6 +56,7 @@ To ensure the functionality of Biblioshiny,
   values$logo <- logo
   values$logoGrid <- grid::rasterGrob(logo,interpolate = TRUE)
   values$out <- NULL
+  values$loadMenu <- NA
 
   ### setting values
   values$dpi <- 300
@@ -161,7 +162,18 @@ To ensure the functionality of Biblioshiny,
   })
 
   observeEvent(values$missTags, {
-    updateTabItems(session, "sidebarmenu", "loadData")
+    switch(values$loadMenu,
+           "load"={
+             updateTabItems(session, "sidebarmenu", "loadData")
+           },
+           "merge"={
+             updateTabItems(session, "sidebarmenu", "mergeData")
+           })
+    values$loadMenu <- NA
+  })
+
+  observeEvent(input$applyMerge, {
+    updateTabItems(session, "sidebarmenu", "mergeData")
   })
 
   ## Load Menu ----
@@ -418,7 +430,9 @@ To ensure the functionality of Biblioshiny,
     values$Histfield = "NA"
     values$results = list("NA")
     if (ncol(values$M)>1){values$rest_sidebar <- TRUE}
-    if (ncol(values$M)>1){showModal(missingModal(session))}
+    if (ncol(values$M)>1){
+      values$loadMenu <- "load"
+      showModal(missingModal(session))}
   })
 
   output$contents <- DT::renderDT({
@@ -440,6 +454,52 @@ To ensure the functionality of Biblioshiny,
              columnShort=NULL, columnSmall=NULL, round=2, title="", button=FALSE, escape=FALSE, selection=FALSE, scrollX=TRUE)
   })
 
+
+  ## Merge Menu ----
+  DATAmerging<- eventReactive(input$applyMerge,{
+
+    inFile <- input$fileMerge
+
+    if (!is.null(inFile)){
+      M <- merge_files(inFile)
+    } else if (is.null(inFile)) {return(NULL)}
+
+    values = initial(values)
+    ## remove not useful columns
+    ind <- which(substr(names(M),1,2)=="X.")
+    if (length(ind)>0) M <- M[,-ind]
+    ##
+
+    values$M <- M
+    values$Morig = M
+    values$nMerge <- attr(M,"nMerge")
+    values$Histfield = "NA"
+    values$results = list("NA")
+    if (ncol(values$M)>1){values$rest_sidebar <- TRUE}
+    if (ncol(values$M)>1){
+      values$loadMenu <- "merge"
+      showModal(missingModal(session))}
+  })
+
+  output$contentsMerge <- DT::renderDT({
+    DATAmerging()   
+    MData = as.data.frame(apply(values$M, 2, function(x) {
+      substring(x, 1, 150)
+    }))
+    MData$DOI <-
+      paste0(
+        '<a href=\"https://doi.org/',
+        MData$DI,
+        '\" target=\"_blank\">',
+        MData$DI,
+        '</a>'
+      )
+    nome = c("DOI", names(MData)[-length(names(MData))])
+    MData = MData[nome]
+    DTformat(MData, nrow=3, filename="Table", pagelength=TRUE, left=NULL, right=NULL, numeric=NULL, dom=TRUE, size='70%', filter="top",
+             columnShort=NULL, columnSmall=NULL, round=2, title="", button=FALSE, escape=FALSE, selection=FALSE, scrollX=TRUE)
+  })
+
   ### Missing Data in Metadata ----
   output$missingDataTable <- DT::renderDT({
     values$missingdf <- df <- missingData(values$M)$mandatoryTags
@@ -523,10 +583,22 @@ To ensure the functionality of Biblioshiny,
 
   output$missingTitle <- renderUI({
     ndocs <- nrow(values$M)
-    txt1 <- paste0("Completeness of bibliographic metadata - ", strong(ndocs)," documents from ", strong(firstup(values$M$DB[1])))
+    if ("DB_Original" %in% names(values$M)){
+      DB <- paste0(length(unique(values$M$DB_Original))," DBs")
+      txt1 <- paste0("Completeness of metadata -- ", strong(ndocs)," docs merged from ", DB)
+      txt2 <- paste0("Original size ",strong(values$nMerge), " docs -- Deleted ", strong(values$nMerge-ndocs), " duplicated docs")
+    } else {
+      DB <- firstup(values$M$DB[1])
+      txt1 <- paste0("Completeness of metadata -- ", strong(ndocs)," docs from ", strong(DB))
+      txt2 <- ""
+    }
+
+
     tagList(
       div(
         h3(HTML(txt1)),
+        br(),
+        h4(HTML(txt2)),
         style="text-align:center")
     )
   })
@@ -618,6 +690,53 @@ To ensure the functionality of Biblioshiny,
     contentType = input$save_file
   )
 
+  output$collection.saveMerge <- downloadHandler(
+    filename = function() {
+      paste("Bibliometrix-Export-File-", Sys.Date(), ".",input$save_fileMerge, sep="")
+    },
+    content <- function(file) {
+      tr <- FALSE
+      if ("CR" %in% names(values$M)) tr <- (sum(nchar(values$M$CR)>32767, na.rm=TRUE))>0
+
+      if (tr & input$save_file=="xlsx"){
+        show_alert(
+          text = tags$span(
+            tags$h4("Some documents have too long a list of references that cannot be saved in excel (>32767 characters).",
+                    style = "color: firebrick;"),
+            tags$br(),
+            tags$h4("Data in the column CR could be truncated.",
+                    style = "color: firebrick;")
+          ),
+          #text = "Some documents have too long a list of references that cannot be saved in excel (>32767 characters).\nData in the column CR could be truncated",
+          title = "Please save the collection using the 'RData' format",
+          type = "warning",
+          width =  "50%", ##NEW ----
+          closeOnEsc = TRUE,
+          closeOnClickOutside = TRUE,
+          html = FALSE,
+          showConfirmButton = TRUE,
+          showCancelButton = FALSE,
+          btn_labels = "OK",
+          btn_colors = "#913333",
+          timer = 0,
+          imageUrl = "",
+          animation = TRUE
+        )
+        suppressWarnings(openxlsx::write.xlsx(values$M, file=file))
+      } else {
+        switch(input$save_fileMerge,
+               xlsx={
+                 suppressWarnings(openxlsx::write.xlsx(values$M, file=file))
+                 },
+               RData={
+                 M=values$M
+                 save(M, file=file)
+               })
+      }
+    },
+    contentType = input$save_fileMerge
+  )
+
   output$collection.save_api <- downloadHandler(
     filename = function() {