Skip to content

Commit

Permalink
Merge pull request #470 from massimoaria/develop
Browse files Browse the repository at this point in the history
Merge Collections form different DBs
  • Loading branch information
massimoaria authored Jun 14, 2024
2 parents a45ad5a + 5d8abdd commit f351bd4
Show file tree
Hide file tree
Showing 8 changed files with 316 additions and 57 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ importFrom(dplyr,any_of)
importFrom(dplyr,arrange)
importFrom(dplyr,as_tibble)
importFrom(dplyr,between)
importFrom(dplyr,bind_cols)
importFrom(dplyr,bind_rows)
importFrom(dplyr,count)
importFrom(dplyr,cummean)
Expand Down
2 changes: 1 addition & 1 deletion R/csvOA2df.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
utils::globalVariables(c("all_of", "corr", "DI", "id_oa","RP","UN","AU_ID","corresponding_author_ids"))
utils::globalVariables(c("all_of", "corr", "DI", "C1","id_oa","RP","UN","AU_ID","corresponding_author_ids"))

csvOA2df <- function(file){
options(readr.num_columns = 0)
Expand Down
108 changes: 63 additions & 45 deletions R/mergeDbSources.R
Original file line number Diff line number Diff line change
@@ -1,25 +1,27 @@
#' Merge bibliographic data frames from SCOPUS and WoS
utils::globalVariables(c("num"))
#' Merge bibliographic data frames from supported bibliogtraphic DBs
#'
#' Merge bibliographic data frames from different databases (WoS and SCOPUS) into a single one.
#' Merge bibliographic data frames from different databases (WoS,SCOPUS, Lens, Openalex, etc-) into a single one.
#'
#' bibliographic data frames are obtained by the converting function \code{\link{convert2df}}.
#' The function merges data frames identifying common tag fields and duplicated records.
#'
#' @param ... are the bibliographic data frames to merge.
#' @param remove.duplicated is logical. If TRUE duplicated documents will be deleted from the bibliographic collection.
#' @param verbose is logical. If TRUE, information on duplicate documents is printed on the screen.
#' @return the value returned from \code{mergeDbSources} is a bibliographic data frame.
#'
#'
#' @examples
#'
#'
#' data(isiCollection, package = "bibliometrixData")
#'
#' data(scopusCollection, package = "bibliometrixData")
#'
#' M <- mergeDbSources(isiCollection, scopusCollection, remove.duplicated=TRUE)
#'
#' dim(M)
#'
#'
#'
#' @seealso \code{\link{convert2df}} to import and convert an ISI or SCOPUS Export file in a bibliographic data frame.
#' @seealso \code{\link{biblioAnalysis}} function for bibliometric analysis.
Expand All @@ -29,56 +31,72 @@
#' @export


mergeDbSources <- function(...,remove.duplicated=TRUE){
mergeDbSources <- function(...,remove.duplicated=TRUE, verbose=TRUE){

index <- NULL

###
L <- list(...)
mc <- match.call(expand.dots = TRUE)

n=length(L)
if (length(mc)>3){
M <- dplyr::bind_rows(list(...))
}else{
M <- dplyr::bind_rows(...)
}

Tags=names(L[[1]])

## identification of common tags
for (j in 2:n){
Tags=intersect(Tags,names(L[[j]]))
}
#####
M=data.frame(matrix(NA,1,length(Tags)))
names(M)=Tags
for (j in 1:n){
L[[j]]=L[[j]][,Tags]
dbLabels <- data.frame(DB = toupper(c("isi","scopus","openalex","lens","dimensions","pubmed","cochrane")),
num = c(1,2,3,4,5,6,7))
# order by db
M <- M %>%
left_join(dbLabels, by = "DB") %>%
arrange(num) %>%
select(-num) %>%
rename("CR_raw" = "CR") %>%
mutate(CR = "NA")


if (isTRUE(remove.duplicated)){
# remove by DOI
if ("DI" %in% names(M)){
index <- which(duplicated(M$DI) & !is.na(M$DI))
if (length(index)>0) M <- M[-index,]
}

M=rbind(M,L[[j]])
# remove by title
if ("TI" %in% names(M)){
TI <- gsub("[^[:alnum:] ]","",M$TI)
TI <- gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", TI, perl=TRUE)
d <- duplicated(paste(TI," ",M$PY))
if (isTRUE(verbose)) cat("\n",sum(d)+length(index),"duplicated documents have been removed\n")
M <- M[!d,]
}
}

## author data cleaning
if ("AU" %in% Tags){
M$AU=gsub(","," ",M$AU)
AUlist=strsplit(M$AU,";")
AU=lapply(AUlist,function(l){
l=trim(l)
name=strsplit(l," ")
lastname=unlist(lapply(name,function(ln){ln[1]}))
firstname=lapply(name,function(ln){
f=paste(substr(ln[-1],1,1),collapse=" ")
})
AU=paste(lastname,unlist(firstname),sep=" ",collapse=";")
return(AU)
})
M$AU=unlist(AU)
if (length(unique(M$DB))>1){
M$DB_Original <- M$DB
M$DB <- "ISI"

## author data cleaning
if ("AU" %in% names(M)){
M$AU <- gsub(","," ",M$AU)
AUlist <- strsplit(M$AU,";")
AU <- lapply(AUlist,function(l){
l <- trim(l)
name <- strsplit(l," ")
lastname <- unlist(lapply(name,function(ln){ln[1]}))
firstname <- lapply(name,function(ln){
f <- paste(substr(ln[-1],1,1),collapse=" ")
})
AU <- paste(lastname,unlist(firstname),sep=" ",collapse=";")
return(AU)
})
M$AU <- unlist(AU)

}
}
M=M[-1,]
M$DB="ISI"

if (isTRUE(remove.duplicated)){
M$TI=gsub("[^[:alnum:] ]","",M$TI)
M$TI=gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", M$TI, perl=TRUE)
d=duplicated(M$TI)
cat("\n",sum(d),"duplicated documents have been removed\n")
M=M[!d,]
}
M <- metaTagExtraction(M, "SR")
row.names(M) <- M$SR

class(M) <- c("bibliometrixDB", "data.frame")
M$SR <- row.names(M)
return(M)
}
1 change: 1 addition & 0 deletions R/zzz.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#' @import openalexR
#' @import ca
#' @importFrom dplyr %>%
#' @importFrom dplyr bind_cols
#' @importFrom dplyr across
#' @importFrom dplyr row_number
#' @importFrom dplyr tibble
Expand Down
125 changes: 122 additions & 3 deletions inst/biblioshiny/server.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ To ensure the functionality of Biblioshiny,
values$logo <- logo
values$logoGrid <- grid::rasterGrob(logo,interpolate = TRUE)
values$out <- NULL
values$loadMenu <- NA

### setting values
values$dpi <- 300
Expand Down Expand Up @@ -161,7 +162,18 @@ To ensure the functionality of Biblioshiny,
})

observeEvent(values$missTags, {
updateTabItems(session, "sidebarmenu", "loadData")
switch(values$loadMenu,
"load"={
updateTabItems(session, "sidebarmenu", "loadData")
},
"merge"={
updateTabItems(session, "sidebarmenu", "mergeData")
})
values$loadMenu <- NA
})

observeEvent(input$applyMerge, {
updateTabItems(session, "sidebarmenu", "mergeData")
})

## Load Menu ----
Expand Down Expand Up @@ -418,7 +430,9 @@ To ensure the functionality of Biblioshiny,
values$Histfield = "NA"
values$results = list("NA")
if (ncol(values$M)>1){values$rest_sidebar <- TRUE}
if (ncol(values$M)>1){showModal(missingModal(session))}
if (ncol(values$M)>1){
values$loadMenu <- "load"
showModal(missingModal(session))}
})

output$contents <- DT::renderDT({
Expand All @@ -440,6 +454,52 @@ To ensure the functionality of Biblioshiny,
columnShort=NULL, columnSmall=NULL, round=2, title="", button=FALSE, escape=FALSE, selection=FALSE, scrollX=TRUE)
})


## Merge Menu ----
DATAmerging<- eventReactive(input$applyMerge,{

inFile <- input$fileMerge

if (!is.null(inFile)){
M <- merge_files(inFile)
} else if (is.null(inFile)) {return(NULL)}

values = initial(values)
## remove not useful columns
ind <- which(substr(names(M),1,2)=="X.")
if (length(ind)>0) M <- M[,-ind]
##

values$M <- M
values$Morig = M
values$nMerge <- attr(M,"nMerge")
values$Histfield = "NA"
values$results = list("NA")
if (ncol(values$M)>1){values$rest_sidebar <- TRUE}
if (ncol(values$M)>1){
values$loadMenu <- "merge"
showModal(missingModal(session))}
})

output$contentsMerge <- DT::renderDT({
DATAmerging()
MData = as.data.frame(apply(values$M, 2, function(x) {
substring(x, 1, 150)
}))
MData$DOI <-
paste0(
'<a href=\"https://doi.org/',
MData$DI,
'\" target=\"_blank\">',
MData$DI,
'</a>'
)
nome = c("DOI", names(MData)[-length(names(MData))])
MData = MData[nome]
DTformat(MData, nrow=3, filename="Table", pagelength=TRUE, left=NULL, right=NULL, numeric=NULL, dom=TRUE, size='70%', filter="top",
columnShort=NULL, columnSmall=NULL, round=2, title="", button=FALSE, escape=FALSE, selection=FALSE, scrollX=TRUE)
})

### Missing Data in Metadata ----
output$missingDataTable <- DT::renderDT({
values$missingdf <- df <- missingData(values$M)$mandatoryTags
Expand Down Expand Up @@ -523,10 +583,22 @@ To ensure the functionality of Biblioshiny,

output$missingTitle <- renderUI({
ndocs <- nrow(values$M)
txt1 <- paste0("Completeness of bibliographic metadata - ", strong(ndocs)," documents from ", strong(firstup(values$M$DB[1])))
if ("DB_Original" %in% names(values$M)){
DB <- paste0(length(unique(values$M$DB_Original))," DBs")
txt1 <- paste0("Completeness of metadata -- ", strong(ndocs)," docs merged from ", DB)
txt2 <- paste0("Original size ",strong(values$nMerge), " docs -- Deleted ", strong(values$nMerge-ndocs), " duplicated docs")
} else {
DB <- firstup(values$M$DB[1])
txt1 <- paste0("Completeness of metadata -- ", strong(ndocs)," docs from ", strong(DB))
txt2 <- ""
}


tagList(
div(
h3(HTML(txt1)),
br(),
h4(HTML(txt2)),
style="text-align:center")
)
})
Expand Down Expand Up @@ -618,6 +690,53 @@ To ensure the functionality of Biblioshiny,
contentType = input$save_file
)

output$collection.saveMerge <- downloadHandler(
filename = function() {
paste("Bibliometrix-Export-File-", Sys.Date(), ".",input$save_fileMerge, sep="")
},
content <- function(file) {
tr <- FALSE
if ("CR" %in% names(values$M)) tr <- (sum(nchar(values$M$CR)>32767, na.rm=TRUE))>0

if (tr & input$save_file=="xlsx"){
show_alert(
text = tags$span(
tags$h4("Some documents have too long a list of references that cannot be saved in excel (>32767 characters).",
style = "color: firebrick;"),
tags$br(),
tags$h4("Data in the column CR could be truncated.",
style = "color: firebrick;")
),
#text = "Some documents have too long a list of references that cannot be saved in excel (>32767 characters).\nData in the column CR could be truncated",
title = "Please save the collection using the 'RData' format",
type = "warning",
width = "50%", ##NEW ----
closeOnEsc = TRUE,
closeOnClickOutside = TRUE,
html = FALSE,
showConfirmButton = TRUE,
showCancelButton = FALSE,
btn_labels = "OK",
btn_colors = "#913333",
timer = 0,
imageUrl = "",
animation = TRUE
)
suppressWarnings(openxlsx::write.xlsx(values$M, file=file))
} else {
switch(input$save_fileMerge,
xlsx={
suppressWarnings(openxlsx::write.xlsx(values$M, file=file))
},
RData={
M=values$M
save(M, file=file)
})
}
},
contentType = input$save_fileMerge
)

output$collection.save_api <- downloadHandler(
filename = function() {

Expand Down
Loading

0 comments on commit f351bd4

Please sign in to comment.