Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UTF-8 in/out #118

Merged
merged 14 commits into from
Nov 23, 2020
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ Collate:
'openxlsxCoerce.R'
'readWorkbook.R'
'sheet_data_class.R'
'utils.R'
'workbook_column_widths.R'
'workbook_read_workbook.R'
'workbook_write_data.R'
Expand Down
4 changes: 4 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ get_letters <- function() {
.Call(`_openxlsx_get_letters`)
}

markUTF8 <- function(x, clone) {
.Call(`_openxlsx_markUTF8`, x, clone)
}

loadworksheets <- function(wb, styleObjects, xmlFiles, is_chart_sheet) {
.Call(`_openxlsx_loadworksheets`, wb, styleObjects, xmlFiles, is_chart_sheet)
}
Expand Down
5 changes: 2 additions & 3 deletions R/WorkbookClass.R
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ Workbook$methods(
# The result is saved to a new chart xml file
newfl <- file.path(dirname(fl), newname)
charts[newname] <<- newfl
chart <- readLines(fl, warn = FALSE, encoding = "UTF-8")
chart <- readUTF8(fl)
chart <-
gsub(
stri_join("(?<=')", sheet_names[[clonedSheet]], "(?='!)"),
Expand Down Expand Up @@ -17825,8 +17825,7 @@ Workbook$methods(
Workbook$methods(
loadStyles = function(stylesXML) {
## Build style objects from the styles XML
stylesTxt <-
readLines(stylesXML, warn = FALSE, encoding = "UTF-8")
stylesTxt <- readUTF8(stylesXML)
stylesTxt <- removeHeadTag(stylesTxt)

## Indexed colours
Expand Down
26 changes: 13 additions & 13 deletions R/loadWorkbook.R
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {

## core
if (length(coreXML) == 1) {
coreXML <- paste(readLines(con = coreXML, encoding = "UTF-8", warn = FALSE), collapse = "")
coreXML <- paste(readUTF8(coreXML), collapse = "")
wb$core <- removeHeadTag(x = coreXML)
}

Expand All @@ -115,7 +115,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {
worksheet_rId_mapping <- NULL
workbookRelsXML <- xmlFiles[grepl("workbook.xml.rels$", xmlFiles, perl = TRUE)]
if (length(workbookRelsXML) > 0) {
workbookRelsXML <- paste(readLines(con = workbookRelsXML, encoding = "UTF-8", warn = FALSE), collapse = "")
workbookRelsXML <- paste(readUTF8(workbookRelsXML), collapse = "")
workbookRelsXML <- getChildlessNode(xml = workbookRelsXML, tag = "<Relationship ")
worksheet_rId_mapping <- workbookRelsXML[grepl("worksheets/sheet", workbookRelsXML, fixed = TRUE)]
}
Expand All @@ -142,7 +142,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {
## xl\
## xl\workbook
if (length(workbookXML) > 0) {
workbook <- readLines(workbookXML, warn = FALSE, encoding = "UTF-8")
workbook <- readUTF8(workbookXML)
workbook <- removeHeadTag(workbook)

sheets <- unlist(regmatches(workbook, gregexpr("(?<=<sheets>).*(?=</sheets>)", workbook, perl = TRUE)))
Expand Down Expand Up @@ -176,7 +176,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {
for (i in 1:length(sheetrId)) {
if (is_chart_sheet[i]) {
count <- 0
txt <- paste(readLines(chartSheetsXML[j], warn = FALSE, encoding = "UTF-8"), collapse = "")
txt <- paste(readUTF8(chartSheetsXML[j]), collapse = "")

zoom <- regmatches(txt, regexpr('(?<=zoomScale=")[0-9]+', txt, perl = TRUE))
if (length(zoom) == 0) {
Expand Down Expand Up @@ -235,7 +235,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {

## xl\sharedStrings
if (length(sharedStringsXML) > 0) {
sharedStrings <- readLines(sharedStringsXML, warn = FALSE, encoding = "UTF-8")
sharedStrings <- readUTF8(sharedStringsXML)
sharedStrings <- paste(sharedStrings, collapse = "\n")
sharedStrings <- removeHeadTag(sharedStrings)

Expand Down Expand Up @@ -419,7 +419,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {

## xl\theme
if (length(themeXML) > 0) {
wb$theme <- removeHeadTag(paste(unlist(lapply(sort(themeXML)[[1]], function(x) readLines(x, warn = FALSE, encoding = "UTF-8"))), collapse = ""))
wb$theme <- removeHeadTag(paste(unlist(lapply(sort(themeXML)[[1]], readUTF8)), collapse = ""))
}


Expand Down Expand Up @@ -521,7 +521,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {

xml <- lapply(1:length(allRels), function(i) {
if (haveRels[i]) {
xml <- readLines(allRels[[i]], warn = FALSE, encoding = "UTF-8")
xml <- readUTF8(allRels[[i]])
xml <- removeHeadTag(xml)
xml <- gsub("<Relationships .*?>", "", xml)
xml <- gsub("</Relationships>", "", xml)
Expand Down Expand Up @@ -622,7 +622,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {
tablesXML <- tablesXML[sprintf("table%s.xml", unlist(tables))]

## tables are now in correct order so we can read them in as they are
wb$tables <- sapply(tablesXML, function(x) removeHeadTag(paste(readLines(x, warn = FALSE), collapse = "")))
wb$tables <- sapply(tablesXML, function(x) removeHeadTag(paste(readUTF8(x), collapse = "")))

## pull out refs and attach names
refs <- regmatches(wb$tables, regexpr('(?<=ref=")[0-9A-Z:]+', wb$tables, perl = TRUE))
Expand Down Expand Up @@ -690,14 +690,14 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {
hasDrawing <- sapply(drawXMLrelationship, length) > 0 ## which sheets have a drawing

if (length(drawingRelsXML) > 0) {
dRels <- lapply(drawingRelsXML, readLines, warn = FALSE)
dRels <- lapply(drawingRelsXML, readUTF8)
dRels <- unlist(lapply(dRels, removeHeadTag))
dRels <- gsub("<Relationships .*?>", "", dRels)
dRels <- gsub("</Relationships>", "", dRels)
}

if (length(drawingsXML) > 0) {
dXML <- lapply(drawingsXML, readLines, warn = FALSE, encoding = "UTF-8")
dXML <- lapply(drawingsXML, readUTF8)
dXML <- unlist(lapply(dXML, removeHeadTag))
dXML <- gsub("<xdr:wsDr .*?>", "", dXML)
dXML <- gsub("</xdr:wsDr>", "", dXML)
Expand Down Expand Up @@ -753,7 +753,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {
ind <- grepl(target, vmlDrawingXML)

if (any(ind)) {
txt <- paste(readLines(vmlDrawingXML[ind], warn = FALSE), collapse = "\n")
txt <- paste(readUTF8(vmlDrawingXML[ind]), collapse = "\n")
txt <- removeHeadTag(txt)

i1 <- regexpr("<v:shapetype", txt, fixed = TRUE)
Expand Down Expand Up @@ -792,7 +792,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {
ind <- grepl(target, vmlDrawingXML)

if (any(ind)) {
txt <- paste(readLines(vmlDrawingXML[ind], warn = FALSE), collapse = "\n")
txt <- paste(readUTF8(vmlDrawingXML[ind]), collapse = "\n")
txt <- removeHeadTag(txt)

cd <- unique(getNodes(xml = txt, tagIn = "<x:ClientData"))
Expand All @@ -803,7 +803,7 @@ loadWorkbook <- function(file, xlsxFile = NULL, isUnzipped = FALSE) {
target <- unlist(lapply(commentXMLrelationship[[i]], function(x) regmatches(x, gregexpr('(?<=Target=").*?"', x, perl = TRUE))[[1]]))
target <- basename(gsub('"$', "", target))

txt <- paste(readLines(commentsXML[grepl(target, commentsXML)], warn = FALSE), collapse = "\n")
txt <- paste(readUTF8(commentsXML[grepl(target, commentsXML)]), collapse = "\n")
txt <- removeHeadTag(txt)

authors <- getNodes(xml = txt, tagIn = "<author>")
Expand Down
6 changes: 3 additions & 3 deletions R/readWorkbook.R
Original file line number Diff line number Diff line change
Expand Up @@ -172,14 +172,14 @@ read.xlsx.default <- function(xlsxFile,

## get workbook names
workbookRelsXML <-
paste(readLines(workbookRelsXML, warn = FALSE, encoding = "UTF-8"),
paste(readUTF8(workbookRelsXML),
collapse = ""
)
workbookRelsXML <-
getChildlessNode(xml = workbookRelsXML, tag = "<Relationship ")

workbook <-
unlist(readLines(workbook, warn = FALSE, encoding = "UTF-8"))
unlist(readUTF8(workbook))
workbook <- removeHeadTag(workbook)

sheets <-
Expand Down Expand Up @@ -499,7 +499,7 @@ read.xlsx.default <- function(xlsxFile,
}

stylesXML <- xmlFiles[grepl("styles.xml", xmlFiles)]
styles <- readLines(stylesXML, warn = FALSE)
styles <- readUTF8(stylesXML)
styles <- removeHeadTag(styles)

## Number formats
Expand Down
4 changes: 4 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

readUTF8 <- function(x) {
readLines(x, warn = FALSE, encoding = "UTF-8")
}
8 changes: 3 additions & 5 deletions R/wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -2716,8 +2716,6 @@ createNamedRegion <- function(wb, sheet, cols, rows, name) {

if (tolower(name) %in% ex_names) {
stop(sprintf("Named region with name '%s' already exists!", name))
} else if (grepl("[^A-Z0-9_\\.]", name[1], ignore.case = TRUE)) {
stop("Invalid characters in name")
} else if (grepl("^[A-Z]{1,3}[0-9]+$", name)) {
stop("name cannot look like a cell reference.")
}
Expand Down Expand Up @@ -2800,7 +2798,7 @@ getNamedRegions.default <- function(x) {
xmlFiles <- unzip(x, exdir = xmlDir)

workbook <- xmlFiles[grepl("workbook.xml$", xmlFiles, perl = TRUE)]
workbook <- unlist(readLines(workbook, warn = FALSE, encoding = "UTF-8"))
workbook <- unlist(readUTF8(workbook))

dn <- getChildlessNode(xml = removeHeadTag(workbook), tag = "<definedName ")
if (length(dn) == 0) {
Expand Down Expand Up @@ -3269,7 +3267,7 @@ getDateOrigin <- function(xlsxFile) {
on.exit(unlink(xmlDir, recursive = TRUE), add = TRUE)

workbook <- xmlFiles[grepl("workbook.xml$", xmlFiles, perl = TRUE)]
workbook <- paste(unlist(readLines(workbook, warn = FALSE)), collapse = "")
workbook <- paste(unlist(readUTF8(workbook)), collapse = "")

if (grepl('date1904="1"|date1904="true"', workbook, ignore.case = TRUE)) {
origin <- "1904-01-01"
Expand Down Expand Up @@ -3312,7 +3310,7 @@ getSheetNames <- function(file) {
on.exit(unlink(xmlDir, recursive = TRUE), add = TRUE)

workbook <- xmlFiles[grepl("workbook.xml$", xmlFiles, perl = TRUE)]
workbook <- readLines(workbook, warn = FALSE, encoding = "UTF-8")
workbook <- readUTF8(workbook)
workbook <- removeHeadTag(workbook)
sheets <- unlist(regmatches(workbook, gregexpr("(?<=<sheets>).*(?=</sheets>)", workbook, perl = TRUE)))
sheets <- unlist(regmatches(sheets, gregexpr("<sheet[^>]*>", sheets, perl = TRUE)))
Expand Down
13 changes: 13 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,18 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// markUTF8
CharacterVector markUTF8(CharacterVector x, bool clone);
RcppExport SEXP _openxlsx_markUTF8(SEXP xSEXP, SEXP cloneSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< CharacterVector >::type x(xSEXP);
Rcpp::traits::input_parameter< bool >::type clone(cloneSEXP);
rcpp_result_gen = Rcpp::wrap(markUTF8(x, clone));
return rcpp_result_gen;
END_RCPP
}
// loadworksheets
SEXP loadworksheets(Reference wb, List styleObjects, std::vector<std::string> xmlFiles, LogicalVector is_chart_sheet);
RcppExport SEXP _openxlsx_loadworksheets(SEXP wbSEXP, SEXP styleObjectsSEXP, SEXP xmlFilesSEXP, SEXP is_chart_sheetSEXP) {
Expand Down Expand Up @@ -456,6 +468,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_openxlsx_cppReadFile", (DL_FUNC) &_openxlsx_cppReadFile, 1},
{"_openxlsx_read_file_newline", (DL_FUNC) &_openxlsx_read_file_newline, 1},
{"_openxlsx_get_letters", (DL_FUNC) &_openxlsx_get_letters, 0},
{"_openxlsx_markUTF8", (DL_FUNC) &_openxlsx_markUTF8, 2},
{"_openxlsx_loadworksheets", (DL_FUNC) &_openxlsx_loadworksheets, 4},
{"_openxlsx_getNodes", (DL_FUNC) &_openxlsx_getNodes, 2},
{"_openxlsx_getOpenClosedNode", (DL_FUNC) &_openxlsx_getOpenClosedNode, 3},
Expand Down
16 changes: 16 additions & 0 deletions src/helper_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -312,3 +312,19 @@ std::vector<std::string> get_letters(){
return(LETTERS);

}


// [[Rcpp::export]]
CharacterVector markUTF8(CharacterVector x, bool clone) {
CharacterVector out;
if (clone) {
out = Rcpp::clone(x);
} else {
out = x;
}
const size_t n = x.size();
for (size_t i = 0; i < n; ++i) {
out[i] = Rf_mkCharCE(x[i], CE_UTF8);
}
return out;
}
17 changes: 9 additions & 8 deletions src/load_workbook.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -778,8 +778,8 @@ SEXP getNodes(std::string xml, std::string tagIn){

}

return wrap(r) ;

CharacterVector out = wrap(r);
return markUTF8(out);
}


Expand Down Expand Up @@ -814,9 +814,8 @@ SEXP getOpenClosedNode(std::string xml, std::string open_tag, std::string close_

}

return wrap(r) ;


CharacterVector out = wrap(r);
return markUTF8(out);
}


Expand Down Expand Up @@ -852,7 +851,7 @@ SEXP getAttr(CharacterVector x, std::string tag){
}
}

return wrap(r) ;
return markUTF8(r); // no need to wrap as r is already a CharacterVector

}

Expand Down Expand Up @@ -912,7 +911,8 @@ CharacterVector getChildlessNode(std::string xml, std::string tag){

}

return wrap(r) ;
CharacterVector out = wrap(r);
return markUTF8(out);

}

Expand Down Expand Up @@ -960,7 +960,8 @@ CharacterVector get_extLst_Major(std::string xml){

}

return wrap(r) ;
CharacterVector out = wrap(r);
return markUTF8(out);

}

Expand Down
3 changes: 1 addition & 2 deletions src/openxlsx.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,5 +81,4 @@ LogicalVector isInternalHyperlink(CharacterVector x);
// helper functions
string itos(int i);
SEXP write_file(std::string parent, std::string xmlText, std::string parentEnd, std::string R_fileName);


CharacterVector markUTF8(CharacterVector x, bool clone = false);
5 changes: 3 additions & 2 deletions src/write_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ SEXP buildMatrixMixed(CharacterVector v,


// If column is date class and no strings exist in column
if( (std::find(dateCols.begin(), dateCols.end(), i) != dateCols.end()) &
if( (std::find(dateCols.begin(), dateCols.end(), i) != dateCols.end()) &&
(std::find(charCols.begin(), charCols.end(), i) == charCols.end()) ){

// these are all dates and no characters --> safe to convert numerics
Expand Down Expand Up @@ -342,7 +342,8 @@ CharacterVector build_table_xml(std::string table, std::string tableStyleXML, st
table = table + tableCols + tableStyleXML + "</table>";


return wrap(table);
CharacterVector out = wrap(table);
return markUTF8(out);

}

Expand Down
4 changes: 2 additions & 2 deletions tests/testthat/test-writing_sheet_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ test_that("Writing sheetData rows XML - iris", {
openxlsx::write.xlsx(iris, temp_file)

unzip(temp_file, exdir = tempdir())
x <- readLines(file.path(tempdir(), "xl", "worksheets", "sheet1.xml"), warn = FALSE, encoding = "UTF-8")
x <- readUTF8(file.path(tempdir(), "xl", "worksheets", "sheet1.xml"))
rows <- unlist(regmatches(x = x, gregexpr("<row.*?</row>", x)))

expected_rows <- c(
Expand Down Expand Up @@ -187,7 +187,7 @@ test_that("Writing sheetData rows XML - mtcars", {
openxlsx::write.xlsx(mtcars, temp_file, row.names = TRUE)

unzip(temp_file, exdir = tempdir())
x <- readLines(file.path(tempdir(), "xl", "worksheets", "sheet1.xml"), warn = FALSE, encoding = "UTF-8")
x <- readUTF8(file.path(tempdir(), "xl", "worksheets", "sheet1.xml"))
rows <- unlist(regmatches(x = x, gregexpr("<row.*?</row>", x)))

expected_rows <- c(
Expand Down