Commit c4cce0f

Merge pull request #455 from databrickslabs/r/fix/312sparklyrapi
R package fixes & improvements

Milos Colic authored Nov 29, 2023
2 parents 1952d3b + 5674422
Showing 14 changed files with 770 additions and 172 deletions.
15 changes: 12 additions & 3 deletions .github/actions/r_build/action.yml
@@ -23,8 +23,8 @@ runs:
name: Download and unpack Spark
shell: bash
run: |
-wget -P /usr/spark-download/raw https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop2.7.tgz
-tar zxvf /usr/spark-download/raw/spark-3.2.1-bin-hadoop2.7.tgz -C /usr/spark-download/unzipped
+wget -P /usr/spark-download/raw https://archive.apache.org/dist/spark/spark-${{ matrix.spark }}/spark-${{ matrix.spark }}-bin-hadoop3.tgz
+tar zxvf /usr/spark-download/raw/spark-${{ matrix.spark }}-bin-hadoop3.tgz -C /usr/spark-download/unzipped
- name: Create R environment
shell: bash
run: |
@@ -50,16 +50,25 @@ runs:
run: |
cd R
Rscript --vanilla generate_docs.R
+env:
+SPARK_HOME: /usr/spark-download/unzipped/spark-${{ matrix.spark }}-bin-hadoop3
- name: Build R package
shell: bash
run: |
cd R
Rscript --vanilla build_r_package.R
-- name: Test R package
+env:
+SPARK_HOME: /usr/spark-download/unzipped/spark-${{ matrix.spark }}-bin-hadoop3
+- name: Test SparkR package
shell: bash
run: |
cd R/sparkR-mosaic
Rscript --vanilla tests.R
+- name: Test sparklyr package
+shell: bash
+run: |
+cd R/sparklyr-mosaic
+Rscript --vanilla tests.R
- name: Copy R artifacts to GH Actions run
shell: bash
run: |
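Note on this change: the `${{ matrix.spark }}` references assume the calling workflow supplies a `spark` version through its build matrix, so the action is no longer pinned to Spark 3.2.1. With `SPARK_HOME` now exported per step, the R scripts in this PR locate SparkR from the environment, roughly like this (the path shown is illustrative):

spark_location <- Sys.getenv("SPARK_HOME")  # e.g. /usr/spark-download/unzipped/spark-3.4.0-bin-hadoop3
library(SparkR, lib.loc = c(file.path(spark_location, "R", "lib")))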
1 change: 1 addition & 0 deletions R/.gitignore
@@ -1,2 +1,3 @@
**/.Rhistory
**/*.tar.gz
+/sparklyr-mosaic/metastore_db/
9 changes: 1 addition & 8 deletions R/build_r_package.R
@@ -1,13 +1,6 @@
spark_location <- "/usr/spark-download/unzipped/spark-3.2.1-bin-hadoop2.7"
Sys.setenv(SPARK_HOME = spark_location)

spark_location <- Sys.getenv("SPARK_HOME")
library(SparkR, lib.loc = c(file.path(spark_location, "R", "lib")))


library(pkgbuild)
library(sparklyr)



build_mosaic_bindings <- function(){
## build package
95 changes: 50 additions & 45 deletions R/generate_R_bindings.R
@@ -8,14 +8,14 @@ library(methods)

parser <- function(x){
#split on left bracket to get name
-splitted = strsplit(x, "(", fixed=T)[[1]]
+splitted <- strsplit(x, "(", fixed=T)[[1]]
# extract function name
-function_name = splitted[1]
+function_name <- splitted[1]
# remove the trailing bracket
-args = gsub( ")", '',splitted[2], fixed=T)
-args = strsplit(args, ", ", fixed=T)[[1]]
-args = lapply(args, function(x){strsplit(x, ": ", fixed=T)}[[1]])
-output = list(
+args <- gsub( ")", '',splitted[2], fixed=T)
+args <- strsplit(args, ", ", fixed=T)[[1]]
+args <- lapply(args, function(x){strsplit(x, ": ", fixed=T)}[[1]])
+output <- list(
"function_name" = function_name
,"args"=args
)
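For reference, `parser` turns one extracted Scala signature string into a function name plus (argument, type) pairs. A minimal sketch with a hypothetical signature (real inputs come from MosaicContext.scala):

x <- "st_buffer(geom: Column, radius: Double)"  # illustrative signature
parsed <- parser(x)
parsed$function_name  # "st_buffer"
parsed$args           # list(c("geom", "Column"), c("radius", "Double"))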
@@ -24,8 +24,8 @@ parser <- function(x){

############################################################
build_generic <- function(input){
-function_name = input$function_name
-args = lapply(input$args, function(x){x[1]})
+function_name <- input$function_name
+args <- lapply(input$args, function(x){x[1]})
paste0(
'#\' @rdname ', function_name, '
setGeneric(
@@ -35,21 +35,9 @@ build_generic <- function(input){
')
}


-build_generic2 <- function(input){
-function_name = input$function_name
-args = lapply(input$args, function(x){x[1]})
-paste0(
-'#\' @rdname ', function_name, '
-setGeneric(
-name="',function_name,'"
-,def=function(',paste0(args, collapse=','), ') {standardGeneric("',function_name, '")}
-)
-')
-}
############################################################
build_column_specifiers <- function(input){
-args = lapply(input$args, function(x){x[1]})
+args <- lapply(input$args, function(x){x[1]})
build_column_specifier <- function(arg){
return(paste0(arg, '@jc'))
}
@@ -62,29 +62,32 @@ build_column_specifiers <- function(input){
}
############################################################
build_method<-function(input){
-function_name = input$function_name
-arg_names = lapply(input$args, function(x){c(x[1])})
+function_name <- input$function_name
+arg_names <- lapply(input$args, function(x){c(x[1])})
#this handles converting non-Column arguments to their R equivalents
argument_parser <- function(x){
if(x[2] == 'Int'){
x[2] = "numeric"
x[2] <- "numeric"
}
else if(x[2] == 'String'){
x[2] = "character"
x[2] <- "character"
}
else if(x[2] == 'Double'){
x[2] = "numeric"
x[2] <- "numeric"
}
else if(x[2] == 'Boolean') {
x[2] <- "logical"
}
x
}
# convert scala type to R types
-args = lapply(input$args, argument_parser)
+args <- lapply(input$args, argument_parser)
# take a copy for building the docs
-param_args = args
+param_args <- args
# wrap the strings in speech marks
-args = lapply(args, function(x){c(x[1], paste0("'", x[2], "'"))})
+args <- lapply(args, function(x){c(x[1], paste0("'", x[2], "'"))})
# collapse down to a single string
-args = lapply(args, function(x){paste0(x, collapse= ' = ')})
+args <- lapply(args, function(x){paste0(x, collapse= ' = ')})
column_specifiers <- build_column_specifiers(input)
docstring <- paste0(
c(paste0(c("#'", function_name), collapse=" "),
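The new `Boolean` branch completes the Scala-to-R type mapping used when generating argument checks; `Column` arguments fall through unchanged. A self-contained sketch of that mapping (the helper name here is hypothetical; in the generated bindings this logic lives inside `build_method`):

scala_to_r_type <- function(x){
  # x is a c(name, scala_type) pair; unmatched types pass through as-is
  switch(x[2],
    "Int"     = c(x[1], "numeric"),
    "String"  = c(x[1], "character"),
    "Double"  = c(x[1], "numeric"),
    "Boolean" = c(x[1], "logical"),
    x)
}
scala_to_r_type(c("keepCoreGeometries", "Boolean"))  # c("keepCoreGeometries", "logical")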
@@ -116,48 +116,62 @@ build_method<-function(input){
############################################################
get_function_names <- function(scala_file_path){
#scala_file_path = "~/Documents/mosaic/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala"
-scala_file_object = file(scala_file_path)
+scala_file_object <- file(scala_file_path)

-scala_file = readLines(scala_file_object)
+scala_file <- readLines(scala_file_object)
closeAllConnections()
# find where the methods start
-start_string = " object functions extends Serializable {"
-start_index = grep(start_string, scala_file, fixed=T) + 1
+start_string <- " object functions extends Serializable {"
+start_index <- grep(start_string, scala_file, fixed=T) + 1
-# find the methods end - will be the next curly bracket
+# need to find where the matching end brace for the start string is located.
+# counter starts at 1 as the start string includes the opening brace
-brace_counter = 1
+brace_counter <- 1

for(i in start_index : length(scala_file)){
# split the string into characters - returns a list so unlist it
line_characters <- unlist(strsplit(scala_file[i], ''))
# count the number of brace opens
-n_opens = sum(grepl("{", line_characters, fixed=T))
+n_opens <- sum(grepl("{", line_characters, fixed=T))
# count the number of brace closes
-n_closes = sum(grepl("}", line_characters, fixed=T))
+n_closes <- sum(grepl("}", line_characters, fixed=T))
# update the counter
brace_counter <- brace_counter + n_opens - n_closes
if (brace_counter == 0) break

}
-methods_to_bind = scala_file[start_index:i]
+methods_to_bind <- scala_file[start_index:i]
# remove any line that doesn't start with def
-def_mask = grepl('\\s+def .*', methods_to_bind)
-methods_to_bind = methods_to_bind[def_mask]
+def_mask <- grepl('\\s+def .*', methods_to_bind)
+methods_to_bind <- methods_to_bind[def_mask]
# parse the string to get just the function_name(input:type...) pattern
-methods_to_bind = unlist(lapply(methods_to_bind, function(x){
+methods_to_bind <- unlist(lapply(methods_to_bind, function(x){
substr(x
, regexpr("def ", x, fixed=T)[1]+4 # get the starting point to account for whitespace
, regexpr("): ", x, fixed=T)[1] # get the end point of where the return is.
)
}
))
-sort(methods_to_bind, T)
+sort_methods_by_argcount(methods_to_bind)
}

+############################################################
+sort_methods_by_argcount <- function(methods) {
+# Split on the opening bracket to get the method names, and count commas
+# as a proxy for each method's argument count
+method_names <- sapply(strsplit(methods, "\\("), function(x) x[1])
+argcount <- sapply(strsplit(methods, ","), function(x) length(x) - 1)
+
+# Use the order function to sort first alphabetically and then by argument count
+order_indices <- order(method_names, argcount)
+
+# Return the sorted list
+methods_sorted <- methods[order_indices]
+return(methods_sorted)
+}

############################################################
build_sparklyr_mosaic_function <- function(input){
-function_name = input$function_name
+function_name <- input$function_name
paste0(

"#' ", function_name, "\n\n",
@@ -191,7 +196,7 @@ main <- function(scala_file_path){
##########################
##########################
# build sparkr functions
-function_data = get_function_names(scala_file_path)
+function_data <- get_function_names(scala_file_path)
parsed <- lapply(function_data, parser)


@@ -223,9 +228,9 @@ main <- function(scala_file_path){
# supplementary files
sparkr_supplementary_files <- c("sparklyr-mosaic/enableMosaic.R", "sparklyr-mosaic/sparkFunctions.R")
copy_supplementary_file(sparkr_supplementary_files, "sparklyr-mosaic/sparklyrMosaic/R/")

}


args <- commandArgs(trailingOnly = T)
if (length(args) != 1){
stop("Please provide the MosaicContext.scala file path to generate_sparkr_functions.R")
4 changes: 1 addition & 3 deletions R/generate_docs.R
@@ -1,6 +1,4 @@
spark_location <- "/usr/spark-download/unzipped/spark-3.2.1-bin-hadoop2.7"
Sys.setenv(SPARK_HOME = spark_location)

spark_location <- Sys.getenv("SPARK_HOME")
library(SparkR, lib.loc = c(file.path(spark_location, "R", "lib")))
library(roxygen2)

4 changes: 1 addition & 3 deletions R/install_deps.R
@@ -1,5 +1,3 @@
options(repos = c(CRAN = "https://packagemanager.posit.co/cran/__linux__/focal/latest"))

install.packages("pkgbuild")
install.packages("roxygen2")
install.packages("sparklyr")
install.packages(c("pkgbuild", "testthat", "roxygen2", "sparklyr"))
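The consolidated call also pulls in `testthat`, in line with the new SparkR and sparklyr test steps added to the action above. A hypothetical local equivalent of the CI test step (the Spark path is a placeholder):

Sys.setenv(SPARK_HOME = "/usr/spark-download/unzipped/spark-3.4.0-bin-hadoop3")  # placeholder path
setwd("R/sparklyr-mosaic")   # run from the repo root
source("tests.R")            # the suite is assumed to attach testthat itself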