Skip to content

Commit

Permalink
change names of tweedie parameters to be consistent with R
Browse files Browse the repository at this point in the history
  • Loading branch information
actuaryzhang committed Mar 13, 2017
1 parent aeeb3f7 commit 4cffc40
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 19 deletions.
33 changes: 19 additions & 14 deletions R/pkg/R/mllib_regression.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,18 +56,18 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
#' \code{Gamma}, \code{poisson} and \code{tweedie}.
#'
#' Note that there are two ways to specify the tweedie family.
#' a) Set \code{family = "tweedie"} and specify the variancePower and linkPower
#' a) Set \code{family = "tweedie"} and specify the var.power and link.power
#' b) When the \code{statmod} package is loaded, specify the tweedie family using the
#' family definition therein, i.e., \code{tweedie(var.power, link.power)}.
#' @param tol positive convergence tolerance of iterations.
#' @param maxIter integer giving the maximal number of IRLS iterations.
#' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance
#' weights as 1.0.
#' @param regParam regularization parameter for L2 regularization.
#' @param variancePower the power in the variance function of the Tweedie distribution which provides
#' @param var.power the power in the variance function of the Tweedie distribution, which provides
#' the relationship between the variance and mean of the distribution. Only
#' applicable to the Tweedie family.
#' @param linkPower the index in the power link function. Only applicable to the Tweedie family.
#' @param link.power the index in the power link function. Only applicable to the Tweedie family.
#' @param ... additional arguments passed to the method.
#' @aliases spark.glm,SparkDataFrame,formula-method
#' @return \code{spark.glm} returns a fitted generalized linear model.
Expand Down Expand Up @@ -96,19 +96,24 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
#'
#' # fit tweedie model
#' model <- spark.glm(df, Freq ~ Sex + Age, family = "tweedie",
#' variancePower = 1.2, linkPower = 0)
#' var.power = 1.2, link.power = 0)
#' summary(model)
#'
#' # use the tweedie family from statmod
#' library(statmod)
#' model <- spark.glm(df, Freq ~ Sex + Age, family = tweedie(1.2, 0))
#' summary(model)
#' }
#' @note spark.glm since 2.0.0
#' @seealso \link{glm}, \link{read.ml}
setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL,
regParam = 0.0, variancePower = 0.0, linkPower = 1.0 - variancePower) {
regParam = 0.0, var.power = 0.0, link.power = 1.0 - var.power) {

if (is.character(family)) {
# Handle when family = "tweedie"
if (tolower(family) == "tweedie") {
family <- list(family = "tweedie", link = "linkNotUsed")
family <- list(family = "tweedie", link = NULL)
} else {
family <- get(family, mode = "function", envir = parent.frame())
}
Expand All @@ -122,9 +127,9 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
}
# Handle when family = statmod::tweedie()
if (tolower(family$family) == "tweedie" && !is.null(family$variance)) {
variancePower <- log(family$variance(exp(1)))
linkPower <- log(family$linkfun(exp(1)))
family <- list(family = "tweedie", link = "linkNotUsed")
var.power <- log(family$variance(exp(1)))
link.power <- log(family$linkfun(exp(1)))
family <- list(family = "tweedie", link = NULL)
}

formula <- paste(deparse(formula), collapse = "")
Expand All @@ -138,7 +143,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
"fit", formula, data@sdf, tolower(family$family), family$link,
tol, as.integer(maxIter), weightCol, regParam,
as.double(variancePower), as.double(linkPower))
as.double(var.power), as.double(link.power))
new("GeneralizedLinearRegressionModel", jobj = jobj)
})

Expand All @@ -158,8 +163,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
#' weights as 1.0.
#' @param epsilon positive convergence tolerance of iterations.
#' @param maxit integer giving the maximal number of IRLS iterations.
#' @param variancePower the index of the power variance function in the Tweedie family.
#' @param linkPower the index of the power link function in the Tweedie family.
#' @param var.power the index of the power variance function in the Tweedie family.
#' @param link.power the index of the power link function in the Tweedie family.
#' @return \code{glm} returns a fitted generalized linear model.
#' @rdname glm
#' @export
Expand All @@ -175,9 +180,9 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
#' @seealso \link{spark.glm}
setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDataFrame"),
function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL,
variancePower = 0.0, linkPower = 1.0 - variancePower) {
var.power = 0.0, link.power = 1.0 - var.power) {
spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol,
variancePower = variancePower, linkPower = linkPower)
var.power = var.power, link.power = link.power)
})

# Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary().
Expand Down
4 changes: 2 additions & 2 deletions R/pkg/inst/tests/testthat/test_mllib_regression.R
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ test_that("spark.glm and predict", {

# tweedie family
model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
family = "tweedie", variancePower = 1.2, linkPower = 0.0)
family = "tweedie", var.power = 1.2, link.power = 0.0)
prediction <- predict(model, training)
expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
vals <- collect(select(prediction, "prediction"))
Expand Down Expand Up @@ -269,7 +269,7 @@ test_that("glm and predict", {

# tweedie family
model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training,
family = "tweedie", variancePower = 1.2, linkPower = 0.0)
family = "tweedie", var.power = 1.2, link.power = 0.0)
prediction <- predict(model, training)
expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
vals <- collect(select(prediction, "prediction"))
Expand Down
6 changes: 3 additions & 3 deletions R/pkg/vignettes/sparkr-vignettes.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -683,7 +683,7 @@ There are three ways to specify the `family` argument.
* Result returned by a family function, e.g. `family = poisson(link = log)`.

* Note that there are two ways to specify the tweedie family:
a) Set `family = "tweedie"` and specify the `variancePower` and `linkPower`
a) Set `family = "tweedie"` and specify the `var.power` and `link.power`
b) When the `statmod` package is loaded, specify the tweedie family using the family definition therein, i.e., `tweedie()`.

For more information regarding the families and their link functions, see the Wikipedia page [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model).
Expand All @@ -702,13 +702,13 @@ head(select(gaussianFitted, "model", "prediction", "mpg", "wt", "hp"))

The following is the same fit using the tweedie family:
```{r}
tweedieGLM1 <- spark.glm(carsDF, mpg ~ wt + hp, family = "tweedie", variancePower = 0.0)
tweedieGLM1 <- spark.glm(carsDF, mpg ~ wt + hp, family = "tweedie", var.power = 0.0)
summary(tweedieGLM1)
```
We can try other distributions in the tweedie family, for example, a compound Poisson distribution with a log link:
```{r}
tweedieGLM2 <- spark.glm(carsDF, mpg ~ wt + hp, family = "tweedie",
variancePower = 1.2, linkPower = 0.0)
var.power = 1.2, link.power = 0.0)
summary(tweedieGLM2)
```

Expand Down

0 comments on commit 4cffc40

Please sign in to comment.