ARROW-6649: [R] print methods for Array, ChunkedArray, Table, RecordBatch

See the readme and the new tests for example output. This patch also fixes a validation bug in `dictionary()`, aliases that to `DictionaryType$create`, and adds default arguments.

Closes #5492 from nealrichardson/print-methods and squashes the following commits:

c092d30 <Neal Richardson> Merge branch 'print-methods' of github.com:nealrichardson/arrow into print-methods
02afb89 <Neal Richardson> Prettier printing of dictionary type's ordered attribute
5750100 <Neal Richardson> indices in the docs too
6be0328 <Neal Richardson> indices
2d4e744 <Neal Richardson> Add/improve print methods for Array, ChunkedArray, Table, RecordBatch

Authored-by: Neal Richardson <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
apache · Sep 25, 2019 · a89c803 · a89c803
1 parent 2c7fb24
commit a89c803
Show file tree

Hide file tree

Showing 19 changed files with 209 additions and 55 deletions.
diff --git a/r/R/array.R b/r/R/array.R
@@ -74,7 +74,10 @@ Array <- R6Class("Array",
     ApproxEquals = function(other) Array__ApproxEquals(self, other),
     data = function() shared_ptr(ArrayData, Array__data(self)),
     as_vector = function() Array__as_vector(self),
-    ToString = function() Array__ToString(self),
+    ToString = function() {
+      # Put the type on its own first line, wrapped in angle brackets, and
+      # follow it with the rendering returned by Array__ToString().
+      type_line <- sprintf("<%s>", self$type$ToString())
+      paste(type_line, Array__ToString(self), sep = "\n")
+    },
     Slice = function(offset, length = NULL){
       if (is.null(length)) {
         shared_ptr(Array, Array__Slice1(self, offset))

diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
@@ -54,15 +54,18 @@ Object <- R6Class("Object",
       self$`.:xp:.` <- xp
     },
     print = function(...){
-      cat(class(self)[[1]], "\n")
+      cat(class(self)[[1]], "\n", sep = "")
       if (!is.null(self$ToString)){
-        cat(self$ToString(), "\n")
+        cat(self$ToString(), "\n", sep = "")
       }
       invisible(self)
     }
   )
 )
 
+# Inequality is defined as the negation of `==`, so the two operators can
+# never disagree for objects of class "Object".
+#' @export
+`!=.Object` <- function(lhs, rhs) {
+  are_equal <- lhs == rhs
+  !are_equal
+}
+
 shared_ptr <- function(class, xp) {
   if (!shared_ptr_is_null(xp)) class$new(xp)
 }

diff --git a/r/R/chunked-array.R b/r/R/chunked-array.R
@@ -69,6 +69,20 @@ ChunkedArray <- R6Class("ChunkedArray", inherit = Object,
     },
     Validate = function() {
       ChunkedArray__Validate(self)
+    },
+    ToString = function() {  # string form of the ChunkedArray, built from its first chunk
+      out <- self$chunk(0)$ToString()  # NOTE(review): assumes num_chunks >= 1 - confirm
+      if (self$num_chunks > 1) {
+        # Regardless of whether the first array prints with an ellipsis, we
+        # need to ellipsize because there is more data than is contained in
+        # this chunk; only the first chunk's values are ever shown.
+        if (grepl("...\n", out, fixed = TRUE)) {
+          out <- sub("\\.\\.\\..*$", "...\n]", out)  # replace from the first "..." onward with "...\n]"
+        } else {
+          out <- sub("\\n\\]$", ",\n  ...\n]", out)  # insert a ", ..." entry before the closing "]"
+        }
+      }
+      out
     }
   ),
   active = list(

diff --git a/r/R/dictionary.R b/r/R/dictionary.R
@@ -31,28 +31,39 @@
 #' @name DictionaryType
 DictionaryType <- R6Class("DictionaryType",
   inherit = FixedWidthType,
-
+  public = list(
+    ToString = function() {  # string form of this dictionary type
+      prettier_dictionary_type(DataType__ToString(self))  # tidy the "ordered" attribute in the DataType__ToString() output
+    }
+  ),
   active = list(
     index_type = function() DataType$create(DictionaryType__index_type(self)),
     value_type = function() DataType$create(DictionaryType__value_type(self)),
     name = function() DictionaryType__name(self),
     ordered = function() DictionaryType__ordered(self)
   )
 )
+# Build a DictionaryType from an index type, a value type and an ordered flag.
+# With no arguments the result uses int32() indices, utf8() values and
+# ordered = FALSE.
+DictionaryType$create <- function(index_type = int32(),
+                                  value_type = utf8(),
+                                  ordered = FALSE) {
+  # Both type arguments are checked to be DataType objects before the type
+  # is initialized.
+  assert_is(index_type, "DataType")
+  assert_is(value_type, "DataType")
+  type_ptr <- DictionaryType__initialize(index_type, value_type, ordered)
+  shared_ptr(DictionaryType, type_ptr)
+}
 
 #' Create a dictionary type
 #'
-#' @param index_type index type, e.g. [int32()]
-#' @param value_type value type, probably [utf8()]
-#' @param ordered Is this an ordered dictionary ?
+#' @param index_type A DataType for the indices (default [int32()])
+#' @param value_type A DataType for the values (default [utf8()])
+#' @param ordered Is this an ordered dictionary (default `FALSE`)?
 #'
 #' @return A [DictionaryType]
 #' @seealso [Other Arrow data types][data-type]
 #' @export
-dictionary <- function(index_type, value_type, ordered = FALSE) {
-  assert_that(
-    inherits(index_type, "DataType"),
-    inherits(index_type, "DataType")
-  )
-  shared_ptr(DictionaryType, DictionaryType__initialize(index_type, value_type, ordered))
+dictionary <- DictionaryType$create
+
+prettier_dictionary_type <- function(x) {
+  # Prettier format the "ordered" attribute of dictionary types in `x`, a
+  # character vector of type/field/schema strings:
+  #   ", ordered=0" is dropped (ordered = FALSE is the default), and
+  #   "ordered=1" becomes "ordered".
+  # Returns a character vector of the same length as `x`.
+  #
+  # gsub() rather than sub(): a schema string can contain several dictionary
+  # fields, and sub() would only rewrite the first one.
+  x <- gsub(", ordered=0", "", x, fixed = TRUE)
+  gsub("ordered=1", "ordered", x, fixed = TRUE)
 }
diff --git a/r/R/field.R b/r/R/field.R
@@ -34,7 +34,7 @@
 Field <- R6Class("Field", inherit = Object,
   public = list(
     ToString = function() {
-      Field__ToString(self)
+      prettier_dictionary_type(Field__ToString(self))
     },
     Equals = function(other) {
       inherits(other, "Field") && Field__Equals(self, other)

diff --git a/r/R/record-batch.R b/r/R/record-batch.R
@@ -111,6 +111,7 @@ RecordBatch <- R6Class("RecordBatch", inherit = Object,
     },
 
     serialize = function() ipc___SerializeRecordBatch__Raw(self),
+    ToString = function() ToString_tabular(self),  # dimensions line followed by the reformatted schema lines
 
     cast = function(target_schema, safe = TRUE, options = cast_options(safe)) {
       assert_is(target_schema, "Schema")
@@ -246,3 +247,11 @@ tail.RecordBatch <- function(x, n = 6L, ...) {
   }
   x$Slice(n)
 }
+
+ToString_tabular <- function(x, ...) {
+  # Generic to work with both RecordBatch and Table.
+  #
+  # x:   an object exposing `$schema$ToString()` and supporting nrow()/ncol()
+  # ...: currently unused
+  # Returns a single string: a "<n> rows x <m> columns" line followed by one
+  # "$name <type>" line for each "name: type" line of the schema string.
+  sch <- unlist(strsplit(x$schema$ToString(), "\n"))
+  # Split each line at the FIRST ": ". The name group has to be non-greedy
+  # (hence perl = TRUE): nested types such as "list<item: int32>" contain
+  # ": " themselves, and a greedy match would split inside the type.
+  # Lines without ": " are left unchanged.
+  sch <- sub("^(.*?): (.*)$", "$\\1 <\\2>", sch, perl = TRUE)
+  dims <- sprintf("%s rows x %s columns", nrow(x), ncol(x))
+  paste(c(dims, sch), collapse = "\n")
+}
diff --git a/r/R/schema.R b/r/R/schema.R
@@ -49,7 +49,7 @@
 Schema <- R6Class("Schema",
   inherit = Object,
   public = list(
-    ToString = function() Schema__ToString(self),
+    ToString = function() prettier_dictionary_type(Schema__ToString(self)),
     num_fields = function() Schema__num_fields(self),
     field = function(i) shared_ptr(Field, Schema__field(self, i)),
     serialize = function() Schema__serialize(self),

diff --git a/r/R/table.R b/r/R/table.R
@@ -104,6 +104,7 @@ Table <- R6Class("Table", inherit = Object,
     field = function(i) shared_ptr(Field, Table__field(self, i)),
 
     serialize = function(output_stream, ...) write_table(self, output_stream, ...),
+    ToString = function() ToString_tabular(self),  # same tabular summary helper that RecordBatch uses
 
     cast = function(target_schema, safe = TRUE, options = cast_options(safe)) {
       assert_is(target_schema, "Schema")

diff --git a/r/R/type.R b/r/R/type.R
@@ -16,12 +16,6 @@
 # under the License.
 
 #' @include arrow-package.R
-
-#' @export
-`!=.Object` <- function(lhs, rhs){
-  !(lhs == rhs)
-}
-
 #' @title class arrow::DataType
 #'
 #' @usage NULL

diff --git a/r/README.Rmd b/r/README.Rmd
@@ -54,9 +54,13 @@ When installing from source, if the R and C++ library versions do not match, ins
 library(arrow)
 set.seed(24)
 
-tab <- Table$create(x = 1:10, y = rnorm(10))
-tab$schema
+tab <- Table$create(
+  x = 1:10,
+  y = rnorm(10),
+  z = as.factor(rep(c("b", "c"), 5))
+)
 tab
+tab$x
 as.data.frame(tab)
 ```
 

diff --git a/r/README.md b/r/README.md
@@ -69,25 +69,44 @@ Arrow C++ library first.
 library(arrow)
 set.seed(24)
 
-tab <- Table$create(x = 1:10, y = rnorm(10))
-tab$schema
-#> Schema 
-#> x: int32
-#> y: double
+tab <- Table$create(
+  x = 1:10,
+  y = rnorm(10),
+  z = as.factor(rep(c("b", "c"), 5))
+)
 tab
 #> Table
+#> 10 rows x 3 columns
+#> $x <int32>
+#> $y <double>
+#> $z <dictionary<values=string, indices=int8>>
+tab$x
+#> ChunkedArray
+#> <int32>
+#> [
+#>   1,
+#>   2,
+#>   3,
+#>   4,
+#>   5,
+#>   6,
+#>   7,
+#>   8,
+#>   9,
+#>   10
+#> ]
 as.data.frame(tab)
-#>     x            y
-#> 1   1 -0.545880758
-#> 2   2  0.536585304
-#> 3   3  0.419623149
-#> 4   4 -0.583627199
-#> 5   5  0.847460017
-#> 6   6  0.266021979
-#> 7   7  0.444585270
-#> 8   8 -0.466495124
-#> 9   9 -0.848370044
-#> 10 10  0.002311942
+#>     x            y z
+#> 1   1 -0.545880758 b
+#> 2   2  0.536585304 c
+#> 3   3  0.419623149 b
+#> 4   4 -0.583627199 c
+#> 5   5  0.847460017 b
+#> 6   6  0.266021979 c
+#> 7   7  0.444585270 b
+#> 8   8 -0.466495124 c
+#> 9   9 -0.848370044 b
+#> 10 10  0.002311942 c
 ```
 
 ## Installing a development version

diff --git a/r/man/dictionary.Rd b/r/man/dictionary.Rd
diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R
@@ -52,6 +52,11 @@ test_that("Array", {
   expect_equal(z_dbl$as_vector(), as.numeric(4:5))
 })
 
+test_that("Array print method includes type", {
+  x <- Array$create(c(1:10, 1:10, 1:5))  # 25 integer values
+  expect_output(print(x), "Array\n<int32>\n[\n", fixed = TRUE)  # class name, then the type line, then the opening bracket
+})
+
 test_that("Array supports NA", {
   x_int <- Array$create(as.integer(c(1:10, NA)))
   x_dbl <- Array$create(as.numeric(c(1:10, NA)))
@@ -257,7 +262,7 @@ test_that("array supports integer64", {
   expect_true(a$IsNull(3L))
 })
 
-test_that("array$as_vector() correctly handles all NA inte64 (ARROW-3795)", {
+test_that("array$as_vector() correctly handles all NA int64 (ARROW-3795)", {
   x <- bit64::as.integer64(NA)
   a <- Array$create(x)
   expect_true(is.na(a$as_vector()))

diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R
@@ -34,7 +34,7 @@ test_that("RecordBatch", {
     schema(
       int = int32(), dbl = float64(),
       lgl = boolean(), chr = utf8(),
-      fct = dictionary(int32(), Array$create(letters[1:10]))
+      fct = dictionary()
     )
   )
   expect_equal(batch$num_columns, 5L)
@@ -69,12 +69,12 @@ test_that("RecordBatch", {
   col_fct <- batch$column(4)
   expect_true(inherits(col_fct, 'Array'))
   expect_equal(col_fct$as_vector(), tbl$fct)
-  expect_equal(col_fct$type, dictionary(int32(), Array$create(letters[1:10])))
+  expect_equal(col_fct$type, dictionary())
 
   batch2 <- batch$RemoveColumn(0)
   expect_equal(
     batch2$schema,
-    schema(dbl = float64(), lgl = boolean(), chr = utf8(), fct = dictionary(int32(), Array$create(letters[1:10])))
+    schema(dbl = float64(), lgl = boolean(), chr = utf8(), fct = dictionary())
   )
   expect_equal(batch2$column(0), batch$column(1))
   expect_identical(as.data.frame(batch2), tbl[,-1])
@@ -120,6 +120,23 @@ test_that("head and tail on RecordBatch", {
   expect_identical(as.data.frame(tail(batch, -4)), tail(tbl, -4))
 })
 
+test_that("RecordBatch print method", {
+  expect_output(
+    print(batch),  # NOTE(review): `batch` is not created in this block - confirm it exists at file scope
+    paste(
+      "RecordBatch",  # class name line
+      "10 rows x 5 columns",  # dimensions line
+      "$int <int32>",  # then one "$name <type>" line per field
+      "$dbl <double>",
+      "$lgl <bool>",
+      "$chr <string>",
+      "$fct <dictionary<values=string, indices=int8>>",  # no "ordered" attribute expected here
+      sep = "\n"
+    ),
+    fixed = TRUE  # literal match: the expected text contains "$"
+  )
+})
+
 test_that("RecordBatch with 0 rows are supported", {
   tbl <- tibble::tibble(
     int = integer(),
@@ -139,7 +156,7 @@ test_that("RecordBatch with 0 rows are supported", {
       dbl = float64(),
       lgl = boolean(),
       chr = utf8(),
-      fct = dictionary(int32(), Array$create(c("a", "b")))
+      fct = dictionary()
     )
   )
 })

diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R
@@ -119,6 +119,23 @@ test_that("head and tail on Table", {
   expect_identical(as.data.frame(tail(tab, -4)), tail(tbl, -4))
 })
 
+test_that("Table print method", {
+  expect_output(
+    print(tab),  # NOTE(review): `tab` is not created in this block - confirm it exists at file scope
+    paste(
+      "Table",  # class name line
+      "10 rows x 5 columns",  # dimensions line
+      "$int <int32>",  # then one "$name <type>" line per field
+      "$dbl <double>",
+      "$lgl <bool>",
+      "$chr <string>",
+      "$fct <dictionary<values=string, indices=int8>>",  # no "ordered" attribute expected here
+      sep = "\n"
+    ),
+    fixed = TRUE  # literal match: the expected text contains "$"
+  )
+})
+
 test_that("table active bindings", {
   expect_identical(dim(tbl), dim(tab))
   expect_is(tab$columns, "list")