-
Notifications
You must be signed in to change notification settings - Fork 28.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-23023][SQL] Cast field data to strings in showString #20214
Changes from 4 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -237,13 +237,18 @@ class Dataset[T] private[sql]( | |
private[sql] def showString( | ||
_numRows: Int, truncate: Int = 20, vertical: Boolean = false): String = { | ||
val numRows = _numRows.max(0).min(Int.MaxValue - 1) | ||
val takeResult = toDF().take(numRows + 1) | ||
val newDf = toDF() | ||
val castExprs = newDf.schema.map { f => f.dataType match { | ||
// Since binary types in top-level schema fields have a specific format to print, | ||
// so we do not cast them to strings here. | ||
case BinaryType => s"`${f.name}`" | ||
case _: UserDefinedType[_] => s"`${f.name}`" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I added this entry to pass the existing tests in PySpark, though we still hit weird behaviours when casting user-defined types into strings;
This cast shows the internal data structure of user-defined types. WDYT? cc: @cloud-fan @ueshin There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. How about something like: case udt: UserDefinedType[_] =>
(c, evPrim, evNull) => {
val udtTerm = ctx.addReferenceObj("udt", udt)
s"$evPrim = UTF8String.fromString($udtTerm.deserialize($c).toString());"
} There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh, yeah. I missed that. Thanks, I'll make a separate PR. |
||
case _ => s"CAST(`${f.name}` AS STRING)" | ||
}} | ||
val takeResult = newDf.selectExpr(castExprs: _*).take(numRows + 1) | ||
val hasMoreData = takeResult.length > numRows | ||
val data = takeResult.take(numRows) | ||
|
||
lazy val timeZone = | ||
DateTimeUtils.getTimeZone(sparkSession.sessionState.conf.sessionLocalTimeZone) | ||
|
||
// For array values, replace Seq and Array with square brackets | ||
// For cells that are beyond `truncate` characters, replace it with the | ||
// first `truncate-3` and "..." | ||
|
@@ -252,12 +257,6 @@ class Dataset[T] private[sql]( | |
val str = cell match { | ||
case null => "null" | ||
case binary: Array[Byte] => binary.map("%02X".format(_)).mkString("[", " ", "]") | ||
case array: Array[_] => array.mkString("[", ", ", "]") | ||
case seq: Seq[_] => seq.mkString("[", ", ", "]") | ||
case d: Date => | ||
DateTimeUtils.dateToString(DateTimeUtils.fromJavaDate(d)) | ||
case ts: Timestamp => | ||
DateTimeUtils.timestampToString(DateTimeUtils.fromJavaTimestamp(ts), timeZone) | ||
case _ => cell.toString | ||
} | ||
if (truncate > 0 && str.length > truncate) { | ||
|
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -1255,6 +1255,34 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { | |||
assert(testData.select($"*").showString(1, vertical = true) === expectedAnswer) | ||||
} | ||||
|
||||
test("SPARK-23023 Cast rows to strings in showString") { | ||||
val df1 = Seq(Seq(1, 2, 3, 4)).toDF("a") | ||||
assert(df1.showString(10) === | ||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you know why it shows There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since spark/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala Line 304 in 55dbfbc
|
||||
s"""+------------+ | ||||
|| a| | ||||
|+------------+ | ||||
||[1, 2, 3, 4]| | ||||
|+------------+ | ||||
|""".stripMargin) | ||||
val df2 = Seq(Map(1 -> "a", 2 -> "b")).toDF("a") | ||||
assert(df2.showString(10) === | ||||
s"""+----------------+ | ||||
|| a| | ||||
|+----------------+ | ||||
||[1 -> a, 2 -> b]| | ||||
|+----------------+ | ||||
|""".stripMargin) | ||||
val df3 = Seq(((1, "a"), 0), ((2, "b"), 0)).toDF("a", "b") | ||||
assert(df3.showString(10) === | ||||
s"""+------+---+ | ||||
|| a| b| | ||||
|+------+---+ | ||||
||[1, a]| 0| | ||||
||[2, b]| 0| | ||||
|+------+---+ | ||||
|""".stripMargin) | ||||
} | ||||
|
||||
test("SPARK-7327 show with empty dataFrame") { | ||||
val expectedAnswer = """+---+-----+ | ||||
||key|value| | ||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we use the DataFrame API instead? It looks more reliable here.