[SPARK-20493][R] De-duplicate parse logics for DDL-like type strings in R #17785
**sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala**

```diff
@@ -31,6 +31,7 @@ import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
+import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
 import org.apache.spark.sql.execution.command.ShowTablesCommand
 import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION
 import org.apache.spark.sql.types._
```
```diff
@@ -92,48 +93,8 @@ private[sql] object SQLUtils extends Logging {
     def r: Regex = new Regex(sc.parts.mkString, sc.parts.tail.map(_ => "x"): _*)
   }

-  def getSQLDataType(dataType: String): DataType = {
-    dataType match {
-      case "byte" => org.apache.spark.sql.types.ByteType
-      case "integer" => org.apache.spark.sql.types.IntegerType
-      case "float" => org.apache.spark.sql.types.FloatType
-      case "double" => org.apache.spark.sql.types.DoubleType
-      case "numeric" => org.apache.spark.sql.types.DoubleType
-      case "character" => org.apache.spark.sql.types.StringType
-      case "string" => org.apache.spark.sql.types.StringType
-      case "binary" => org.apache.spark.sql.types.BinaryType
-      case "raw" => org.apache.spark.sql.types.BinaryType
-      case "logical" => org.apache.spark.sql.types.BooleanType
-      case "boolean" => org.apache.spark.sql.types.BooleanType
-      case "timestamp" => org.apache.spark.sql.types.TimestampType
-      case "date" => org.apache.spark.sql.types.DateType
-      case r"\Aarray<(.+)${elemType}>\Z" =>
-        org.apache.spark.sql.types.ArrayType(getSQLDataType(elemType))
-      case r"\Amap<(.+)${keyType},(.+)${valueType}>\Z" =>
-        if (keyType != "string" && keyType != "character") {
-          throw new IllegalArgumentException("Key type of a map must be string or character")
-        }
-        org.apache.spark.sql.types.MapType(getSQLDataType(keyType), getSQLDataType(valueType))
-      case r"\Astruct<(.+)${fieldsStr}>\Z" =>
-        if (fieldsStr(fieldsStr.length - 1) == ',') {
-          throw new IllegalArgumentException(s"Invalid type $dataType")
-        }
-        val fields = fieldsStr.split(",")
-        val structFields = fields.map { field =>
-          field match {
-            case r"\A(.+)${fieldName}:(.+)${fieldType}\Z" =>
-              createStructField(fieldName, fieldType, true)
-
-            case _ => throw new IllegalArgumentException(s"Invalid type $dataType")
-          }
-        }
-        createStructType(structFields)
-      case _ => throw new IllegalArgumentException(s"Invalid type $dataType")
-    }
-  }
-
   def createStructField(name: String, dataType: String, nullable: Boolean): StructField = {
-    val dtObj = getSQLDataType(dataType)
+    val dtObj = CatalystSqlParser.parseDataType(dataType)
     StructField(name, dtObj, nullable)
   }
```
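For context, a minimal sketch of what the shared parser accepts, assuming a Spark 2.x `spark-catalyst` dependency on the classpath; the result comments show what Catalyst returns:

```scala
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser

// Simple names parse to the same Catalyst types the old matcher produced.
CatalystSqlParser.parseDataType("string")               // StringType
CatalystSqlParser.parseDataType("tinyint")              // ByteType (rejected by the old getSQLDataType)

// Complex types use the same DDL-like syntax the removed regexes handled.
CatalystSqlParser.parseDataType("array<int>")           // ArrayType(IntegerType, true)
CatalystSqlParser.parseDataType("map<string,double>")   // MapType(StringType, DoubleType, true)
CatalystSqlParser.parseDataType("struct<a:int,b:string>")
```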
**Reviewer:** Haven't checked myself; what are the differences, if any, between `getSQLDataType` and `CatalystSqlParser.parseDataType`?

**Reviewer:** Is it ...
**Author:** To my knowledge, `getSQLDataType` supports `byte`, `integer`, `float`, `double`, `numeric`, `character`, `string`, `binary`, `raw`, `logical`, `boolean`, `timestamp`, `date` and the `array<...>`, `map<...>` and `struct<...>` forms, and these look required to be case-sensitive, whereas `CatalystSqlParser.parseDataType` supports DDL-formatted type strings, and these look case-insensitive. I think the initial intention for `getSQLDataType` was to support R native type names; if the types are not in that list, it throws an `IllegalArgumentException`.

In short, I think there should not be a behaviour change for the types in the intersection between the two, and these should be case-sensitive. Additionally, we will support the types shown in the before/after comparison below, which previously failed.
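As a side note on the case-sensitivity point above, a small Scala sketch (again assuming `spark-catalyst` on the classpath; outputs in comments reflect Catalyst's behaviour):

```scala
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser

// The removed pattern match accepted only the exact lowercase spelling,
// e.g. "double"; the SQL parser treats type keywords case-insensitively.
CatalystSqlParser.parseDataType("double")  // DoubleType
CatalystSqlParser.parseDataType("Double")  // DoubleType
CatalystSqlParser.parseDataType("DOUBLE")  // DoubleType
```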
**Before:**

```r
> structField("_col", "tinyint")
...
Error in handleErrors(returnStatus, conn) :
  java.lang.IllegalArgumentException: Invalid type tinyint
    at org.apache.spark.sql.api.r.SQLUtils$.getSQLDataType(SQLUtils.scala:131)
    at org.apache.spark.sql.api.r.SQLUtils$.createStructField(SQLUtils.scala:136)
    at org.apache.spark.sql.api.r.SQLUtils.createStructField(SQLUtils.scala)
...
> structField("_col", "smallint")
...
Error in handleErrors(returnStatus, conn) :
  java.lang.IllegalArgumentException: Invalid type smallint
    at org.apache.spark.sql.api.r.SQLUtils$.getSQLDataType(SQLUtils.scala:131)
    at org.apache.spark.sql.api.r.SQLUtils$.createStructField(SQLUtils.scala:136)
    at org.apache.spark.sql.api.r.SQLUtils.createStructField(SQLUtils.scala)
...
> structField("_col", "int")
...
Error in handleErrors(returnStatus, conn) :
  java.lang.IllegalArgumentException: Invalid type int
    at org.apache.spark.sql.api.r.SQLUtils$.getSQLDataType(SQLUtils.scala:131)
    at org.apache.spark.sql.api.r.SQLUtils$.createStructField(SQLUtils.scala:136)
    at org.apache.spark.sql.api.r.SQLUtils.createStructField(SQLUtils.scala)
...
> structField("_col", "bigint")
...
Error in handleErrors(returnStatus, conn) :
  java.lang.IllegalArgumentException: Invalid type bigint
    at org.apache.spark.sql.api.r.SQLUtils$.getSQLDataType(SQLUtils.scala:131)
    at org.apache.spark.sql.api.r.SQLUtils$.createStructField(SQLUtils.scala:136)
    at org.apache.spark.sql.api.r.SQLUtils.createStructField(SQLUtils.scala)
...
> structField("_col", "decimal")
...
java.lang.IllegalArgumentException: Invalid type decimal
    at org.apache.spark.sql.api.r.SQLUtils$.getSQLDataType(SQLUtils.scala:131)
    at org.apache.spark.sql.api.r.SQLUtils$.createStructField(SQLUtils.scala:136)
    at org.apache.spark.sql.api.r.SQLUtils.createStructField(SQLUtils.scala)
...
```

**After:**

```r
> structField("_col", "tinyint")
StructField(name = "_col", type = "ByteType", nullable = TRUE)
> structField("_col", "smallint")
StructField(name = "_col", type = "ShortType", nullable = TRUE)
> structField("_col", "int")
StructField(name = "_col", type = "IntegerType", nullable = TRUE)
> structField("_col", "bigint")
StructField(name = "_col", type = "LongType", nullable = TRUE)
> structField("_col", "decimal")
StructField(name = "_col", type = "DecimalType(10,0)", nullable = TRUE)
```

**Author:** I just wrote the details about this at my best. Yes, I think this should be targeted to master, not 2.2.

**Reviewer:** Thanks for looking into it. If I take the diff, types such as `numeric`, `character`, `logical` and `raw` are actually R native type names, for which, if I have to guess, it is intentional that we support R native types in `structField` as well as Scala/Spark types. I'm not sure how much coverage we have for something like this, but is that going to still work with this change?

**Author:** Yea; however, for those types we can't create the field anyway, because the check via `checkType` fails, as they are not among the keys in `PRIMITIVE_TYPES`:

```r
> structField("_col", "character")
Error in checkType(type) : Unsupported type for SparkDataframe: character
> structField("_col", "logical")
Error in checkType(type) : Unsupported type for SparkDataframe: logical
> structField("_col", "numeric")
Error in checkType(type) : Unsupported type for SparkDataframe: numeric
> structField("_col", "raw")
Error in checkType(type) : Unsupported type for SparkDataframe: raw
```

I double-checked; this is the only place where we call `getSQLDataType`.

**Reviewer:** I see; then I suppose this was "broken" when `checkType` was added, over the past 2 years or so.

**Author:** Sure. Probably those 12 cases in test_sparkSQL.R#L143-L194 and the 5 cases in #17785 (comment), of the total 17 cases in `checkType` above, will be covered. Let me double-check them and leave a comment here today.

**Author:** Let me add positive cases for ... and negative cases for ...
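As a rough sketch of what such positive and negative cases exercise underneath (the R tests themselves live in `test_sparkSQL.R`; this Scala snippet only illustrates the shared parser's behaviour, and the exact error message is version-dependent):

```scala
import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException}

// Positive case: a DDL-like struct string parses through the shared parser.
val ok = CatalystSqlParser.parseDataType("struct<name:string,age:int>")

// Negative case: an invalid type string now fails with a ParseException
// from the shared parser rather than SQLUtils' IllegalArgumentException.
try {
  CatalystSqlParser.parseDataType("foobar")
} catch {
  case e: ParseException => println("rejected: " + e.getMessage)
}
```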
**Reviewer (on the diff):** indent