Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-19459][SQL] Add Hive datatype (char/varchar) to StructField metadata #16804

Closed
wants to merge 10 commits into from
Original file line number Diff line number Diff line change
Expand Up @@ -1457,8 +1457,31 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging {
*/
override def visitColType(ctx: ColTypeContext): StructField = withOrigin(ctx) {
import ctx._
val structField = StructField(identifier.getText, typedVisit(dataType), nullable = true)
if (STRING == null) structField else structField.withComment(string(STRING))

val builder = new MetadataBuilder
// Add comment to metadata
if (STRING != null) {
builder.putString("comment", string(STRING))
}
// Add Hive type string to metadata.
dataType match {
case p: PrimitiveDataTypeContext =>
val dt = p.identifier.getText.toLowerCase
(dt, p.INTEGER_VALUE().asScala.toList) match {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit:

p.identifier.getText.toLowerCase match {
  case "varchar" | "char" => builder.putString(HIVE_TYPE_STRING, dataType.getText.toLowerCase)
}

case ("varchar" | "char", Nil) =>
builder.putString(HIVE_TYPE_STRING, dt)
case ("varchar" | "char", size :: Nil) =>
builder.putString(HIVE_TYPE_STRING, dt + "(" + size.getText + ")")
case _ =>
}
case _ =>
}

StructField(
identifier.getText,
typedVisit(dataType),
nullable = true,
builder.build())
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,10 @@ package org.apache.spark.sql
* Contains a type system for attributes produced by relations, including complex types like
* structs, arrays and maps.
*/
package object types
package object types {
/**
* Metadata key used to store the Hive type name. This is relevant for datatypes that do not
* have a direct Spark SQL counterpart, such as CHAR and VARCHAR.
*/
val HIVE_TYPE_STRING = "HIVE_TYPE_STRING"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shall we remove HiveUtils.HIVE_TYPE_STRING?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah we should.

}
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,10 @@ class TableScanSuite extends DataSourceTest with SharedSQLContext {
(2 to 10).map(i => Row(i, i - 1)).toSeq)

test("Schema and all fields") {
def hiveMetadata(dt: String): Metadata = {
new MetadataBuilder().putString("HIVE_TYPE_STRING", dt).build()
}

val expectedSchema = StructType(
StructField("string$%Field", StringType, true) ::
StructField("binaryField", BinaryType, true) ::
Expand All @@ -217,8 +221,8 @@ class TableScanSuite extends DataSourceTest with SharedSQLContext {
StructField("decimalField2", DecimalType(9, 2), true) ::
StructField("dateField", DateType, true) ::
StructField("timestampField", TimestampType, true) ::
StructField("varcharField", StringType, true) ::
StructField("charField", StringType, true) ::
StructField("varcharField", StringType, true, hiveMetadata("varchar(12)")) ::
StructField("charField", StringType, true, hiveMetadata("char(18)")) ::
StructField("arrayFieldSimple", ArrayType(IntegerType), true) ::
StructField("arrayFieldComplex",
ArrayType(
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,27 @@ abstract class OrcSuite extends QueryTest with TestHiveSingleton with BeforeAndA
hiveClient.runSqlHive("DROP TABLE IF EXISTS orc_varchar")
}
}

test("read varchar column from orc tables created by hive") {
try {
Copy link
Contributor

@cloud-fan cloud-fan Feb 6, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how about

    val hiveClient = spark.sharedState.externalCatalog.asInstanceOf[HiveExternalCatalog].client
     try {
       hiveClient.runSqlHive("CREATE TABLE hive_orc(a VARCHAR(10)) STORED AS orc LOCATION xxx")
       hiveClient.runSqlHive("INSERT INTO TABLE hive_orc SELECT 'a' FROM (SELECT 1) t")
       sql("CREATE EXTERNAL TABLE spark_orc ...")
       checkAnswer...
     } finally {
        sql("DROP TABLE IF EXISTS ...")
        ...
      }

Then we don't need to create the ORC file manually.

// This is an ORC file with a single VARCHAR(10) column that's created using Hive 1.2.1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, @hvanhovell.
Nit: it's three columns.

Structure for orc/orc_text_types.orc
File Version: 0.12 with HIVE_8732
Rows: 1
Compression: ZLIB
Compression size: 262144
Type: struct<_col0:string,_col1:char(10),_col2:varchar(10)>

val hiveOrc = new File(Thread.currentThread().getContextClassLoader
.getResource(s"data/files/orc/").getFile).toURI
sql(
s"""
|CREATE EXTERNAL TABLE test_hive_orc(
| a STRING,
| b CHAR(10),
| c VARCHAR(10)
|)
|STORED AS ORC
|LOCATION '$hiveOrc'
""".stripMargin)
checkAnswer(spark.table("test_hive_orc"), Row("a", "b ", "c"))
} finally {
sql("DROP TABLE IF EXISTS test_hive_orc")
}
}
}

class OrcSourceSuite extends OrcSuite {
Expand Down