Skip to content

Commit

Permalink
Add Hive datatype (char/varchar) to struct field metadata. This fixes issues with char/varchar columns in ORC.
Browse files Browse the repository at this point in the history
  • Loading branch information
hvanhovell committed Feb 4, 2017
1 parent 2f3c20b commit c6a5bf6
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1457,8 +1457,31 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging {
*/
override def visitColType(ctx: ColTypeContext): StructField = withOrigin(ctx) {
  import ctx._

  val builder = new MetadataBuilder
  // Record the column comment (if one was given) in the field metadata.
  if (STRING != null) {
    builder.putString("comment", string(STRING))
  }
  // Record the raw Hive type string for CHAR/VARCHAR columns. These types have no
  // direct Spark SQL counterpart (both parse to StringType), so downstream consumers
  // (e.g. the ORC reader) need the original declaration to handle them correctly.
  dataType match {
    case p: PrimitiveDataTypeContext =>
      // NOTE(review): consider toLowerCase(Locale.ROOT) to avoid locale-sensitive
      // case folding; harmless for "char"/"varchar" but safer as a convention.
      val dt = p.identifier.getText.toLowerCase
      (dt, p.INTEGER_VALUE().asScala.toList) match {
        case ("varchar" | "char", Nil) =>
          // Unparameterized form, e.g. plain CHAR / VARCHAR.
          builder.putString(HIVE_TYPE_STRING, dt)
        case ("varchar" | "char", size :: Nil) =>
          // Parameterized form, e.g. VARCHAR(10).
          builder.putString(HIVE_TYPE_STRING, dt + "(" + size.getText + ")")
        case _ => // Not a CHAR/VARCHAR column: no Hive type string needed.
      }
    case _ => // Complex types carry no Hive type string.
  }

  StructField(
    identifier.getText,
    typedVisit(dataType),
    nullable = true,
    builder.build())
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,10 @@ package org.apache.spark.sql
* Contains a type system for attributes produced by relations, including complex types like
* structs, arrays and maps.
*/
package object types {

  /**
   * Metadata key used to store the raw Hive type string of a column. This is relevant for
   * datatypes that do not have a direct Spark SQL counterpart, such as CHAR and VARCHAR,
   * which Spark SQL parses as plain StringType.
   */
  val HIVE_TYPE_STRING = "HIVE_TYPE_STRING"
}
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,10 @@ class TableScanSuite extends DataSourceTest with SharedSQLContext {
(2 to 10).map(i => Row(i, i - 1)).toSeq)

test("Schema and all fields") {
// Builds field metadata that carries the original Hive type string for a column.
def hiveMetadata(dt: String): Metadata = {
  val builder = new MetadataBuilder
  // NOTE(review): this literal duplicates the value of types.HIVE_TYPE_STRING;
  // consider referencing the constant if it is in scope here.
  builder.putString("HIVE_TYPE_STRING", dt)
  builder.build()
}

val expectedSchema = StructType(
StructField("string$%Field", StringType, true) ::
StructField("binaryField", BinaryType, true) ::
Expand All @@ -217,8 +221,8 @@ class TableScanSuite extends DataSourceTest with SharedSQLContext {
StructField("decimalField2", DecimalType(9, 2), true) ::
StructField("dateField", DateType, true) ::
StructField("timestampField", TimestampType, true) ::
StructField("varcharField", StringType, true) ::
StructField("charField", StringType, true) ::
StructField("varcharField", StringType, true, hiveMetadata("varchar(12)")) ::
StructField("charField", StringType, true, hiveMetadata("char(18)")) ::
StructField("arrayFieldSimple", ArrayType(IntegerType), true) ::
StructField("arrayFieldComplex",
ArrayType(
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,27 @@ abstract class OrcSuite extends QueryTest with TestHiveSingleton with BeforeAndA
hiveClient.runSqlHive("DROP TABLE IF EXISTS orc_varchar")
}
}

test("read varchar column from orc tables created by hive") {
  try {
    // The resource directory contains an ORC file created with Hive 1.2.1. The DDL
    // below declares STRING, CHAR(10) and VARCHAR(10) columns over it, exercising
    // the read path for Hive char/varchar data.
    // (Fixed: original comment claimed "a single VARCHAR(10) column".)
    val hiveOrc = new File(Thread.currentThread().getContextClassLoader
      .getResource("data/files/orc/").getFile).toURI
    sql(
      s"""
         |CREATE EXTERNAL TABLE test_hive_orc(
         |  a STRING,
         |  b CHAR(10),
         |  c VARCHAR(10)
         |)
         |STORED AS ORC
         |LOCATION '$hiveOrc'
       """.stripMargin)
    // NOTE(review): Hive blank-pads CHAR(10) values; the expected "b " padding here may
    // have been collapsed by formatting — verify it is "b" followed by nine spaces.
    checkAnswer(spark.table("test_hive_orc"), Row("a", "b ", "c"))
  } finally {
    // Always drop the external table, even if the assertions above fail.
    sql("DROP TABLE IF EXISTS test_hive_orc")
  }
}
}

class OrcSourceSuite extends OrcSuite {
Expand Down

0 comments on commit c6a5bf6

Please sign in to comment.