From c6a5bf60b9d92dde5be2d0b60af42acf92095aa4 Mon Sep 17 00:00:00 2001 From: Herman van Hovell Date: Sat, 4 Feb 2017 11:58:54 +0100 Subject: [PATCH] Add Hive datatype (char/varchar) to struct field metadata. This fixes issues with char/varchar columns in ORC. --- .../sql/catalyst/parser/AstBuilder.scala | 27 ++++++++++++++++-- .../org/apache/spark/sql/types/package.scala | 8 +++++- .../spark/sql/sources/TableScanSuite.scala | 8 ++++-- .../data/files/orc/orc_text_types.orc | Bin 0 -> 395 bytes .../spark/sql/hive/orc/OrcSourceSuite.scala | 21 ++++++++++++++ 5 files changed, 59 insertions(+), 5 deletions(-) create mode 100644 sql/hive/src/test/resources/data/files/orc/orc_text_types.orc diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 3969fdb0ffee5..15321146a6cc0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1457,8 +1457,31 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { */ override def visitColType(ctx: ColTypeContext): StructField = withOrigin(ctx) { import ctx._ - val structField = StructField(identifier.getText, typedVisit(dataType), nullable = true) - if (STRING == null) structField else structField.withComment(string(STRING)) + + val builder = new MetadataBuilder + // Add comment to metadata + if (STRING != null) { + builder.putString("comment", string(STRING)) + } + // Add Hive type string to metadata. + dataType match { + case p: PrimitiveDataTypeContext => + val dt = p.identifier.getText.toLowerCase + (dt, p.INTEGER_VALUE().asScala.toList) match { + case ("varchar" | "char", Nil) => + builder.putString(HIVE_TYPE_STRING, dt) + case ("varchar" | "char", size :: Nil) => + builder.putString(HIVE_TYPE_STRING, dt + "(" + size.getText + ")") + case _ => + } + case _ => + } + + StructField( + identifier.getText, + typedVisit(dataType), + nullable = true, + builder.build()) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/package.scala index 346a51ea10c82..c7936c34a3c5b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/package.scala @@ -21,4 +21,10 @@ package org.apache.spark.sql * Contains a type system for attributes produced by relations, including complex types like * structs, arrays and maps. */ -package object types +package object types { + /** + * Metadata key used to store the Hive type name. This is relevant for datatypes that do not + * have a direct Spark SQL counterpart, such as CHAR and VARCHAR. + */ + val HIVE_TYPE_STRING = "HIVE_TYPE_STRING" +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala index 86bcb4d4b00c1..a170deb8e649f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala @@ -203,6 +203,10 @@ class TableScanSuite extends DataSourceTest with SharedSQLContext { (2 to 10).map(i => Row(i, i - 1)).toSeq) test("Schema and all fields") { + def hiveMetadata(dt: String): Metadata = { + new MetadataBuilder().putString("HIVE_TYPE_STRING", dt).build() + } + val expectedSchema = StructType( StructField("string$%Field", StringType, true) :: StructField("binaryField", BinaryType, true) :: @@ -217,8 +221,8 @@ class TableScanSuite extends DataSourceTest with SharedSQLContext { StructField("decimalField2", DecimalType(9, 2), true) :: StructField("dateField", DateType, true) :: StructField("timestampField", TimestampType, true) :: - StructField("varcharField", StringType, true) :: - StructField("charField", StringType, true) :: + StructField("varcharField", StringType, true, hiveMetadata("varchar(12)")) :: + StructField("charField", StringType, true, hiveMetadata("char(18)")) :: StructField("arrayFieldSimple", ArrayType(IntegerType), true) :: StructField("arrayFieldComplex", ArrayType( diff --git a/sql/hive/src/test/resources/data/files/orc/orc_text_types.orc b/sql/hive/src/test/resources/data/files/orc/orc_text_types.orc new file mode 100644 index 0000000000000000000000000000000000000000..e27f6e5240a4a1bbbe151941de2c0c44ae199549 GIT binary patch literal 395 zcmeYdau#G@;9?VE;b074FkoPKEc84nm4Q*wj?Yb%Pw$wE;xQM_WejW#WF zKq*bVbtN?QkEMV$GBYqFvNJF^Ff?#6Fn9+wFfssnZVU^6;>i&4 zIY6V69w!JVOnC5&>0*!2rG#0YPJBMb%|Ck_3dI!8vMjlIphk_8&tN|1sr0r}#@-t^ z4GRqz425~TIjx^96H8T7fA#x`_;KrJOP4=eBF4;+05tH^gQSNEJP8jTJx&OD^dQMc zB*|w=0@Ekah8iTNE0m~k8>%&zFfd#IYDo~=aZ2Op(Stey`U{r`tT?zroV#7iMPvAZKv^s_lQx literal 0 HcmV?d00001 diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala index fe1e17dd0805a..d31d9015b2405 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala @@ -162,6 +162,27 @@ abstract class OrcSuite extends QueryTest with TestHiveSingleton with BeforeAndA hiveClient.runSqlHive("DROP TABLE IF EXISTS orc_varchar") } } + + test("read varchar column from orc tables created by hive") { + try { + // This is an ORC file with a single VARCHAR(10) column that's created using Hive 1.2.1 + val hiveOrc = new File(Thread.currentThread().getContextClassLoader + .getResource(s"data/files/orc/").getFile).toURI + sql( + s""" + |CREATE EXTERNAL TABLE test_hive_orc( + | a STRING, + | b CHAR(10), + | c VARCHAR(10) + |) + |STORED AS ORC + |LOCATION '$hiveOrc' + """.stripMargin) + checkAnswer(spark.table("test_hive_orc"), Row("a", "b ", "c")) + } finally { + sql("DROP TABLE IF EXISTS test_hive_orc") + } + } } class OrcSourceSuite extends OrcSuite {