From c6a5bf60b9d92dde5be2d0b60af42acf92095aa4 Mon Sep 17 00:00:00 2001 From: Herman van Hovell Date: Sat, 4 Feb 2017 11:58:54 +0100 Subject: [PATCH 1/7] Add Hive datatype (char/varchar) to struct field metadata. This fixes issues with char/varchar columns in ORC. --- .../sql/catalyst/parser/AstBuilder.scala | 27 ++++++++++++++++-- .../org/apache/spark/sql/types/package.scala | 8 +++++- .../spark/sql/sources/TableScanSuite.scala | 8 ++++-- .../data/files/orc/orc_text_types.orc | Bin 0 -> 395 bytes .../spark/sql/hive/orc/OrcSourceSuite.scala | 21 ++++++++++++++ 5 files changed, 59 insertions(+), 5 deletions(-) create mode 100644 sql/hive/src/test/resources/data/files/orc/orc_text_types.orc diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 3969fdb0ffee5..15321146a6cc0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1457,8 +1457,31 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { */ override def visitColType(ctx: ColTypeContext): StructField = withOrigin(ctx) { import ctx._ - val structField = StructField(identifier.getText, typedVisit(dataType), nullable = true) - if (STRING == null) structField else structField.withComment(string(STRING)) + + val builder = new MetadataBuilder + // Add comment to metadata + if (STRING != null) { + builder.putString("comment", string(STRING)) + } + // Add Hive type string to metadata. + dataType match { + case p: PrimitiveDataTypeContext => + val dt = p.identifier.getText.toLowerCase + (dt, p.INTEGER_VALUE().asScala.toList) match { + case ("varchar" | "char", Nil) => + builder.putString(HIVE_TYPE_STRING, dt) + case ("varchar" | "char", size :: Nil) => + builder.putString(HIVE_TYPE_STRING, dt + "(" + size.getText + ")") + case _ => + } + case _ => + } + + StructField( + identifier.getText, + typedVisit(dataType), + nullable = true, + builder.build()) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/package.scala index 346a51ea10c82..c7936c34a3c5b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/package.scala @@ -21,4 +21,10 @@ package org.apache.spark.sql * Contains a type system for attributes produced by relations, including complex types like * structs, arrays and maps. */ -package object types +package object types { + /** + * Metadata key used to store the Hive type name. This is relevant for datatypes that do not + * have a direct Spark SQL counterpart, such as CHAR and VARCHAR. 
+ */ + val HIVE_TYPE_STRING = "HIVE_TYPE_STRING" +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala index 86bcb4d4b00c1..a170deb8e649f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala @@ -203,6 +203,10 @@ class TableScanSuite extends DataSourceTest with SharedSQLContext { (2 to 10).map(i => Row(i, i - 1)).toSeq) test("Schema and all fields") { + def hiveMetadata(dt: String): Metadata = { + new MetadataBuilder().putString("HIVE_TYPE_STRING", dt).build() + } + val expectedSchema = StructType( StructField("string$%Field", StringType, true) :: StructField("binaryField", BinaryType, true) :: @@ -217,8 +221,8 @@ class TableScanSuite extends DataSourceTest with SharedSQLContext { StructField("decimalField2", DecimalType(9, 2), true) :: StructField("dateField", DateType, true) :: StructField("timestampField", TimestampType, true) :: - StructField("varcharField", StringType, true) :: - StructField("charField", StringType, true) :: + StructField("varcharField", StringType, true, hiveMetadata("varchar(12)")) :: + StructField("charField", StringType, true, hiveMetadata("char(18)")) :: StructField("arrayFieldSimple", ArrayType(IntegerType), true) :: StructField("arrayFieldComplex", ArrayType( diff --git a/sql/hive/src/test/resources/data/files/orc/orc_text_types.orc b/sql/hive/src/test/resources/data/files/orc/orc_text_types.orc new file mode 100644 index 0000000000000000000000000000000000000000..e27f6e5240a4a1bbbe151941de2c0c44ae199549 GIT binary patch literal 395 zcmeYdau#G@;9?VE;b074FkoPKEc84nm4Q*wj?Yb%Pw$wE;xQM_WejW#WF zKq*bVbtN?QkEMV$GBYqFvNJF^Ff?#6Fn9+wFfssnZVU^6;>i&4 zIY6V69w!JVOnC5&>0*!2rG#0YPJBMb%|Ck_3dI!8vMjlIphk_8&tN|1sr0r}#@-t^ z4GRqz425~TIjx^96H8T7fA#x`_;KrJOP4=eBF4;+05tH^gQSNEJP8jTJx&OD^dQMc zB*|w=0@Ekah8iTNE0m~k8>%&zFfd#IYDo~=aZ2Op(Stey`U{r`tT?zroV#7iMPvAZKv^s_lQx literal 0 HcmV?d00001 diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala index fe1e17dd0805a..d31d9015b2405 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala @@ -162,6 +162,27 @@ abstract class OrcSuite extends QueryTest with TestHiveSingleton with BeforeAndA hiveClient.runSqlHive("DROP TABLE IF EXISTS orc_varchar") } } + + test("read varchar column from orc tables created by hive") { + try { + // This is an ORC file with a single VARCHAR(10) column that's created using Hive 1.2.1 + val hiveOrc = new File(Thread.currentThread().getContextClassLoader + .getResource(s"data/files/orc/").getFile).toURI + sql( + s""" + |CREATE EXTERNAL TABLE test_hive_orc( + | a STRING, + | b CHAR(10), + | c VARCHAR(10) + |) + |STORED AS ORC + |LOCATION '$hiveOrc' + """.stripMargin) + checkAnswer(spark.table("test_hive_orc"), Row("a", "b ", "c")) + } finally { + sql("DROP TABLE IF EXISTS test_hive_orc") + } + } } class OrcSourceSuite extends OrcSuite { From 277ed15ff48779c271cea585399957ce67d7caa6 Mon Sep 17 00:00:00 2001 From: Herman van Hovell Date: Sun, 5 Feb 2017 17:05:40 +0100 Subject: [PATCH 2/7] Update comment. 
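Context for the series: after the previous patch, a char/varchar column parsed
from DDL keeps its raw Hive type in the StructField metadata while its Spark
SQL type stays StringType. A minimal sketch of the resulting field (built by
hand here for illustration; only HIVE_TYPE_STRING and the parser behaviour
come from the patch):

    import org.apache.spark.sql.types._

    // What `c VARCHAR(10)` in a CREATE TABLE column list now parses to:
    val field = StructField(
      name = "c",
      dataType = StringType,   // varchar is still surfaced as a string
      nullable = true,
      metadata = new MetadataBuilder()
        .putString(HIVE_TYPE_STRING, "varchar(10)")   // raw type preserved
        .build())

    assert(field.metadata.getString(HIVE_TYPE_STRING) == "varchar(10)")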
--- .../scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala index d31d9015b2405..b8be2985c3784 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala @@ -165,7 +165,8 @@ abstract class OrcSuite extends QueryTest with TestHiveSingleton with BeforeAndA test("read varchar column from orc tables created by hive") { try { - // This is an ORC file with a single VARCHAR(10) column that's created using Hive 1.2.1 + // This is an ORC file with a STRING, a CHAR(10) and a VARCHAR(10) column that has been + // created using Hive 1.2.1 val hiveOrc = new File(Thread.currentThread().getContextClassLoader .getResource(s"data/files/orc/").getFile).toURI sql( From 64c37e031c9da86a6ea70ef6e00968790f961fe4 Mon Sep 17 00:00:00 2001 From: Herman van Hovell Date: Tue, 7 Feb 2017 16:59:44 +0100 Subject: [PATCH 3/7] Code Review: Make changes more consistent, and generate our own test data. --- .../sql/catalyst/parser/AstBuilder.scala | 9 ++-- .../org/apache/spark/sql/types/package.scala | 6 ++- .../spark/sql/sources/TableScanSuite.scala | 2 +- .../org/apache/spark/sql/hive/HiveUtils.scala | 8 ---- .../spark/sql/hive/MetastoreRelation.scala | 6 +-- .../sql/hive/client/HiveClientImpl.scala | 8 ++-- .../org/apache/spark/sql/hive/hiveUDFs.scala | 3 ++ .../data/files/orc/orc_text_types.orc | Bin 395 -> 0 bytes .../spark/sql/hive/orc/OrcSourceSuite.scala | 44 +++++++++++------- 9 files changed, 46 insertions(+), 40 deletions(-) delete mode 100644 sql/hive/src/test/resources/data/files/orc/orc_text_types.orc diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index 15321146a6cc0..bb07558c814d2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -1466,12 +1466,9 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { // Add Hive type string to metadata. dataType match { case p: PrimitiveDataTypeContext => - val dt = p.identifier.getText.toLowerCase - (dt, p.INTEGER_VALUE().asScala.toList) match { - case ("varchar" | "char", Nil) => - builder.putString(HIVE_TYPE_STRING, dt) - case ("varchar" | "char", size :: Nil) => - builder.putString(HIVE_TYPE_STRING, dt + "(" + size.getText + ")") + p.identifier.getText.toLowerCase match { + case "varchar" | "char" => + builder.putString(HIVE_TYPE_STRING, dataType.getText.toLowerCase) case _ => } case _ => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/package.scala index c7936c34a3c5b..1f3de962781f9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/package.scala @@ -23,8 +23,10 @@ package org.apache.spark.sql */ package object types { /** - * Metadata key used to store the Hive type name. This is relevant for datatypes that do not - * have a direct Spark SQL counterpart, such as CHAR and VARCHAR. + * Metadata key used to store the the raw hive type string in the metadata of StructField. 
This + * is relevant for datatypes that do not have a direct Spark SQL counterpart, such as CHAR and + * VARCHAR. We need to preserve the original type in order to invoke the correct object + * inspector in Hive. */ val HIVE_TYPE_STRING = "HIVE_TYPE_STRING" } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala index a170deb8e649f..b01d15eb917e2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala @@ -204,7 +204,7 @@ class TableScanSuite extends DataSourceTest with SharedSQLContext { test("Schema and all fields") { def hiveMetadata(dt: String): Metadata = { - new MetadataBuilder().putString("HIVE_TYPE_STRING", dt).build() + new MetadataBuilder().putString(HIVE_TYPE_STRING, dt).build() } val expectedSchema = StructType( diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala index 26b1994308f5d..81cd65c3cc337 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala @@ -54,14 +54,6 @@ private[spark] object HiveUtils extends Logging { /** The version of hive used internally by Spark SQL. */ val hiveExecutionVersion: String = "1.2.1" - /** - * The property key that is used to store the raw hive type string in the metadata of StructField. - * For example, in the case where the Hive type is varchar, the type gets mapped to a string type - * in Spark SQL, but we need to preserve the original type in order to invoke the correct object - * inspector in Hive. - */ - val hiveTypeString: String = "HIVE_TYPE_STRING" - val HIVE_METASTORE_VERSION = SQLConfigBuilder("spark.sql.hive.metastore.version") .doc("Version of the Hive metastore. 
Available options are " + s"0.12.0 through $hiveExecutionVersion.") diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala index 346757c2047a7..3e8dfcbc5d5e7 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala @@ -36,7 +36,7 @@ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.expressions.{AttributeMap, AttributeReference, Expression} import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} import org.apache.spark.sql.execution.FileRelation -import org.apache.spark.sql.types.StructField +import org.apache.spark.sql.types._ private[hive] case class MetastoreRelation( @@ -61,8 +61,8 @@ private[hive] case class MetastoreRelation( override protected def otherCopyArgs: Seq[AnyRef] = catalogTable :: sparkSession :: Nil private def toHiveColumn(c: StructField): FieldSchema = { - val typeString = if (c.metadata.contains(HiveUtils.hiveTypeString)) { - c.metadata.getString(HiveUtils.hiveTypeString) + val typeString = if (c.metadata.contains(HIVE_TYPE_STRING)) { + c.metadata.getString(HIVE_TYPE_STRING) } else { c.dataType.catalogString } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index bf703a5ab6e60..f0d01ebfcfb09 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -47,7 +47,7 @@ import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException} import org.apache.spark.sql.execution.QueryExecutionException import org.apache.spark.sql.hive.HiveUtils -import org.apache.spark.sql.types.{MetadataBuilder, StructField, StructType} +import org.apache.spark.sql.types._ import org.apache.spark.util.{CircularBuffer, Utils} /** @@ -790,8 +790,8 @@ private[hive] class HiveClientImpl( .asInstanceOf[Class[_ <: org.apache.hadoop.hive.ql.io.HiveOutputFormat[_, _]]] private def toHiveColumn(c: StructField): FieldSchema = { - val typeString = if (c.metadata.contains(HiveUtils.hiveTypeString)) { - c.metadata.getString(HiveUtils.hiveTypeString) + val typeString = if (c.metadata.contains(HIVE_TYPE_STRING)) { + c.metadata.getString(HIVE_TYPE_STRING) } else { c.dataType.catalogString } @@ -806,7 +806,7 @@ private[hive] class HiveClientImpl( throw new SparkException("Cannot recognize hive type string: " + hc.getType, e) } - val metadata = new MetadataBuilder().putString(HiveUtils.hiveTypeString, hc.getType).build() + val metadata = new MetadataBuilder().putString(HIVE_TYPE_STRING, hc.getType).build() val field = StructField( name = hc.getName, dataType = columnType, diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala index 4590197548104..c830edf60bee8 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala @@ -51,6 +51,9 @@ private[hive] case class HiveSimpleUDF( @transient lazy val function = funcWrapper.createFunction[UDF]() + { + function + } @transient private lazy val method = 
function.getResolver.getEvalMethod(children.map(_.dataType.toTypeInfo).asJava) diff --git a/sql/hive/src/test/resources/data/files/orc/orc_text_types.orc b/sql/hive/src/test/resources/data/files/orc/orc_text_types.orc deleted file mode 100644 index e27f6e5240a4a1bbbe151941de2c0c44ae199549..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 395 zcmeYdau#G@;9?VE;b074FkoPKEc84nm4Q*wj?Yb%Pw$wE;xQM_WejW#WF zKq*bVbtN?QkEMV$GBYqFvNJF^Ff?#6Fn9+wFfssnZVU^6;>i&4 zIY6V69w!JVOnC5&>0*!2rG#0YPJBMb%|Ck_3dI!8vMjlIphk_8&tN|1sr0r}#@-t^ z4GRqz425~TIjx^96H8T7fA#x`_;KrJOP4=eBF4;+05tH^gQSNEJP8jTJx&OD^dQMc zB*|w=0@Ekah8iTNE0m~k8>%&zFfd#IYDo~=aZ2Op(Stey`U{r`tT?zroV#7iMPvAZKv^s_lQx diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala index b8be2985c3784..09a78f7692415 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala @@ -163,25 +163,37 @@ abstract class OrcSuite extends QueryTest with TestHiveSingleton with BeforeAndA } } - test("read varchar column from orc tables created by hive") { + test("SPARK-19459: read char/varchar column written by Hive") { + val hiveClient = spark.sharedState.externalCatalog.asInstanceOf[HiveExternalCatalog].client + val location = Utils.createTempDir().toURI try { - // This is an ORC file with a STRING, a CHAR(10) and a VARCHAR(10) column that has been - // created using Hive 1.2.1 - val hiveOrc = new File(Thread.currentThread().getContextClassLoader - .getResource(s"data/files/orc/").getFile).toURI - sql( + hiveClient.runSqlHive( + """ + |CREATE EXTERNAL TABLE hive_orc( + | a STRING, + | b CHAR(10), + | c VARCHAR(10)) + |STORED AS orc""".stripMargin) + // Hive throws an exception if I assign the location in the create table statment. + hiveClient.runSqlHive( + s"ALTER TABLE hive_orc SET LOCATION '$location'") + hiveClient.runSqlHive( + "INSERT INTO TABLE hive_orc SELECT 'a', 'b', 'c' FROM (SELECT 1) t") + + // We create a different table in Spark using the same schema which points to + // the same location. 
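+      // Note that Hive pads CHAR(10) values with spaces to their declared
+      // length, which is why the expected answer below is "b" plus nine spaces.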
+ spark.sql( s""" - |CREATE EXTERNAL TABLE test_hive_orc( - | a STRING, - | b CHAR(10), - | c VARCHAR(10) - |) - |STORED AS ORC - |LOCATION '$hiveOrc' - """.stripMargin) - checkAnswer(spark.table("test_hive_orc"), Row("a", "b ", "c")) + |CREATE EXTERNAL TABLE spark_orc( + | a STRING, + | b CHAR(10), + | c VARCHAR(10)) + |STORED AS orc + |LOCATION '$location'""".stripMargin) + checkAnswer(spark.table("spark_orc"), Row("a", "b ", "c")) } finally { - sql("DROP TABLE IF EXISTS test_hive_orc") + hiveClient.runSqlHive("DROP TABLE IF EXISTS hive_orc") + hiveClient.runSqlHive("DROP TABLE IF EXISTS spark_orc") } } } From f42348a826692f7971ec4f95cc1ae3f293621ed5 Mon Sep 17 00:00:00 2001 From: Herman van Hovell Date: Tue, 7 Feb 2017 18:17:54 +0100 Subject: [PATCH 4/7] Fix after merge --- .../main/scala/org/apache/spark/sql/hive/HiveUtils.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala index fc6680eb897a3..50c1dc4b77a77 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala @@ -457,8 +457,8 @@ private[spark] object HiveUtils extends Logging { /** Converts the native StructField to Hive's FieldSchema. */ private def toHiveColumn(c: StructField): FieldSchema = { - val typeString = if (c.metadata.contains(HiveUtils.hiveTypeString)) { - c.metadata.getString(HiveUtils.hiveTypeString) + val typeString = if (c.metadata.contains(HIVE_TYPE_STRING)) { + c.metadata.getString(HIVE_TYPE_STRING) } else { c.dataType.catalogString } @@ -474,7 +474,7 @@ private[spark] object HiveUtils extends Logging { throw new SparkException("Cannot recognize hive type string: " + hc.getType, e) } - val metadata = new MetadataBuilder().putString(HiveUtils.hiveTypeString, hc.getType).build() + val metadata = new MetadataBuilder().putString(HIVE_TYPE_STRING, hc.getType).build() val field = StructField( name = hc.getName, dataType = columnType, From 673168ee46c020dd894758a13560e7e60eea8ed8 Mon Sep 17 00:00:00 2001 From: Herman van Hovell Date: Wed, 8 Feb 2017 16:05:15 +0100 Subject: [PATCH 5/7] Code Review. --- .../org/apache/spark/sql/types/package.scala | 2 +- .../org/apache/spark/sql/hive/hiveUDFs.scala | 3 --- .../spark/sql/hive/orc/OrcSourceSuite.scala | 19 +++++-------------- 3 files changed, 6 insertions(+), 18 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/package.scala index 1f3de962781f9..f29cbc2069e39 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/package.scala @@ -23,7 +23,7 @@ package org.apache.spark.sql */ package object types { /** - * Metadata key used to store the the raw hive type string in the metadata of StructField. This + * Metadata key used to store the raw hive type string in the metadata of StructField. This * is relevant for datatypes that do not have a direct Spark SQL counterpart, such as CHAR and * VARCHAR. We need to preserve the original type in order to invoke the correct object * inspector in Hive. 
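(Why the raw type string matters: it is what lets a StructField select the
char/varchar object inspector on the way back into Hive. Below is a sketch of
the lookup that the toHiveColumn implementations above perform, and of the
Hive-side round trip it enables; it assumes Hive's serde2 TypeInfoUtils is on
the classpath and behaves as described.)

    import org.apache.hadoop.hive.serde2.typeinfo.{CharTypeInfo, TypeInfoUtils}
    import org.apache.spark.sql.types._

    // Prefer the preserved Hive type; fall back to Spark's catalog string.
    def rawHiveType(c: StructField): String =
      if (c.metadata.contains(HIVE_TYPE_STRING)) c.metadata.getString(HIVE_TYPE_STRING)
      else c.dataType.catalogString

    // "char(10)" resolves to a CharTypeInfo of length 10, preserving padding
    // semantics; a bare "string" would lose the declared length.
    val info = TypeInfoUtils.getTypeInfoFromTypeString("char(10)")
    assert(info.asInstanceOf[CharTypeInfo].getLength == 10)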
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala index c830edf60bee8..4590197548104 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala @@ -51,9 +51,6 @@ private[hive] case class HiveSimpleUDF( @transient lazy val function = funcWrapper.createFunction[UDF]() - { - function - } @transient private lazy val method = function.getResolver.getEvalMethod(children.map(_.dataType.toTypeInfo).asJava) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala index 09a78f7692415..a05b29c579b34 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala @@ -152,18 +152,7 @@ abstract class OrcSuite extends QueryTest with TestHiveSingleton with BeforeAndA assert(new OrcOptions(Map("Orc.Compress" -> "NONE")).compressionCodec == "NONE") } - test("SPARK-18220: read Hive orc table with varchar column") { - val hiveClient = spark.sharedState.externalCatalog.asInstanceOf[HiveExternalCatalog].client - try { - hiveClient.runSqlHive("CREATE TABLE orc_varchar(a VARCHAR(10)) STORED AS orc") - hiveClient.runSqlHive("INSERT INTO TABLE orc_varchar SELECT 'a' FROM (SELECT 1) t") - checkAnswer(spark.table("orc_varchar"), Row("a")) - } finally { - hiveClient.runSqlHive("DROP TABLE IF EXISTS orc_varchar") - } - } - - test("SPARK-19459: read char/varchar column written by Hive") { + test("SPARK-19459/SPARK-18220: read char/varchar column written by Hive") { val hiveClient = spark.sharedState.externalCatalog.asInstanceOf[HiveExternalCatalog].client val location = Utils.createTempDir().toURI try { @@ -174,7 +163,7 @@ abstract class OrcSuite extends QueryTest with TestHiveSingleton with BeforeAndA | b CHAR(10), | c VARCHAR(10)) |STORED AS orc""".stripMargin) - // Hive throws an exception if I assign the location in the create table statment. + // Hive throws an exception if I assign the location in the create table statement. 
hiveClient.runSqlHive( s"ALTER TABLE hive_orc SET LOCATION '$location'") hiveClient.runSqlHive( @@ -190,7 +179,9 @@ abstract class OrcSuite extends QueryTest with TestHiveSingleton with BeforeAndA | c VARCHAR(10)) |STORED AS orc |LOCATION '$location'""".stripMargin) - checkAnswer(spark.table("spark_orc"), Row("a", "b ", "c")) + val result = Row("a", "b ", "c") + checkAnswer(spark.table("hive_orc"), result) + checkAnswer(spark.table("spark_orc"), result) } finally { hiveClient.runSqlHive("DROP TABLE IF EXISTS hive_orc") hiveClient.runSqlHive("DROP TABLE IF EXISTS spark_orc") From e7ca0ead843f2c9650e690fe649be18fa6389e48 Mon Sep 17 00:00:00 2001 From: Herman van Hovell Date: Wed, 8 Feb 2017 16:06:46 +0100 Subject: [PATCH 6/7] Revert change to MetastoreRelation --- .../scala/org/apache/spark/sql/hive/MetastoreRelation.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala index 10380279916dc..6394eb6da5173 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/MetastoreRelation.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.expressions.{AttributeMap, AttributeReference, Expression} import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} import org.apache.spark.sql.execution.FileRelation -import org.apache.spark.sql.types._ +import org.apache.spark.sql.types.StructField private[hive] case class MetastoreRelation( From 21be4ca7f4fb987ab467c6bcbed2db10e0cdbeb9 Mon Sep 17 00:00:00 2001 From: Herman van Hovell Date: Thu, 9 Feb 2017 16:25:37 +0100 Subject: [PATCH 7/7] Remove temp file --- .../org/apache/spark/sql/hive/orc/OrcSourceSuite.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala index a05b29c579b34..59ea8916efae9 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala @@ -154,7 +154,8 @@ abstract class OrcSuite extends QueryTest with TestHiveSingleton with BeforeAndA test("SPARK-19459/SPARK-18220: read char/varchar column written by Hive") { val hiveClient = spark.sharedState.externalCatalog.asInstanceOf[HiveExternalCatalog].client - val location = Utils.createTempDir().toURI + val location = Utils.createTempDir() + val uri = location.toURI try { hiveClient.runSqlHive( """ @@ -165,7 +166,7 @@ abstract class OrcSuite extends QueryTest with TestHiveSingleton with BeforeAndA |STORED AS orc""".stripMargin) // Hive throws an exception if I assign the location in the create table statement. 
hiveClient.runSqlHive( - s"ALTER TABLE hive_orc SET LOCATION '$location'") + s"ALTER TABLE hive_orc SET LOCATION '$uri'") hiveClient.runSqlHive( "INSERT INTO TABLE hive_orc SELECT 'a', 'b', 'c' FROM (SELECT 1) t") @@ -178,13 +179,14 @@ abstract class OrcSuite extends QueryTest with TestHiveSingleton with BeforeAndA | b CHAR(10), | c VARCHAR(10)) |STORED AS orc - |LOCATION '$location'""".stripMargin) + |LOCATION '$uri'""".stripMargin) val result = Row("a", "b ", "c") checkAnswer(spark.table("hive_orc"), result) checkAnswer(spark.table("spark_orc"), result) } finally { hiveClient.runSqlHive("DROP TABLE IF EXISTS hive_orc") hiveClient.runSqlHive("DROP TABLE IF EXISTS spark_orc") + Utils.deleteRecursively(location) } } }
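Usage sketch for the series as a whole (illustrative, not part of any patch):
with all seven patches applied, a Hive-written ORC char/varchar table reads
back through Spark with Hive's padding semantics intact. This assumes a
Hive-enabled session and the hive_orc table created in the test above:

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder()
      .enableHiveSupport()
      .getOrCreate()

    // CHAR(10) values come back right-padded to the declared length;
    // VARCHAR(10) values do not.
    val row = spark.table("hive_orc").head()
    assert(row.getString(1) == "b" + " " * 9)   // the padded CHAR(10) cell
    assert(row.getString(2) == "c")             // VARCHAR stays unpadded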