Merge branch 'develop' into feature/259-spark-3.2
Showing 27 changed files with 1,248 additions and 26 deletions.
15 changes: 15 additions & 0 deletions
...or-default/src/main/resources/META-INF/services/za.co.absa.abris.avro.sql.SchemaConverter
@@ -0,0 +1,15 @@
#
# Copyright 2018 ABSA Group Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
za.co.absa.hyperdrive.ingestor.implementation.transformer.avro.confluent.AdvancedAvroToSparkConverter
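
This registration file follows the standard META-INF/services convention, so the new converter can be discovered at runtime and selected by its short name. Below is a minimal sketch of such a lookup via java.util.ServiceLoader; the exact resolution mechanism lives inside AbRiS and is not shown in this diff, so treat this as an illustration of the convention rather than the library's actual code path.

import java.util.ServiceLoader
import scala.collection.JavaConverters._
import za.co.absa.abris.avro.sql.SchemaConverter

// Enumerate every SchemaConverter registered via META-INF/services
// and pick the one whose shortName matches "advanced".
val advanced: Option[SchemaConverter] =
  ServiceLoader.load(classOf[SchemaConverter]).asScala
    .find(_.shortName == "advanced")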
135 changes: 135 additions & 0 deletions
...ive/ingestor/implementation/transformer/avro/confluent/AdvancedAvroToSparkConverter.scala
@@ -0,0 +1,135 @@
/*
 * Copyright 2018 ABSA Group Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package za.co.absa.hyperdrive.ingestor.implementation.transformer.avro.confluent

import org.apache.avro.Schema
import org.apache.avro.Schema.Type._
import org.apache.avro.util.internal.JacksonUtils
import org.apache.spark.sql.avro.SchemaConverters
import org.apache.spark.sql.types._
import org.codehaus.jackson.map.ObjectMapper
import za.co.absa.abris.avro.sql.SchemaConverter
import za.co.absa.hyperdrive.ingestor.implementation.transformer.avro.confluent.SparkMetadataKeys._

import java.io.ByteArrayOutputStream
import scala.collection.JavaConverters._

// scalastyle:off
class AdvancedAvroToSparkConverter extends SchemaConverter {
  override val shortName: String = AdvancedAvroToSparkConverter.name
  private lazy val objectMapper = new ObjectMapper()

  case class SchemaType(dataType: DataType, nullable: Boolean, avroType: Option[Schema])

  /**
   * This function takes an avro schema and returns a sql schema.
   */
  override def toSqlType(avroSchema: Schema): DataType = {
    toSqlTypeHelper(avroSchema, Set.empty).dataType
  }

  def toSqlTypeHelper(avroSchema: Schema, existingRecordNames: Set[String]): SchemaType = {
    avroSchema.getType match {
      case RECORD =>
        if (existingRecordNames.contains(avroSchema.getFullName)) {
          throw new IncompatibleSchemaException(s"""
            |Found recursive reference in Avro schema, which can not be processed by Spark:
            |${avroSchema.toString(true)}
          """.stripMargin)
        }
        val newRecordNames = existingRecordNames + avroSchema.getFullName
        val fields = avroSchema.getFields.asScala.map { f =>
          val metadataBuilder = new MetadataBuilder()
          val defaultJsonOpt = Option(JacksonUtils.toJsonNode(f.defaultVal()))
          val metadataBuilderWithDefault = defaultJsonOpt match {
            case Some(defaultJson) =>
              val baos = new ByteArrayOutputStream()
              objectMapper.writeValue(baos, defaultJson)
              val r = metadataBuilder.putString(DefaultValueKey, baos.toString)
              baos.close()
              r
            case None => metadataBuilder
          }

          val schemaType = toSqlTypeHelper(f.schema(), newRecordNames)
          schemaType.avroType
            .map(_.toString)
            .map(schema => metadataBuilderWithDefault.putString(AvroTypeKey, schema).build())
            .map(metadata => StructField(f.name, schemaType.dataType, schemaType.nullable, metadata))
            .getOrElse(StructField(f.name, schemaType.dataType, schemaType.nullable, metadataBuilderWithDefault.build()))
        }

        SchemaType(StructType(fields), nullable = false, None)

      case ARRAY =>
        val schemaType = toSqlTypeHelper(avroSchema.getElementType, existingRecordNames)
        SchemaType(
          ArrayType(schemaType.dataType, containsNull = schemaType.nullable),
          nullable = false,
          schemaType.avroType)

      case MAP =>
        val schemaType = toSqlTypeHelper(avroSchema.getValueType, existingRecordNames)
        SchemaType(
          MapType(StringType, schemaType.dataType, valueContainsNull = schemaType.nullable),
          nullable = false,
          schemaType.avroType)

      case UNION =>
        if (avroSchema.getTypes.asScala.exists(_.getType == NULL)) {
          // In case of a union with null, eliminate it and make a recursive call
          val remainingUnionTypes = avroSchema.getTypes.asScala.filterNot(_.getType == NULL)
          if (remainingUnionTypes.size == 1) {
            toSqlTypeHelper(remainingUnionTypes.head, existingRecordNames).copy(nullable = true)
          } else {
            toSqlTypeHelper(Schema.createUnion(remainingUnionTypes.asJava), existingRecordNames)
              .copy(nullable = true)
          }
        } else avroSchema.getTypes.asScala.map(_.getType) match {
          case Seq(t1) =>
            toSqlTypeHelper(avroSchema.getTypes.get(0), existingRecordNames)
          case Seq(t1, t2) if Set(t1, t2) == Set(INT, LONG) =>
            SchemaType(LongType, nullable = false, Option(avroSchema))
          case Seq(t1, t2) if Set(t1, t2) == Set(FLOAT, DOUBLE) =>
            SchemaType(DoubleType, nullable = false, Option(avroSchema))
          case _ =>
            // Convert complex unions to struct types where field names are member0, member1, etc.
            // This is consistent with the behavior when converting between Avro and Parquet.
            val fields = avroSchema.getTypes.asScala.zipWithIndex.map {
              case (s, i) =>
                val schemaType = toSqlTypeHelper(s, existingRecordNames)
                schemaType.avroType
                  .map(_.toString)
                  .map(schema => new MetadataBuilder().putString(AvroTypeKey, schema).build())
                  .map(metadata => StructField(s"member$i", schemaType.dataType, schemaType.nullable, metadata))
                  // All fields are nullable because only one of them is set at a time
                  .getOrElse(StructField(s"member$i", schemaType.dataType, nullable = true))
            }

            SchemaType(StructType(fields), nullable = false, None)
        }

      case _ =>
        val originalSchemaType = SchemaConverters.toSqlType(avroSchema)
        SchemaType(originalSchemaType.dataType, originalSchemaType.nullable, Option(avroSchema))
    }
  }
}
// scalastyle:on

object AdvancedAvroToSparkConverter {
  val name = "advanced"
}
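
Compared to Spark's stock SchemaConverters.toSqlType, this converter additionally records each field's original Avro type and default value in the StructField metadata (under AvroTypeKey and DefaultValueKey from SparkMetadataKeys), which is what allows the companion converter below to reconstruct the original Avro schema. A minimal sketch of how it might be exercised; the example schema here is hypothetical, not taken from this commit:

import org.apache.avro.Schema
import org.apache.spark.sql.types.StructType

// Hypothetical Avro record: a nullable int field with a null default.
val avroSchema = new Schema.Parser().parse(
  """{"type": "record", "name": "Example", "fields": [
    |  {"name": "value", "type": ["null", "int"], "default": null}
    |]}""".stripMargin)

val sparkType = new AdvancedAvroToSparkConverter().toSqlType(avroSchema)

// Each StructField carries the original Avro type and default value
// in its metadata, keyed by AvroTypeKey and DefaultValueKey.
sparkType.asInstanceOf[StructType].fields.foreach { f =>
  println(s"${f.name}: ${f.dataType}, metadata = ${f.metadata.json}")
}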
131 changes: 131 additions & 0 deletions
...ive/ingestor/implementation/transformer/avro/confluent/AdvancedSparkToAvroConverter.scala
@@ -0,0 +1,131 @@
/*
 * Copyright 2018 ABSA Group Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package za.co.absa.hyperdrive.ingestor.implementation.transformer.avro.confluent

import org.apache.avro.LogicalTypes.TimestampMillis
import org.apache.avro.Schema.Type._
import org.apache.avro.util.internal.JacksonUtils
import org.apache.avro.{JsonProperties, LogicalTypes, Schema, SchemaBuilder}
import org.apache.spark.sql.avro.SchemaConverters
import org.apache.spark.sql.types.Decimal.minBytesForPrecision
import org.apache.spark.sql.types._
import org.codehaus.jackson.map.ObjectMapper

import java.util.Objects
import scala.util.Try
import za.co.absa.hyperdrive.ingestor.implementation.transformer.avro.confluent.SparkMetadataKeys._

object AdvancedSparkToAvroConverter extends SparkToAvroConverter {
  private lazy val nullSchema = Schema.create(Schema.Type.NULL)
  private lazy val objectMapper = new ObjectMapper()

  override def apply(catalystType: DataType, nullable: Boolean, recordName: String, nameSpace: String): Schema =
    toAvroType(catalystType, None, nullable, None, recordName, nameSpace)

  // scalastyle:off
  private def toAvroType(
    catalystType: DataType,
    avroSchema: Option[Schema],
    nullable: Boolean = false,
    defaultValue: Option[Object] = None,
    recordName: String = "topLevelRecord",
    nameSpace: String = ""): Schema = {
    val builder = SchemaBuilder.builder()

    val schema = catalystType match {
      case TimestampType => avroSchema match {
        case Some(schema) if schema.getLogicalType.isInstanceOf[TimestampMillis] =>
          LogicalTypes.timestampMillis().addToSchema(builder.longType())
        case _ => LogicalTypes.timestampMicros().addToSchema(builder.longType())
      }
      case d: DecimalType => avroSchema match {
        case Some(schema) if schema.getType == BYTES =>
          val avroType = LogicalTypes.decimal(d.precision, d.scale)
          avroType.addToSchema(SchemaBuilder.builder().bytesType())
        case _ => getDecimalFixedType(d, avroSchema, nameSpace, recordName)
      }
      case BinaryType => avroSchema match {
        case Some(schema) if schema.getType == FIXED =>
          val name = getFixedName(recordName, nameSpace)
          builder
            .fixed(name)
            .size(schema.getFixedSize)
        case _ => builder.bytesType()
      }
      case ArrayType(et, containsNull) =>
        builder.array()
          .items(toAvroType(et, avroSchema, containsNull, defaultValue, recordName, nameSpace))
      case MapType(StringType, vt, valueContainsNull) =>
        builder.map()
          .values(toAvroType(vt, avroSchema, valueContainsNull, defaultValue, recordName, nameSpace))
      case st: StructType =>
        val childNameSpace = if (nameSpace != "") s"$nameSpace.$recordName" else recordName
        val fieldsAssembler = builder.record(recordName).namespace(nameSpace).fields()
        st.foreach { f =>
          val schema = Try(f.metadata.getString(AvroTypeKey)).toOption
            .map(schema => new Schema.Parser().parse(schema))
          val defaultValueOpt = Try(f.metadata.getString(DefaultValueKey))
            .flatMap(defaultJsonString => Try {
              val jsonNode = objectMapper.readTree(defaultJsonString)
              JacksonUtils.toObject(jsonNode)
            }).toOption
          val fieldAvroType =
            toAvroType(f.dataType, schema, f.nullable, defaultValueOpt, f.name, childNameSpace)
          defaultValueOpt match {
            case Some(defaultObject) if !Objects.equals(defaultObject, JsonProperties.NULL_VALUE) =>
              fieldsAssembler.name(f.name).`type`(fieldAvroType).withDefault(defaultObject)
            case Some(_) =>
              fieldsAssembler.name(f.name).`type`(fieldAvroType).withDefault(null)
            case _ => fieldsAssembler.name(f.name).`type`(fieldAvroType).noDefault()
          }
        }
        fieldsAssembler.endRecord()

      // nullability is handled later in this method, thus pass nullable = false
      case _ => SchemaConverters.toAvroType(catalystType, nullable = false, recordName, nameSpace)
    }
    if (nullable) {
      defaultValue match {
        case Some(value) if !value.isInstanceOf[JsonProperties.Null] => Schema.createUnion(schema, nullSchema)
        case _ => Schema.createUnion(nullSchema, schema)
      }
    } else {
      schema
    }
  }

  // scalastyle:on
  private def getDecimalFixedType(d: DecimalType, avroSchema: Option[Schema], nameSpace: String, recordName: String) = {
    val avroType = LogicalTypes.decimal(d.precision, d.scale)
    val name = getFixedName(recordName, nameSpace)
    val minBytes = minBytesForPrecision(d.precision)
    val size = avroSchema.map { schema =>
      if (schema.getFixedSize > minBytes) schema.getFixedSize else minBytes
    }.getOrElse {
      minBytes
    }
    avroType.addToSchema(SchemaBuilder.fixed(name).size(size))
  }

  private def getFixedName(recordName: String, nameSpace: String) = {
    // Need to avoid naming conflict for the fixed fields
    nameSpace match {
      case "" => s"$recordName.fixed"
      case _ => s"$nameSpace.$recordName.fixed"
    }
  }
}
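
Going the other way, this converter consumes the metadata written by AdvancedAvroToSparkConverter; fields without that metadata fall back to Spark's default Avro mapping, with decimals rendered as Avro fixed types sized via minBytesForPrecision and nullable fields wrapped in a union with null. A short hypothetical sketch (the field names and namespace below are illustrative, not from this commit):

import org.apache.spark.sql.types._

// Hypothetical Spark schema with no Avro metadata on its fields.
val sparkSchema = StructType(Seq(
  StructField("id", LongType, nullable = false),
  StructField("amount", DecimalType(10, 2), nullable = true)
))

// With no AvroTypeKey/DefaultValueKey metadata present, the converter
// uses the fallback mapping described above.
val avroSchema = AdvancedSparkToAvroConverter(sparkSchema, nullable = false, "Example", "com.example")
println(avroSchema.toString(true))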