forked from mapr/spark
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
MapR [SPARK-619] Move absent commits from 2.4.3 branch to 2.4.4 (apache#574)

* Adding SQL API to write to Kafka from Spark (apache#567)
* Branch 2.4.3 extended kafka and examples (apache#569)
* The v2 API is in its own package
  - the v2 API is in a different package
  - the old functionality is available in a separate package
* v2 API examples
  - All the examples use the newest API.
  - The old examples have been removed since they are no longer relevant; the same functionality is shown in the new examples using the new API.
* MapR [SPARK-619] Move absent commits from 2.4.3 branch to 2.4.4
1 parent ca0c9c4 · commit 2827bed
Showing 11 changed files with 294 additions and 51 deletions.
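The commit message above describes a new SQL (DataFrame) API for writing to Kafka, implemented through Spark's DataSource V2 write path in the KafkaWriter / KafkaDataSourceWriter classes shown in the diff below. A minimal usage sketch follows; the fully-qualified format name and the use of save("<topic>") to populate the "path" option (which KafkaWriter.createWriter reads as the topic) are assumptions inferred from the code in this diff, not documented usage.

```scala
// Hypothetical usage sketch of the v2 Kafka write path added in this commit.
// Assumptions: the data source is addressed by the fully-qualified class name of
// KafkaWriter, and save("<topic>") fills the "path" option that createWriter reads.
import org.apache.spark.sql.SparkSession

object KafkaSqlWriteExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("kafka-sql-write-example")
      .getOrCreate()
    import spark.implicits._

    // A small DataFrame; each row is serialized to JSON by the writer.
    val df = Seq(("alice", 30), ("bob", 25)).toDF("name", "age")

    df.write
      .format("org.apache.spark.streaming.kafka.producer.sql.KafkaWriter")
      .save("my-topic") // ends up as the "path" option, used as the Kafka topic

    spark.stop()
  }
}
```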
5 changes: 5 additions & 0 deletions
...-producer/src/main/scala/org/apache/spark/streaming/kafka/producer/sql/CommittedIds.scala
package org.apache.spark.streaming.kafka.producer.sql

import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage

private case class CommittedIds(partitionId: Int, ids: Set[String]) extends WriterCommitMessage
38 changes: 38 additions & 0 deletions
.../src/main/scala/org/apache/spark/streaming/kafka/producer/sql/KafkaDataSourceWriter.scala
package org.apache.spark.streaming.kafka.producer.sql

import java.util.concurrent.Future

import org.apache.spark.streaming.kafka.producer.sql.CommittedIds
import org.apache.spark.internal.Logging
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.v2.writer.{DataSourceWriter, DataWriter, DataWriterFactory, WriterCommitMessage}
import org.apache.spark.sql.types.{DataType, StringType, StructType}
import org.apache.spark.streaming.kafka.producer.ProducerConf
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}

private class KafkaDataSourceWriter(topic: String, schema: StructType) extends DataSourceWriter with Logging {

  private var globallyCommittedIds = List.empty[String]

  override def createWriterFactory(): DataWriterFactory[InternalRow] = new KafkaDataWriterFactory(topic, schema)

  override def commit(messages: Array[WriterCommitMessage]): Unit = {

    val ids = messages.foldLeft(Set.empty[String]) { case (acc, CommittedIds(partitionId, partitionIds)) =>
      log.info(s"PARTITION $partitionId HAS BEEN CONFIRMED BY DRIVER")

      acc ++ partitionIds
    }

    // Let's make sure this is thread-safe
    globallyCommittedIds = this.synchronized {
      globallyCommittedIds ++ ids
    }
  }

  override def abort(messages: Array[WriterCommitMessage]): Unit = {
    log.info("JOB BEING ABORTED")
  }
}
74 changes: 74 additions & 0 deletions
...src/main/scala/org/apache/spark/streaming/kafka/producer/sql/KafkaDataWriterFactory.scala
package org.apache.spark.streaming.kafka.producer.sql

import java.util.concurrent.Future

import org.apache.spark.streaming.kafka.producer.sql.CommittedIds
import org.apache.spark.internal.Logging
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.v2.writer.{DataSourceWriter, DataWriter, DataWriterFactory, WriterCommitMessage}
import org.apache.spark.sql.types.{DataType, StringType, StructType}
import org.apache.spark.streaming.kafka.producer.ProducerConf
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}

import scala.util.parsing.json.{JSONArray, JSONObject}

private class KafkaDataWriterFactory(topic: String, schema: StructType) extends DataWriterFactory[InternalRow] {

  @transient private lazy val producerConf = new ProducerConf(
    bootstrapServers = "".split(",").toList)

  @transient private lazy val producer = new KafkaProducer[String, String](producerConf.asJMap())

  override def createDataWriter(partitionId: Int, taskId: Long, epochId: Long): DataWriter[InternalRow] = new DataWriter[InternalRow] with Logging {

    private val writtenIds = scala.collection.mutable.ListBuffer.empty[Future[RecordMetadata]]

    log.info(s"PROCESSING PARTITION ID: $partitionId ; TASK ID: $taskId")

    override def write(record: InternalRow): Unit = {
      val data = record.toSeq(schema).toList

      val map = schema.fields.zipWithIndex
        .map { case (field, idx) => (field.name, data(idx)) }
        .toMap

      val json = toJson(map)

      val task = producer.send(new ProducerRecord(topic, json.toString))

      writtenIds.append(task)
    }

    override def commit(): WriterCommitMessage = {
      val meta = writtenIds.map(_.get())

      writtenIds.clear()
      CommittedIds(partitionId, meta.map(_.offset().toString).toSet)
    }

    override def abort(): Unit = writtenIds.map(_.cancel(true))

    private def toJson(arr: List[Any]): JSONArray = {
      JSONArray(arr.map {
        case (innerMap: Map[String, Any]) => toJson(innerMap)
        case (innerArray: List[Any]) => toJson(innerArray)
        case (other) => other
      })
    }

    private def toJson(map: Map[String, Any]): JSONObject = {
      JSONObject(map.map {
        case (key, innerMap: Map[String, Any]) =>
          (key, toJson(innerMap))
        case (key, innerArray: List[Any]) =>
          (key, toJson(innerArray))
        case (key, other) =>
          (key, other)
      })
    }
  }
}
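As a small aside (not part of the commit), the toJson helpers above rely on scala.util.parsing.json; the sketch below shows roughly what one row becomes before being sent to Kafka, assuming a simple two-column schema. The column names and values used here are purely illustrative.

```scala
// Illustration only (hypothetical, not in the commit): the writer builds a
// column-name -> value map per row and serializes it with scala.util.parsing.json.
import scala.util.parsing.json.JSONObject

object ToJsonIllustration {
  def main(args: Array[String]): Unit = {
    val row = Map("name" -> "alice", "age" -> 30)
    val json = JSONObject(row)
    // Prints something like: {"name" : "alice", "age" : 30}
    println(json.toString())
  }
}
```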
18 changes: 18 additions & 0 deletions
...a-producer/src/main/scala/org/apache/spark/streaming/kafka/producer/sql/KafkaWriter.scala
package org.apache.spark.streaming.kafka.producer.sql

import java.util.Optional

import org.apache.spark.internal.Logging
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.sources.v2.writer.DataSourceWriter
import org.apache.spark.sql.sources.v2.{DataSourceOptions, WriteSupport}
import org.apache.spark.sql.types.StructType

class KafkaWriter extends WriteSupport with Logging {
  override def createWriter(writeUUID: String, schema: StructType, mode: SaveMode, options: DataSourceOptions): Optional[DataSourceWriter] = {

    val stream = options.get("path").get()

    java.util.Optional.of(new KafkaDataSourceWriter(stream, schema))
  }
}