MapR [SPARK-619] Move absent commits from 2.4.3 branch to 2.4.4 (apache#574)

* Adding SQL API to write to kafka from Spark (apache#567)

* Branch 2.4.3 extended kafka and examples (apache#569)

* The v2 API is in its own package

- the v2 API is in a different package
- the old functionality is available in a separate package

* v2 API examples

- All the examples now use the newest API.
- I have removed the old examples since they are no longer relevant; the same functionality is shown in the new examples using the new API.

* MapR [SPARK-619] Move absent commits from 2.4.3 branch to 2.4.4
ekrivokonmapr authored Oct 7, 2019
1 parent ca0c9c4 commit 2827bed
Showing 11 changed files with 294 additions and 51 deletions.
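
For orientation, the new write path added by apache#567/apache#569 is exposed through the implicits in org.apache.spark.streaming.kafka.v2.producer. Below is a minimal sketch of how a job would use it, mirroring the updated example in this commit; the object name, topic and session settings are illustrative and not part of the commit.

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.streaming.kafka.v2.producer._

object V2ProducerSketch extends App {
  // The v2 producer API expects an implicit SparkSession in scope, as in the updated example.
  implicit val spark: SparkSession = SparkSession.builder().appName("V2ProducerSketch").getOrCreate()

  val topic = "/sample-stream:sample-topic" // placeholder topic

  // An RDD[String] can be written directly.
  val rdd: RDD[String] = spark.sparkContext.parallelize(Seq("a", "b", "c"))
  rdd.sendToKafka(topic)

  // A DataFrame with an explicit schema can be written the same way.
  val schema = new StructType()
    .add(StructField("value", StringType))
    .add(StructField("length", IntegerType))
  val df: DataFrame = spark.createDataFrame(rdd.map(s => Row(s, s.length)), schema)
  df.sendToKafka(topic)
}
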
2 changes: 1 addition & 1 deletion build/dev-build.sh
@@ -12,5 +12,5 @@ fi

if [ $? -ne 0 ]; then exit 1; fi

scp -r assembly/target/scala-2.11/jars mapr@node1:/opt/mapr/spark/spark-2.0.1/jars
scp -r assembly/target/scala-2.11/jars mapr@node1:/opt/mapr/spark/spark-2.4.4/jars
if [ $? -ne 0 ]; then exit 1; fi
@@ -17,78 +17,83 @@

package org.apache.spark.examples.streaming

import java.util.{ Map => JMap }
import java.util.{Map => JMap}

import org.apache.kafka.common.serialization.Serializer

import org.apache.spark.SparkConf

import scala.util.Random
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{ConstantInputDStream, DStream}
import org.apache.spark.streaming.kafka.v2.producer._
import org.apache.spark.sql.{DataFrame, SparkSession, Row}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType, LongType}

class ItemJsonSerializer extends Serializer[Item] {
override def configure(configs: JMap[String, _], isKey: Boolean): Unit = { /* NOP */ }

override def serialize(topic: String, data: Item): Array[Byte] = data.toString.getBytes

override def close(): Unit = { /* NOP */ }
}

case class Item(id: Int, value: Int) {
override def toString: String = s"""{"id":"$id","value":"$value"}"""
}

/**
* Produces messages to Kafka.
* Usage: KafkaProducerExample <kafkaBrokers> <topics> <numMessages>
* <kafkaBrokers> is a list of one or more kafka brokers
* <topics> is a list of one or more kafka topics
* <numMessages> is the number of messages that the kafka producer should send
*
* Example:
* `$ bin/run-example \
* org.apache.spark.examples.streaming.KafkaProducerExample broker1,broker2 \
* topic1,topic2 10`
*/

// scalastyle:off println
object KafkaProducerExample extends App {
import org.apache.spark.streaming.kafka.producer._
object KakfaProducerExample extends App {

if (args.length < 3) {
if (args.length < 2) {
System.err.println(s"""
|Usage: KafkaProducerExample <kafkaBrokers> <topics> <numMessages>
| <kafkaBrokers> is a list of one or more kafka brokers
|Usage: KafkaProducerExample <topics> <numMessages>
| <topics> is a list of one or more kafka topics
| <numMessages> is the number of messages that the kafka producer
| should send
""".stripMargin)
System.exit(1)
}

val Array(kafkaBrokers, topics, numMessages) = args

val batchTime = Seconds(2)
val Array(topics, numMessages) = args

val sparkConf = new SparkConf()
.set("spark.executor.memory", "1g")
.set("spark.driver.memory", "1g")
.setAppName(getClass.getCanonicalName)

implicit val sparkSession: SparkSession = SparkSession.builder().config(sparkConf).getOrCreate()

val items = (0 until numMessages.toInt).map(_.toString)

val stringRDD: RDD[String] = sparkSession.sparkContext.parallelize(items)

// if we have RDD[String] we can write to kafka using the new API V2

stringRDD.sendToKafka(topics)

val rnd = new Random()

// create RDD of Rows
val anotherRDD = stringRDD.map(s => Row(s, s.length, rnd.nextLong()))

val schema = new StructType()
.add(StructField("value", StringType))
.add(StructField("length", IntegerType))
.add(StructField("some_long", LongType))

// create a dataframe with some schema
val dataFrame: DataFrame = sparkSession.createDataFrame(anotherRDD, schema)

// any data frame can be easily written to Kafka
dataFrame.sendToKafka(topics)

val intRDD: RDD[(Int, Int)] = sparkSession.sparkContext.parallelize(0 until numMessages.toInt).map(n => (n, n.toString.length))

val transformer = (v: (Int, Int)) => Row(v._1, v._2)

// given an RDD[A], a function A => Row and a schema, we can write to kafka easily
intRDD.sendToKafka(topics, transformer, new StructType().add(StructField("value", IntegerType)).add(StructField("length", IntegerType)))

val batchTime = Seconds(2)
val ssc = new StreamingContext(sparkConf, batchTime)

val producerConf = new ProducerConf(bootstrapServers = kafkaBrokers.split(",").toList)
.withKeySerializer("org.apache.kafka.common.serialization.ByteArraySerializer")
.withValueSerializer("org.apache.kafka.common.serialization.StringSerializer")
val stringStream: DStream[String] = new ConstantInputDStream[String](ssc, stringRDD)

stringStream.sendToKafka(topics)

val items = (0 until numMessages.toInt).map(i => Item(i, i).toString)
val defaultRDD: RDD[String] = ssc.sparkContext.parallelize(items)
val dStream: DStream[String] = new ConstantInputDStream[String](ssc, defaultRDD)
val someStream = new ConstantInputDStream[(Int, Int)](ssc, intRDD)

dStream.foreachRDD(_.sendToKafka(topics, producerConf))
dStream.count().print()
someStream.sendToKafka(topics, transformer, new StructType().add(StructField("value", IntegerType)).add(StructField("length", IntegerType)))

ssc.start()
ssc.awaitTermination()

ssc.stop(stopSparkContext = true, stopGracefully = true)
}
}
5 changes: 5 additions & 0 deletions external/kafka-producer/pom.xml
@@ -64,6 +64,11 @@
      <type>test-jar</type>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_${scala.binary.version}</artifactId>
      <version>${project.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.kafka</groupId>
      <artifactId>kafka_${scala.binary.version}</artifactId>
@@ -22,6 +22,7 @@ import scala.language.implicitConversions
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

class RDDFunctions[T](rdd: RDD[T]) {
def sendToKafka(topic: String, conf: ProducerConf): Unit = {
@@ -43,4 +44,4 @@ class PairRDDFunctions[K, V](rdd: RDD[(K, V)]) {
}
})
}
}
}
@@ -18,14 +18,14 @@
package org.apache.spark.streaming.kafka

import scala.language.implicitConversions

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession, Row}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

package object producer {
implicit def toRDDFunctions[T](rdd: RDD[T]): RDDFunctions[T] =
new RDDFunctions[T](rdd)

implicit def toPairRDDFunctions[K, V](rdd: RDD[(K, V)]):
PairRDDFunctions[K, V] = new PairRDDFunctions[K, V](rdd)

PairRDDFunctions[K, V] = new PairRDDFunctions[K, V](rdd)
}
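
For context, these implicits are what give RDDs the sendToKafka syntax used by the pre-v2 examples. A short usage sketch, assuming the ProducerConf constructor shown in the old example; the broker address, topic and object name are placeholders.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.kafka.producer._

object ClassicProducerSketch extends App {
  val sc = new SparkContext(new SparkConf().setAppName("ClassicProducerSketch"))

  // ProducerConf mirrors the configuration built in the old KafkaProducerExample.
  val producerConf = new ProducerConf(bootstrapServers = List("broker1:9092"))
    .withKeySerializer("org.apache.kafka.common.serialization.ByteArraySerializer")
    .withValueSerializer("org.apache.kafka.common.serialization.StringSerializer")

  // The implicit toRDDFunctions conversion defined above adds sendToKafka to any RDD.
  sc.parallelize(Seq("message-1", "message-2")).sendToKafka("sample-topic", producerConf)

  sc.stop()
}
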
@@ -0,0 +1,5 @@
package org.apache.spark.streaming.kafka.producer.sql

import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage

private case class CommittedIds(partitionId: Int, ids: Set[String]) extends WriterCommitMessage
@@ -0,0 +1,38 @@
package org.apache.spark.streaming.kafka.producer.sql

import java.util.concurrent.Future

import org.apache.spark.streaming.kafka.producer.sql.CommittedIds
import org.apache.spark.internal.Logging
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.v2.writer.{DataSourceWriter, DataWriter, DataWriterFactory, WriterCommitMessage}
import org.apache.spark.sql.types.{DataType, StringType, StructType}
import org.apache.spark.streaming.kafka.producer.ProducerConf
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}


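// Driver-side half of the DataSource V2 write path: Spark creates one KafkaDataSourceWriter per
// write job, ships the factory returned by createWriterFactory() to the executors, and passes the
// CommittedIds messages returned by each partition's DataWriter back into commit().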
private class KafkaDataSourceWriter(topic: String, schema: StructType) extends DataSourceWriter with Logging {

  private var globallyCommittedIds = List.empty[String]

  override def createWriterFactory(): DataWriterFactory[InternalRow] = new KafkaDataWriterFactory(topic, schema)

  override def commit(messages: Array[WriterCommitMessage]): Unit = {

    val ids = messages.foldLeft(Set.empty[String]) { case (acc, CommittedIds(partitionId, partitionIds)) =>
      log.info(s"PARTITION $partitionId HAS BEEN CONFIRMED BY DRIVER")

      acc ++ partitionIds
    }

    // Let's make sure this is thread-safe
    globallyCommittedIds = this.synchronized {
      globallyCommittedIds ++ ids
    }
  }

  override def abort(messages: Array[WriterCommitMessage]): Unit = {
    log.info("JOB BEING ABORTED")
  }
}
@@ -0,0 +1,74 @@
package org.apache.spark.streaming.kafka.producer.sql

import java.util.concurrent.Future

import org.apache.spark.streaming.kafka.producer.sql.CommittedIds
import org.apache.spark.internal.Logging
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.v2.writer.{DataSourceWriter, DataWriter, DataWriterFactory, WriterCommitMessage}
import org.apache.spark.sql.types.{DataType, StringType, StructType}
import org.apache.spark.streaming.kafka.producer.ProducerConf
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}

import scala.util.parsing.json.{JSONArray, JSONObject}

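// Executor-side half: this factory is serialized to the executors, where the KafkaProducer is
// created lazily and one DataWriter is built per partition/task. Each InternalRow is converted to
// a JSON document and sent to the target topic; bootstrapServers is left empty here, presumably
// relying on cluster-side defaults for the broker endpoint.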
private class KafkaDataWriterFactory(topic: String, schema: StructType) extends DataWriterFactory[InternalRow] {

  @transient private lazy val producerConf = new ProducerConf(
    bootstrapServers = "".split(",").toList)

  @transient private lazy val producer = new KafkaProducer[String, String](producerConf.asJMap())

  override def createDataWriter(partitionId: Int, taskId: Long, epochId: Long): DataWriter[InternalRow] = new DataWriter[InternalRow] with Logging {

    private val writtenIds = scala.collection.mutable.ListBuffer.empty[Future[RecordMetadata]]

    log.info(s"PROCESSING PARTITION ID: $partitionId ; TASK ID: $taskId")

    override def write(record: InternalRow): Unit = {
      val data = record.toSeq(schema).toList

      val map = schema.fields.zipWithIndex
        .map { case (field, idx) => (field.name, data(idx)) }
        .toMap

      val json = toJson(map)

      val task = producer.send(new ProducerRecord(topic, json.toString))

      writtenIds.append(task)
    }

    override def commit(): WriterCommitMessage = {
      val meta = writtenIds.map(_.get())

      writtenIds.clear()
      CommittedIds(partitionId, meta.map(_.offset().toString).toSet)
    }

    override def abort(): Unit = writtenIds.map(_.cancel(true))

    private def toJson(arr: List[Any]): JSONArray = {
      JSONArray(arr.map {
        case (innerMap: Map[String, Any]) => toJson(innerMap)
        case (innerArray: List[Any]) => toJson(innerArray)
        case (other) => other
      })
    }

    private def toJson(map: Map[String, Any]): JSONObject = {
      JSONObject(map.map {
        case (key, innerMap: Map[String, Any]) =>
          (key, toJson(innerMap))
        case (key, innerArray: List[Any]) =>
          (key, toJson(innerArray))
        case (key, other) =>
          (key, other)
      })
    }
  }
}
@@ -0,0 +1,18 @@
package org.apache.spark.streaming.kafka.producer.sql

import java.util.Optional

import org.apache.spark.internal.Logging
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.sources.v2.writer.DataSourceWriter
import org.apache.spark.sql.sources.v2.{DataSourceOptions, WriteSupport}
import org.apache.spark.sql.types.StructType

class KafkaWriter extends WriteSupport with Logging {
  override def createWriter(writeUUID: String, schema: StructType, mode: SaveMode, options: DataSourceOptions): Optional[DataSourceWriter] = {

    val stream = options.get("path").get()

    java.util.Optional.of(new KafkaDataSourceWriter(stream, schema))
  }
}
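
Not shown in this commit, but since createWriter() takes the target topic from the "path" option, a plain DataFrameWriter call pointed at this class should reach the new sink. A hedged sketch: resolving the source by its fully qualified class name via format() and the topic name are assumptions, not something the commit demonstrates.

import org.apache.spark.sql.SparkSession

object KafkaWriterUsageSketch extends App {
  val spark = SparkSession.builder().appName("KafkaWriterUsageSketch").getOrCreate()
  import spark.implicits._

  val df = Seq("a", "b", "c").toDF("value")

  // DataFrameWriter.save(path) populates the "path" option read by createWriter() above.
  df.write
    .format("org.apache.spark.streaming.kafka.producer.sql.KafkaWriter")
    .save("/sample-stream:sample-topic") // placeholder topic
}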