diff --git a/README.md b/README.md index 6489036..36dc54a 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,22 @@ To build this plugin: The build will create a .jar and .json file under the ``target`` directory. These files can be used to deploy your plugins. +Run tests +----- +To run all tests: + +``` + mvn clean test +``` + +System properties required for tests: +**test.kafka_server** - Kafka broker instance address. +**test.cluster_api_key** - Confluent API key. +**test.cluster_api_secret** - Confluent API secret. +**test.schema_registry_url** - Schema Registry URL. +**test.schema_registry_api_key** - Schema Registry API key. +**test.schema_registry_api_secret** - Schema Registry API secret. + Deployment ---------- You can deploy your plugins using the CDAP CLI: diff --git a/confluent-kafka-plugins/docs/Confluent-sparksink.md b/confluent-kafka-plugins/docs/Confluent-sparksink.md new file mode 100644 index 0000000..11da919 --- /dev/null +++ b/confluent-kafka-plugins/docs/Confluent-sparksink.md @@ -0,0 +1,77 @@ +# Confluent Streaming Sink + + +Description +----------- +This sink writes data to Confluent. +Sends a message to the specified Kafka topic for each received record. It can also be +configured to partition events being written to Kafka based on a configurable key. +The sink can also be configured to operate in sync or async mode and apply different +compression types to events. +Can be used with self-managed Confluent Platform or Confluent Cloud. Supports Schema Registry. + + +Properties +---------- +**Reference Name:** This will be used to uniquely identify this sink for lineage, annotating metadata, etc. + +**Kafka Brokers:** List of Kafka brokers specified in host1:port1,host2:port2 form. (Macro-enabled) + +**Kafka Topic:** The Kafka topic to write to. (Macro-enabled) + +**Async:** Specifies whether messages are sent asynchronously, without waiting for the broker to acknowledge that each message was received. Default is No. + +**Compression Type:** Compression type to be applied on message. 
+ +**Time Field:** Optional name of the field containing the read time of the message. +If this is not set, the message will be sent with the current timestamp. +If set, this field must be present in the input schema and must be a long. + +**Key Field:** Optional name of the field containing the message key. +If this is not set, the message will be sent without a key. +If set, this field must be present in the input schema and must be of type bytes. + +**Partition Field:** Optional name of the field containing the partition the message should be written to. +If this is not set, the default partition will be used for all messages. +If set, this field must be present in the input schema and must be an int. + +**Message Format:** Optional format a structured record should be converted to. + Required if used without Schema Registry. + +**Additional Kafka Producer Properties:** Additional Kafka producer properties to set. + +**Cluster API Key:** The Confluent API Key used for the sink. + +**Cluster API Secret:** The Confluent API Secret used for the sink. + +**Schema Registry URL:** The Schema Registry endpoint URL. + +**Schema Registry API Key:** The Schema Registry API Key. + +**Schema Registry API Secret:** The Schema Registry API Secret. + +Example +------- +This example writes structured records to the Kafka topic 'alarm' asynchronously, +using compression type 'gzip'. The events will be written in CSV format +to the configured Kafka brokers. The Kafka partition will be decided based on the provided key field 'message'. +Additional properties like number of acknowledgements and client id can also be provided. 
+ +```json +{ + "name": "Confluent", + "type": "sparksink", + "properties": { + "referenceName": "Kafka", + "brokers": "host1.example.com:9092,host2.example.com:9092", + "topic": "alarm", + "async": "true", + "compressionType": "gzip", + "format": "CSV", + "kafkaProperties": "acks:1,client.id:myclient", + "keyField": "message", + "clusterApiKey": "", + "clusterApiSecret": "" + } +} +``` diff --git a/confluent-kafka-plugins/docs/Confluent-streamingsource.md b/confluent-kafka-plugins/docs/Confluent-streamingsource.md new file mode 100644 index 0000000..b3253e1 --- /dev/null +++ b/confluent-kafka-plugins/docs/Confluent-streamingsource.md @@ -0,0 +1,132 @@ +# Confluent Streaming Source + + +Description +----------- +This source reads data from Confluent. +Emits a record per message from the specified Kafka topic. +Can be used with self-managed Confluent Platform or Confluent Cloud. Supports Schema Registry. + +Can be configured to parse values from the source in the following ways: +1. User-defined format. Use the **Message Format** field to choose any format supported by CDAP. +1. Schema Registry. Requires credentials for Schema Registry to be specified. +Uses Avro schemas to deserialize Kafka messages. Use the **Get Schema** button to fetch key and value schemas from the registry. +1. Binary format. Used when no message format or Schema Registry credentials are provided. + + +Properties +---------- +**Reference Name:** This will be used to uniquely identify this source for lineage, annotating metadata, etc. + +**Kafka Brokers:** List of Kafka brokers specified in host1:port1,host2:port2 form. (Macro-enabled) + +**Kafka Topic:** The Kafka topic to read from. (Macro-enabled) + +**Topic Partitions:** List of topic partitions to read from. If not specified, all partitions will be read. (Macro-enabled) + +**Default Initial Offset:** The default initial offset for all topic partitions. +An offset of -2 means the smallest offset. An offset of -1 means the latest offset. Defaults to -1. 
+Offsets are inclusive. If an offset of 5 is used, the message at offset 5 will be read. +If you wish to set different initial offsets for different partitions, use the initialPartitionOffsets property. (Macro-enabled) + +**Initial Partition Offsets:** The initial offset for each topic partition. If this is not specified, +all partitions will use the same initial offset, which is determined by the defaultInitialOffset property. +Any partitions specified in the partitions property, but not in this property will use the defaultInitialOffset. +An offset of -2 means the smallest offset. An offset of -1 means the latest offset. +Offsets are inclusive. If an offset of 5 is used, the message at offset 5 will be read. (Macro-enabled) + +**Time Field:** Optional name of the field containing the read time of the batch. +If this is not set, no time field will be added to output records. +If set, this field must be present in the schema property and must be a long. + +**Key Field:** Optional name of the field containing the message key. +If this is not set, no key field will be added to output records. +If set, this field must be present in the schema property and must be bytes. + +**Partition Field:** Optional name of the field containing the partition the message was read from. +If this is not set, no partition field will be added to output records. +If set, this field must be present in the schema property and must be an int. + +**Offset Field:** Optional name of the field containing the partition offset the message was read from. +If this is not set, no offset field will be added to output records. +If set, this field must be present in the schema property and must be a long. + +**Message Format:** Optional format of the Kafka event message. Any format supported by CDAP is supported. +For example, a value of 'csv' will attempt to parse Kafka payloads as comma-separated values. +If no format is given, Kafka message payloads will be treated as bytes. 
+ +**Max Rate Per Partition:** Maximum number of records to read per second per partition. Defaults to 1000. + +**Additional Kafka Consumer Properties:** Additional Kafka consumer properties to set. + +**Cluster API Key:** The Confluent API Key used for the source. + +**Cluster API Secret:** The Confluent API Secret used for the source. + +**Schema Registry URL:** The Schema Registry endpoint URL. + +**Schema Registry API Key:** The Schema Registry API Key. + +**Schema Registry API Secret:** The Schema Registry API Secret. + +**Value Field:** The name of the field containing the message value. Required to fetch schema from Schema Registry. + +**Schema:** Output schema of the source. If you would like the output records to contain a field with the +Kafka message key, the schema must include a field of type bytes/nullable bytes or string/nullable string, and you must +set the **Key Field** property to that field's name. Similarly, if you would like the output records to contain a field +with the timestamp of when the record was read, the schema must include a field of type long or nullable long, and you +must set the **Time Field** property to that field's name. Any field that is not the **Time Field** or **Key Field** +will be used in conjunction with the format to parse Kafka message payloads. If used with Schema Registry then should +be fetched using **Get Schema** button. + +Example +------- +***Example 1:*** Read from the 'purchases' topic of a Kafka instance running +on brokers host1.example.com:9092 and host2.example.com:9092. The source will add +a time field named 'readTime' that contains a timestamp corresponding to the micro +batch when the record was read. It will also contain a field named 'key' which will have +the message key in it. It parses the Kafka messages using the 'csv' format +with 'user', 'item', 'count', and 'price' as the message schema. 
+ +```json +{ + "name": "Confluent", + "type": "streamingsource", + "properties": { + "topics": "purchases", + "brokers": "host1.example.com:9092,host2.example.com:9092", + "format": "csv", + "timeField": "readTime", + "keyField": "key", + "clusterApiKey": "", + "clusterApiSecret": "", + "defaultInitialOffset": "-2", + "schema": "{ + \"type\":\"record\", + \"name\":\"purchase\", + \"fields\":[ + {\"name\":\"readTime\",\"type\":\"long\"}, + {\"name\":\"key\",\"type\":\"bytes\"}, + {\"name\":\"user\",\"type\":\"string\"}, + {\"name\":\"item\",\"type\":\"string\"}, + {\"name\":\"count\",\"type\":\"int\"}, + {\"name\":\"price\",\"type\":\"double\"} + ] + }" + } +} +``` + +For each Kafka message read, it will output a record with the schema: + +| field name | type | +| ----------- | ---------------- | +| readTime | long | +| key | bytes | +| user | string | +| item | string | +| count | int | +| price | double | + +Note that the readTime field is not derived from the Kafka message, but from the time that the +message was read. 
diff --git a/confluent-kafka-plugins/icons/Confluent-sparksink.png b/confluent-kafka-plugins/icons/Confluent-sparksink.png new file mode 100644 index 0000000..bc1949b Binary files /dev/null and b/confluent-kafka-plugins/icons/Confluent-sparksink.png differ diff --git a/confluent-kafka-plugins/icons/Confluent-streamingsource.png b/confluent-kafka-plugins/icons/Confluent-streamingsource.png new file mode 100644 index 0000000..bc1949b Binary files /dev/null and b/confluent-kafka-plugins/icons/Confluent-streamingsource.png differ diff --git a/confluent-kafka-plugins/pom.xml b/confluent-kafka-plugins/pom.xml new file mode 100644 index 0000000..e3cdc41 --- /dev/null +++ b/confluent-kafka-plugins/pom.xml @@ -0,0 +1,252 @@ + + + + kafka-plugins + io.cdap.plugin + 2.3.0-SNAPSHOT + + 4.0.0 + + Confluent Kafka plugins + confluent-kafka-plugins + + + 3.12.0 + 3.1.6 + 5.3.1 + + 2.3.0 + + + + + confluent + http://packages.confluent.io/maven/ + + + + + + io.cdap.plugin + kafka-plugins-common + ${project.parent.version} + + + io.cdap.plugin + format-avro + ${cdap.plugin.version} + + + org.apache.avro + avro + ${avro.version} + + + io.confluent + kafka-avro-serializer + ${confluent.version} + + + io.confluent + kafka-schema-registry-client + ${confluent.version} + + + org.apache.kafka + kafka_2.11 + ${kafka10.version} + + + org.slf4j + slf4j-log4j12 + + + + + org.apache.spark + spark-streaming-kafka-0-10_2.11 + ${spark2.version} + + + org.apache.kafka + kafka_2.11 + + + org.apache.spark + spark-tags_2.11 + + + + + org.apache.spark + spark-mllib_2.11 + ${spark2.version} + provided + + + org.apache.xbean + xbean-asm5-shaded + + + + + org.apache.spark + spark-streaming_2.11 + ${spark2.version} + provided + + + org.apache.spark + spark-core_2.11 + ${spark2.version} + provided + + + org.slf4j + slf4j-log4j12 + + + log4j + log4j + + + org.apache.hadoop + hadoop-client + + + com.esotericsoftware.reflectasm + reflectasm + + + org.apache.curator + curator-recipes + + + org.tachyonproject + 
tachyon-client + + + org.scala-lang + scala-compiler + + + org.eclipse.jetty.orbit + javax.servlet + + + + net.java.dev.jets3t + jets3t + + + org.apache.xbean + xbean-asm5-shaded + + + + + io.cdap.cdap + cdap-spark-core2_2.11 + ${cdap.version} + test + + + io.cdap.cdap + cdap-data-pipeline2_2.11 + ${cdap.version} + test + + + io.cdap.cdap + cdap-data-streams2_2.11 + ${cdap.version} + test + + + io.cdap.plugin + kafka-plugins-common + ${project.parent.version} + test-jar + test + + + org.assertj + assertj-core + ${assertj.version} + test + + + org.awaitility + awaitility + ${awaitility.version} + + + io.netty + netty-all + ${netty.version} + + + io.cdap.http + netty-http + ${netty-http.version} + + + + + + + org.apache.felix + maven-bundle-plugin + 3.3.0 + + + <_exportcontents> + io.cdap.plugin.confluent.*; + org.apache.spark.streaming.kafka010.*; + org.apache.kafka.common.*; + org.apache.kafka.common.serialization.*; + io.confluent.kafka.serializers.*; + + *;inline=false;scope=compile + true + lib + + + + + package + + bundle + + + + + + io.cdap + cdap-maven-plugin + 1.1.0 + + + system:cdap-data-pipeline[6.1.0-SNAPSHOT,7.0.0-SNAPSHOT) + system:cdap-data-streams[6.1.0-SNAPSHOT,7.0.0-SNAPSHOT) + + + + + create-artifact-config + prepare-package + + create-plugin-json + + + + + + + + diff --git a/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/common/ConfigValidations.java b/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/common/ConfigValidations.java new file mode 100644 index 0000000..f8a4a9f --- /dev/null +++ b/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/common/ConfigValidations.java @@ -0,0 +1,58 @@ +/* + * Copyright © 2019 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. 
You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package io.cdap.plugin.confluent.common;
+
+import io.cdap.cdap.api.dataset.lib.KeyValue;
+import io.cdap.cdap.etl.api.FailureCollector;
+import io.cdap.plugin.common.KeyValueListParser;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Utility class for config validation
+ */
+public class ConfigValidations {
+  private ConfigValidations() {
+    // Static helpers only; any attempt to instantiate fails immediately.
+    throw new AssertionError("Should not be initialized");
+  }
+
+  /**
+   * Validates a list of Kafka brokers given in host:port form and reports problems to the collector.
+   *
+   * One failure is added for every entry whose port is not an integer. A property-level failure is
+   * added when the value is null or empty, when the parser rejects the list, or when no valid
+   * host:port pair could be collected.
+   *
+   * @param brokers broker list to validate
+   * @param propertyName name of the config property the failures are attached to
+   * @param collector collector that receives the validation failures
+   */
+  public static void validateBrokers(String brokers, String propertyName, FailureCollector collector) {
+    Map<String, Integer> brokerMap = new HashMap<>();
+    boolean malformed = false;
+    // Null/empty input never reaches the parser; it is reported by the generic failure below.
+    if (brokers != null && !brokers.isEmpty()) {
+      try {
+        Iterable<KeyValue<String, String>> parsed = KeyValueListParser.DEFAULT.parse(brokers);
+        for (KeyValue<String, String> hostAndPort : parsed) {
+          String host = hostAndPort.getKey();
+          String portStr = hostAndPort.getValue();
+          try {
+            brokerMap.put(host, Integer.parseInt(portStr));
+          } catch (NumberFormatException e) {
+            collector.addFailure(String.format("Invalid port '%s' for host '%s'.", portStr, host),
+                                 "It should be a valid port number.")
+              .withConfigElement(propertyName, host + ":" + portStr);
+          }
+        }
+      } catch (IllegalArgumentException e) {
+        // The parser rejected the list (presumably an entry that is not in host:port form -- confirm
+        // against KeyValueListParser). Remember it so that a list with a valid prefix followed by a
+        // broken entry is still reported instead of silently passing validation.
+        malformed = true;
+      }
+    }
+
+    if (malformed || brokerMap.isEmpty()) {
+      collector.addFailure("Kafka brokers must be provided in host:port format.", null)
+        .withConfigProperty(propertyName);
+    }
+  }
+}
diff --git a/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/common/KafkaHelpers.java b/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/common/KafkaHelpers.java
new file mode 100644
index 0000000..ed7b36d
--- /dev/null
+++ 
b/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/common/KafkaHelpers.java
@@ -0,0 +1,76 @@
+/*
+ * Copyright © 2019 Cask Data, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package io.cdap.plugin.confluent.common;
+
+import org.apache.kafka.clients.consumer.Consumer;
+import org.apache.kafka.common.TopicPartition;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Utility class for Kafka operations
+ */
+public final class KafkaHelpers {
+
+  // This class cannot be instantiated
+  private KafkaHelpers() {
+  }
+
+  /**
+   * Fetch the latest offsets for the given topic-partitions.
+   * As a side effect, the consumer is assigned to exactly these partitions and positioned at their end.
+   *
+   * @param consumer The Kafka consumer
+   * @param topicAndPartitions topic-partitions to fetch the offsets for
+   * @return Mapping of topic-partition to its latest offset
+   */
+  public static <K, V> Map<TopicPartition, Long> getLatestOffsets(Consumer<K, V> consumer,
+                                                                  List<TopicPartition> topicAndPartitions) {
+    consumer.assign(topicAndPartitions);
+    consumer.seekToEnd(topicAndPartitions);
+    return currentPositions(consumer, topicAndPartitions);
+  }
+
+  /**
+   * Fetch the earliest offsets for the given topic-partitions.
+   * As a side effect, the consumer is assigned to exactly these partitions and positioned at their start.
+   *
+   * @param consumer The Kafka consumer
+   * @param topicAndPartitions topic-partitions to fetch the offsets for
+   * @return Mapping of topic-partition to its earliest offset
+   */
+  public static <K, V> Map<TopicPartition, Long> getEarliestOffsets(Consumer<K, V> consumer,
+                                                                    List<TopicPartition> topicAndPartitions) {
+    consumer.assign(topicAndPartitions);
+    consumer.seekToBeginning(topicAndPartitions);
+    return currentPositions(consumer, topicAndPartitions);
+  }
+
+  /**
+   * Reads the consumer's current position for each of the given topic-partitions.
+   */
+  private static <K, V> Map<TopicPartition, Long> currentPositions(Consumer<K, V> consumer,
+                                                                   List<TopicPartition> topicAndPartitions) {
+    Map<TopicPartition, Long> offsets = new HashMap<>();
+    for (TopicPartition topicAndPartition : topicAndPartitions) {
+      offsets.put(topicAndPartition, consumer.position(topicAndPartition));
+    }
+    return offsets;
+  }
+}
diff --git a/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/streaming/sink/ConfluentStreamingSink.java b/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/streaming/sink/ConfluentStreamingSink.java
new file mode 100644
index 0000000..f79b2c5
--- /dev/null
+++ b/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/streaming/sink/ConfluentStreamingSink.java
@@ -0,0 +1,89 @@
+/*
+ * Copyright © 2019 Cask Data, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License. 
+ */
+
+package io.cdap.plugin.confluent.streaming.sink;
+
+import io.cdap.cdap.api.annotation.Description;
+import io.cdap.cdap.api.annotation.Name;
+import io.cdap.cdap.api.annotation.Plugin;
+import io.cdap.cdap.api.data.format.StructuredRecord;
+import io.cdap.cdap.api.data.schema.Schema;
+import io.cdap.cdap.etl.api.FailureCollector;
+import io.cdap.cdap.etl.api.PipelineConfigurer;
+import io.cdap.cdap.etl.api.StageConfigurer;
+import io.cdap.cdap.etl.api.batch.SparkExecutionPluginContext;
+import io.cdap.cdap.etl.api.batch.SparkPluginContext;
+import io.cdap.cdap.etl.api.batch.SparkSink;
+import org.apache.kafka.clients.producer.KafkaProducer;
+import org.apache.kafka.clients.producer.Producer;
+import org.apache.kafka.clients.producer.ProducerRecord;
+import org.apache.spark.api.java.JavaRDD;
+
+import java.util.Map;
+import java.util.Objects;
+
+/**
+ * Confluent Kafka Streaming sink.
+ *
+ * The configuration is validated in {@code configurePipeline} and again in {@code prepareRun};
+ * {@code run} then transforms every incoming {@link StructuredRecord} into a producer record and
+ * sends it through a Kafka producer created per RDD partition.
+ *
+ * NOTE(review): generic type arguments (on SparkSink, JavaRDD, Map, Producer, ProducerRecord) appear
+ * to have been stripped from this listing -- confirm against the original file.
+ */
+@Plugin(type = SparkSink.PLUGIN_TYPE)
+@Name(ConfluentStreamingSink.PLUGIN_NAME)
+@Description("Confluent Kafka streaming sink.")
+public class ConfluentStreamingSink extends SparkSink {
+  public static final String PLUGIN_NAME = "Confluent";
+
+  private final ConfluentStreamingSinkConfig conf;
+
+  public ConfluentStreamingSink(ConfluentStreamingSinkConfig conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * Validates the config against the stage's input schema and throws if any failure was collected.
+   */
+  @Override
+  public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
+    StageConfigurer stageConfigurer = pipelineConfigurer.getStageConfigurer();
+    Schema inputSchema = stageConfigurer.getInputSchema();
+    FailureCollector failureCollector = stageConfigurer.getFailureCollector();
+    conf.validate(inputSchema, failureCollector);
+    failureCollector.getOrThrowException();
+  }
+
+  /**
+   * Repeats the same validation using the input schema and failure collector of the run context.
+   */
+  @Override
+  public void prepareRun(SparkPluginContext sparkPluginContext) {
+    Schema inputSchema = sparkPluginContext.getInputSchema();
+    FailureCollector failureCollector = sparkPluginContext.getFailureCollector();
+    conf.validate(inputSchema, failureCollector);
+    failureCollector.getOrThrowException();
+  }
+
+  /**
+   * Sends every record of the RDD to Kafka.
+   *
+   * @param context execution context; supplies the pipeline name and the (required) input schema
+   * @param javaRDD records to publish
+   */
+  @Override
+  public void run(SparkExecutionPluginContext context, JavaRDD javaRDD) {
+    // Producer settings are derived from the config and the pipeline name.
+    Map producerParams = ConfluentStreamingSinkUtil.getProducerParams(conf, context.getPipelineName());
+    // Fails fast with a NullPointerException when the context has no input schema.
+    Schema inputSchema = Objects.requireNonNull(context.getInputSchema());
+    Schema outputSchema = conf.getMessageSchema(inputSchema);
+    StructuredToProducerRecordTransformer transformer = new StructuredToProducerRecordTransformer(conf, outputSchema);
+    javaRDD.foreachPartition(structuredRecordIterator -> {
+      // One producer per partition; try-with-resources closes it once the partition is drained.
+      try (Producer producer = new KafkaProducer<>(producerParams)) {
+        while (structuredRecordIterator.hasNext()) {
+          StructuredRecord input = structuredRecordIterator.next();
+          ProducerRecord record = transformer.transform(input);
+          producer.send(record);
+          // When async is disabled, flush after each send so the record is pushed out
+          // before the next one is processed.
+          if (!conf.getAsync()) {
+            producer.flush();
+          }
+        }
+      }
+    });
+  }
+}
diff --git a/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/streaming/sink/ConfluentStreamingSinkConfig.java b/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/streaming/sink/ConfluentStreamingSinkConfig.java
new file mode 100644
index 0000000..ce0e4b4
--- /dev/null
+++ b/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/streaming/sink/ConfluentStreamingSinkConfig.java
@@ -0,0 +1,397 @@
+/*
+ * Copyright © 2019 Cask Data, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License. 
+ */ + +package io.cdap.plugin.confluent.streaming.sink; + +import com.google.common.base.Strings; +import com.google.common.collect.ImmutableSet; +import io.cdap.cdap.api.annotation.Description; +import io.cdap.cdap.api.annotation.Macro; +import io.cdap.cdap.api.annotation.Name; +import io.cdap.cdap.api.data.schema.Schema; +import io.cdap.cdap.api.dataset.lib.KeyValue; +import io.cdap.cdap.etl.api.FailureCollector; +import io.cdap.plugin.common.IdUtils; +import io.cdap.plugin.common.KeyValueListParser; +import io.cdap.plugin.common.ReferencePluginConfig; +import io.cdap.plugin.confluent.common.ConfigValidations; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import javax.annotation.Nullable; + +/** + * Conf for Confluent Kafka streaming sink. + */ +@SuppressWarnings("unused") +public class ConfluentStreamingSinkConfig extends ReferencePluginConfig implements Serializable { + public static final String NAME_BROKERS = "brokers"; + public static final String NAME_ASYNC = "async"; + public static final String NAME_TIME_FIELD = "timeField"; + public static final String NAME_KEY_FIELD = "keyField"; + public static final String NAME_PARTITION_FIELD = "partitionField"; + public static final String NAME_TOPIC = "topic"; + public static final String NAME_CLUSTER_API_KEY = "clusterApiKey"; + public static final String NAME_CLUSTER_API_SECRET = "clusterApiSecret"; + public static final String NAME_SR_URL = "schemaRegistryUrl"; + public static final String NAME_SR_API_KEY = "schemaRegistryApiKey"; + public static final String NAME_SR_API_SECRET = "schemaRegistryApiSecret"; + public static final String NAME_FORMAT = "format"; + public static final String NAME_COMPRESSION_TYPE = "compressionType"; + public static final String NAME_KAFKA_PROPERTIES = "kafkaProperties"; + + public static final Set SUPPORTED_FORMATS = 
ImmutableSet.of("csv", "json"); + + @Name(NAME_BROKERS) + @Description("Specifies the connection string where Producer can find one or more brokers to " + + "determine the leader for each topic") + @Macro + private final String brokers; + + @Name(NAME_ASYNC) + @Description("Specifies whether an acknowledgment is required from broker that message was received. " + + "Default is FALSE") + @Macro + private final Boolean async; + + @Name(NAME_TIME_FIELD) + @Description("Optional name of the field containing the read time of the message. " + + "If this is not set, message will be send with current timestamp. " + + "If set, this field must be present in the input schema and must be a long.") + @Nullable + private final String timeField; + + @Name(NAME_KEY_FIELD) + @Description("Specify the key field to be used in the message. Only String Partitioner is supported.") + @Macro + @Nullable + private final String keyField; + + @Name(NAME_PARTITION_FIELD) + @Description("Optional name of the field containing the partition the message should be written to.\n" + + "If this is not set, default partition will be used for all messages.\n" + + "If set, this field must be present in the schema property and must be an int.") + @Nullable + private final String partitionField; + + @Name(NAME_TOPIC) + @Description("Topic to which message needs to be published") + @Macro + private final String topic; + + @Name(NAME_FORMAT) + @Description("Format a structured record should be converted to") + @Macro + @Nullable + private final String format; + + @Name(NAME_KAFKA_PROPERTIES) + @Description("Additional kafka producer properties to set") + @Macro + @Nullable + private final String kafkaProperties; + + @Name(NAME_COMPRESSION_TYPE) + @Description("Compression type to be applied on message") + @Macro + private final String compressionType; + + @Name(NAME_CLUSTER_API_KEY) + @Description("The Confluent API Key.") + @Macro + private final String clusterApiKey; + + @Name(NAME_CLUSTER_API_SECRET) + 
@Description("The Confluent API Secret.") + @Macro + private final String clusterApiSecret; + + @Name(NAME_SR_URL) + @Description("The Schema Registry endpoint URL.") + @Macro + @Nullable + private final String schemaRegistryUrl; + + @Name(NAME_SR_API_KEY) + @Description("The Schema Registry API Key.") + @Macro + @Nullable + private final String schemaRegistryApiKey; + + @Name(NAME_SR_API_SECRET) + @Description("The Schema Registry API Secret.") + @Macro + @Nullable + private final String schemaRegistryApiSecret; + + public ConfluentStreamingSinkConfig( + String referenceName, + String brokers, + Boolean async, + @Nullable String timeField, + @Nullable String keyField, + @Nullable String partitionField, + String topic, + @Nullable String format, + @Nullable String kafkaProperties, + String compressionType, + String clusterApiKey, + String clusterApiSecret, + @Nullable String schemaRegistryUrl, + @Nullable String schemaRegistryApiKey, + @Nullable String schemaRegistryApiSecret + ) { + super(referenceName); + this.brokers = brokers; + this.async = async; + this.timeField = timeField; + this.keyField = keyField; + this.partitionField = partitionField; + this.topic = topic; + this.format = format; + this.kafkaProperties = kafkaProperties; + this.compressionType = compressionType; + this.clusterApiKey = clusterApiKey; + this.clusterApiSecret = clusterApiSecret; + this.schemaRegistryUrl = schemaRegistryUrl; + this.schemaRegistryApiKey = schemaRegistryApiKey; + this.schemaRegistryApiSecret = schemaRegistryApiSecret; + } + + public void validate(Schema inputSchema, FailureCollector collector) { + IdUtils.validateReferenceName(referenceName, collector); + + // brokers can be null since it is macro enabled. 
+ if (!containsMacro(NAME_BROKERS)) { + ConfigValidations.validateBrokers(brokers, NAME_BROKERS, collector); + } + + if (!containsMacro(NAME_CLUSTER_API_KEY) && Strings.isNullOrEmpty(clusterApiKey)) { + collector.addFailure("Cluster API Key must be provided.", null) + .withConfigProperty(NAME_CLUSTER_API_KEY); + } + + if (!containsMacro(NAME_CLUSTER_API_SECRET) && Strings.isNullOrEmpty(clusterApiSecret)) { + collector.addFailure("Cluster API Secret must be provided.", null) + .withConfigProperty(NAME_CLUSTER_API_SECRET); + } + + if (!Strings.isNullOrEmpty(schemaRegistryUrl)) { + if (!Strings.isNullOrEmpty(format)) { + collector.addFailure("Message Format may not be used with Schema Registry.", null) + .withConfigProperty(NAME_SR_URL) + .withConfigProperty(NAME_FORMAT); + } + if (!containsMacro(NAME_SR_API_KEY) && Strings.isNullOrEmpty(clusterApiKey)) { + collector.addFailure("Schema Registry API Key must be provided.", null) + .withConfigProperty(NAME_SR_API_KEY); + } + if (!containsMacro(NAME_SR_API_SECRET) && Strings.isNullOrEmpty(clusterApiSecret)) { + collector.addFailure("Schema Registry API Secret must be provided.", null) + .withConfigProperty(NAME_SR_API_SECRET); + } + Schema messageSchema = getMessageSchema(inputSchema, collector); + List messageFields = messageSchema.getFields(); + if (messageFields.size() > 1) { + for (Schema.Field messageField : messageFields) { + collector.addFailure( + "Using Schema Registry, the schema must contain just a single message field.", + String.format("Remove field '%s'.", messageField.getName())) + .withInputSchemaField(messageField.getName()).withConfigProperty(NAME_FORMAT); + } + } + } else if (!Strings.isNullOrEmpty(format)) { + Schema messageSchema = getMessageSchema(inputSchema, collector); + if (!SUPPORTED_FORMATS.contains(format.toLowerCase())) { + String supportedFormatsString = String.join(",", SUPPORTED_FORMATS); + collector.addFailure(String.format( + "Unsupported message format '%s'. 
Supported formats are: '%s'.", format, supportedFormatsString), null) + .withConfigProperty(NAME_FORMAT); + } + } else if (!containsMacro(NAME_SR_URL) && !containsMacro(NAME_FORMAT)) { + // if format is empty, there must be just a single message field of type bytes or nullable types. + Schema messageSchema = getMessageSchema(inputSchema, collector); + List messageFields = messageSchema.getFields(); + if (messageFields.size() > 1) { + for (Schema.Field messageField : messageFields) { + collector.addFailure( + "Without a format, the schema must contain just a single message field of type bytes or nullable bytes.", + String.format("Remove field '%s'.", messageField.getName())) + .withInputSchemaField(messageField.getName()).withConfigProperty(NAME_FORMAT); + } + } + + Schema.Field messageField = messageFields.get(0); + Schema messageFieldSchema = messageField.getSchema().isNullable() ? messageField.getSchema().getNonNullable() : + messageField.getSchema(); + Schema.Type messageFieldType = messageFieldSchema.getType(); + if (messageFieldType != Schema.Type.BYTES || messageFieldSchema.getLogicalType() != null) { + collector.addFailure( + String.format("Without a format, the message field must be of type bytes or nullable bytes, " + + "but field '%s' is of type '%s'.", + messageField.getName(), messageField.getSchema().getDisplayName()), null) + .withInputSchemaField(messageField.getName()).withConfigProperty(NAME_FORMAT); + } + } + } + + public String getBrokers() { + return brokers; + } + + public Boolean getAsync() { + return async; + } + + public String getTopic() { + return topic; + } + + public Map getKafkaProperties() { + Map conf = new HashMap<>(); + if (!Strings.isNullOrEmpty(kafkaProperties)) { + KeyValueListParser kvParser = new KeyValueListParser("\\s*,\\s*", ":"); + for (KeyValue keyVal : kvParser.parse(kafkaProperties)) { + conf.put(keyVal.getKey(), keyVal.getValue()); + } + } + return conf; + } + + + public String getCompressionType() { + return 
compressionType; + } + + public String getClusterApiKey() { + return clusterApiKey; + } + + public String getClusterApiSecret() { + return clusterApiSecret; + } + + @Nullable + public String getSchemaRegistryUrl() { + return schemaRegistryUrl; + } + + @Nullable + public String getSchemaRegistryApiKey() { + return schemaRegistryApiKey; + } + + @Nullable + public String getSchemaRegistryApiSecret() { + return schemaRegistryApiSecret; + } + + public String getFormat() { + return format; + } + + @Nullable + public String getTimeField() { + return timeField; + } + + @Nullable + public String getKeyField() { + return keyField; + } + + @Nullable + public String getPartitionField() { + return partitionField; + } + + // gets the message schema from the schema field. If the key or partition fields are in the configured + // schema, they will be removed. + public Schema getMessageSchema(Schema schema) { + List messageFields = schema.getFields() + .stream() + .filter(field -> !field.getName().equals(keyField) && !field.getName().equals(partitionField) + && !field.getName().equals(timeField)) + .collect(Collectors.toList()); + if (messageFields.isEmpty()) { + throw new IllegalArgumentException("Schema must contain at least one message field"); + } + return Schema.recordOf("kafka.message", messageFields); + } + + // gets the message schema from the schema field. If the key, or partition fields are in the configured + // schema, they will be removed. + public Schema getMessageSchema(Schema schema, FailureCollector collector) { + List messageFields = new ArrayList<>(); + boolean timeFieldExists = false; + boolean keyFieldExists = false; + boolean partitionFieldExists = false; + + for (Schema.Field field : schema.getFields()) { + String fieldName = field.getName(); + Schema fieldSchema = field.getSchema().isNullable() ? 
field.getSchema().getNonNullable() : field.getSchema(); + Schema.Type fieldType = fieldSchema.getType(); + if (fieldName.equals(timeField)) { + if (fieldType != Schema.Type.LONG || fieldSchema.getLogicalType() != null) { + collector.addFailure("The time field must be of type long or nullable long.", null) + .withConfigProperty(NAME_TIME_FIELD).withOutputSchemaField(timeField); + } + timeFieldExists = true; + } else if (fieldName.equals(keyField)) { + if ((fieldType != Schema.Type.STRING && fieldType != Schema.Type.BYTES) + || fieldSchema.getLogicalType() != null) { + collector.addFailure("The key field must be of type bytes, nullable bytes, string, nullable string.", null) + .withConfigProperty(NAME_KEY_FIELD) + .withInputSchemaField(keyField); + } + keyFieldExists = true; + } else if (fieldName.equals(partitionField)) { + if (fieldType != Schema.Type.INT || fieldSchema.getLogicalType() != null) { + collector.addFailure("The partition field must be of type int.", null) + .withConfigProperty(NAME_PARTITION_FIELD) + .withInputSchemaField(partitionField); + } + partitionFieldExists = true; + } else { + messageFields.add(field); + } + } + + if (!Strings.isNullOrEmpty(timeField) && !timeFieldExists) { + collector.addFailure(String.format("Time field '%s' must exist in schema.", timeField), null) + .withConfigProperty(NAME_TIME_FIELD); + } + if (!Strings.isNullOrEmpty(keyField) && !keyFieldExists) { + collector.addFailure(String.format("Key field '%s' must exist in schema.", keyField), null) + .withConfigProperty(NAME_KEY_FIELD); + } + if (!Strings.isNullOrEmpty(partitionField) && !partitionFieldExists) { + collector.addFailure(String.format("Partition field '%s' must exist in schema.", partitionField), null) + .withConfigProperty(NAME_PARTITION_FIELD); + } + + if (messageFields.isEmpty()) { + collector.addFailure("Schema must contain at least one message field.", null); + throw collector.getOrThrowException(); + } + return Schema.recordOf("kafka.message", 
messageFields); + } +} diff --git a/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/streaming/sink/ConfluentStreamingSinkUtil.java b/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/streaming/sink/ConfluentStreamingSinkUtil.java new file mode 100644 index 0000000..06d1148 --- /dev/null +++ b/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/streaming/sink/ConfluentStreamingSinkUtil.java @@ -0,0 +1,77 @@ +/* + * Copyright © 2019 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package io.cdap.plugin.confluent.streaming.sink; + +import com.google.common.base.Strings; +import io.confluent.kafka.serializers.KafkaAvroSerializer; +import org.apache.kafka.clients.CommonClientConfigs; +import org.apache.kafka.clients.producer.ProducerConfig; +import org.apache.kafka.common.config.SaslConfigs; +import org.apache.kafka.common.config.SslConfigs; +import org.apache.kafka.common.serialization.ByteArraySerializer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.HashMap; +import java.util.Map; +import javax.annotation.Nonnull; + +/** + * Util method for {@link ConfluentStreamingSink}. + * + * This class contains methods for {@link ConfluentStreamingSink} that require spark classes because during validation + * spark classes are not available. Refer CDAP-15912 for more information. 
+ */ +final class ConfluentStreamingSinkUtil { + private static final Logger LOG = LoggerFactory.getLogger(ConfluentStreamingSinkUtil.class); + + @Nonnull + public static Map getProducerParams(ConfluentStreamingSinkConfig conf, String pipelineName) { + Map kafkaParams = new HashMap<>(); + kafkaParams.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, conf.getBrokers()); + // Spark saves the offsets in checkpoints, no need for Kafka to save them + kafkaParams.put(SslConfigs.SSL_ENDPOINT_IDENTIFICATION_ALGORITHM_CONFIG, "https"); + kafkaParams.put(CommonClientConfigs.RETRY_BACKOFF_MS_CONFIG, "500"); + kafkaParams.put(CommonClientConfigs.SECURITY_PROTOCOL_CONFIG, "SASL_SSL"); + kafkaParams.put(SaslConfigs.SASL_MECHANISM, "PLAIN"); + kafkaParams.put(SaslConfigs.SASL_JAAS_CONFIG, "org.apache.kafka.common.security.plain.PlainLoginModule required " + + "username=" + conf.getClusterApiKey() + " password=" + conf.getClusterApiSecret() + ";"); + + if (!Strings.isNullOrEmpty(conf.getSchemaRegistryUrl())) { + kafkaParams.put("schema.registry.url", conf.getSchemaRegistryUrl()); + kafkaParams.put("basic.auth.credentials.source", "USER_INFO"); + kafkaParams.put("schema.registry.basic.auth.user.info", + conf.getSchemaRegistryApiKey() + ":" + conf.getSchemaRegistryApiSecret()); + kafkaParams.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, KafkaAvroSerializer.class.getCanonicalName()); + kafkaParams.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, KafkaAvroSerializer.class.getCanonicalName()); + } else { + kafkaParams.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class.getCanonicalName()); + kafkaParams.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class.getCanonicalName()); + } + kafkaParams.put(ProducerConfig.COMPRESSION_TYPE_CONFIG, conf.getCompressionType()); + if (conf.getAsync()) { + kafkaParams.put(ProducerConfig.ACKS_CONFIG, "1"); + } + kafkaParams.put(CommonClientConfigs.REQUEST_TIMEOUT_MS_CONFIG, "20000"); + 
kafkaParams.putAll(conf.getKafkaProperties()); + return kafkaParams; + } + + private ConfluentStreamingSinkUtil() { + // no-op + } +} diff --git a/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/streaming/sink/StructuredToProducerRecordTransformer.java b/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/streaming/sink/StructuredToProducerRecordTransformer.java new file mode 100644 index 0000000..ec1f854 --- /dev/null +++ b/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/streaming/sink/StructuredToProducerRecordTransformer.java @@ -0,0 +1,150 @@ +/* + * Copyright © 2019 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package io.cdap.plugin.confluent.streaming.sink; + +import com.google.common.base.Strings; +import com.google.common.collect.Lists; +import io.cdap.cdap.api.data.format.StructuredRecord; +import io.cdap.cdap.api.data.schema.Schema; +import io.cdap.cdap.format.StructuredRecordStringConverter; +import io.cdap.plugin.format.avro.StructuredToAvroTransformer; +import org.apache.commons.lang3.StringUtils; +import org.apache.kafka.clients.producer.ProducerRecord; + +import java.io.IOException; +import java.io.Serializable; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.List; + +/** + * Create ProducerRecords from StructuredRecords. 
+ */ +public class StructuredToProducerRecordTransformer implements Serializable { + + private static final long serialVersionUID = 1025599828133261290L; + + private final ConfluentStreamingSinkConfig conf; + private final ConverterFunction keyExtractor; + private final ConverterFunction valueExtractor; + private final ConverterFunction partitionExtractor; + private final ConverterFunction timeExtractor; + + public StructuredToProducerRecordTransformer(ConfluentStreamingSinkConfig conf, Schema outputSchema) { + this.conf = conf; + keyExtractor = createKeyExtractor(); + valueExtractor = createValueExtractor(outputSchema); + partitionExtractor = createPartitionExtractor(); + timeExtractor = createTimeExtractor(); + } + + public ProducerRecord transform(StructuredRecord record) throws IOException { + Object key = keyExtractor.apply(record); + Object body = valueExtractor.apply(record); + Integer partition = partitionExtractor.apply(record); + Long time = timeExtractor.apply(record); + return new ProducerRecord<>(conf.getTopic(), partition, time, key, body); + } + + private ConverterFunction createKeyExtractor() { + if (Strings.isNullOrEmpty(conf.getKeyField())) { + return record -> null; + } + return record -> { + Object key = record.get(conf.getKeyField()); + if (conf.getSchemaRegistryUrl() != null) { + return key; + } + if (key == null) { + return null; + } + if (key instanceof String) { + return getUtf8Bytes((String) key); + } + if (key instanceof byte[]) { + return (byte[]) key; + } + if (key instanceof ByteBuffer) { + return ((ByteBuffer) key).array(); + } + return getUtf8Bytes(String.valueOf(key)); + }; + } + + private ConverterFunction createPartitionExtractor() { + if (Strings.isNullOrEmpty(conf.getPartitionField())) { + return record -> null; + } + return record -> record.get(conf.getPartitionField()); + } + + private ConverterFunction createTimeExtractor() { + if (Strings.isNullOrEmpty(conf.getTimeField())) { + return record -> null; + } + return record -> 
record.get(conf.getTimeField()); + } + + private ConverterFunction createValueExtractor(Schema outputSchema) { + if (conf.getSchemaRegistryUrl() != null) { + Schema.Field messageField = outputSchema.getFields().get(0); + return input -> { + StructuredRecord messageRecord = input.get(messageField.getName()); + StructuredToAvroTransformer transformer = new StructuredToAvroTransformer(messageRecord.getSchema()); + return transformer.transform(messageRecord); + }; + } + if (Strings.isNullOrEmpty(conf.getFormat())) { + Schema.Field messageField = outputSchema.getFields().get(0); + return input -> input.get(messageField.getName()); + } + if ("json".equalsIgnoreCase(conf.getFormat())) { + return input -> { + StructuredRecord.Builder recordBuilder = StructuredRecord.builder(outputSchema); + for (Schema.Field field : outputSchema.getFields()) { + recordBuilder.set(field.getName(), input.get(field.getName())); + } + StructuredRecord outputRecord = recordBuilder.build(); + return getUtf8Bytes(StructuredRecordStringConverter.toJsonString(outputRecord)); + }; + } + if ("csv".equalsIgnoreCase(conf.getFormat())) { + return input -> { + List objs = getExtractedValues(input, outputSchema.getFields()); + return getUtf8Bytes(StringUtils.join(objs, ",")); + }; + } + throw new IllegalStateException(String.format("Unsupported message format '%s'", conf.getFormat())); + } + + private byte[] getUtf8Bytes(String text) { + return text.getBytes(StandardCharsets.UTF_8); + } + + private List getExtractedValues(StructuredRecord input, List fields) { + // Extract all values from the structured record + List objs = Lists.newArrayList(); + for (Schema.Field field : fields) { + objs.add(input.get(field.getName())); + } + return objs; + } + + private interface ConverterFunction extends Serializable { + R apply(T t) throws IOException; + } +} diff --git a/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/streaming/source/ConfluentStreamingSource.java 
b/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/streaming/source/ConfluentStreamingSource.java new file mode 100644 index 0000000..78de4cc --- /dev/null +++ b/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/streaming/source/ConfluentStreamingSource.java @@ -0,0 +1,141 @@ +/* + * Copyright © 2019 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package io.cdap.plugin.confluent.streaming.source; + +import io.cdap.cdap.api.annotation.Description; +import io.cdap.cdap.api.annotation.Name; +import io.cdap.cdap.api.annotation.Plugin; +import io.cdap.cdap.api.data.format.StructuredRecord; +import io.cdap.cdap.api.data.schema.Schema; +import io.cdap.cdap.api.dataset.DatasetProperties; +import io.cdap.cdap.etl.api.FailureCollector; +import io.cdap.cdap.etl.api.PipelineConfigurer; +import io.cdap.cdap.etl.api.StageConfigurer; +import io.cdap.cdap.etl.api.streaming.StreamingContext; +import io.cdap.cdap.etl.api.streaming.StreamingSource; +import io.cdap.plugin.common.Constants; +import io.confluent.kafka.schemaregistry.client.CachedSchemaRegistryClient; +import io.confluent.kafka.schemaregistry.client.SchemaMetadata; +import io.confluent.kafka.schemaregistry.client.rest.exceptions.RestClientException; +import org.apache.spark.streaming.api.java.JavaDStream; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Confluent 
Kafka Streaming source. + */ +@Plugin(type = StreamingSource.PLUGIN_TYPE) +@Name(ConfluentStreamingSource.PLUGIN_NAME) +@Description("Confluent Kafka streaming source.") +public class ConfluentStreamingSource extends StreamingSource { + public static final String PLUGIN_NAME = "Confluent"; + + private final ConfluentStreamingSourceConfig conf; + + public ConfluentStreamingSource(ConfluentStreamingSourceConfig conf) { + this.conf = conf; + } + + @Override + public void configurePipeline(PipelineConfigurer pipelineConfigurer) { + StageConfigurer stageConfigurer = pipelineConfigurer.getStageConfigurer(); + FailureCollector collector = stageConfigurer.getFailureCollector(); + + conf.validate(collector); + Schema schema = getOutputSchema(collector); + stageConfigurer.setOutputSchema(schema); + + if (conf.getMaxRatePerPartition() != null && conf.getMaxRatePerPartition() > 0) { + Map pipelineProperties = new HashMap<>(); + pipelineProperties.put("spark.streaming.kafka.maxRatePerPartition", conf.getMaxRatePerPartition().toString()); + pipelineConfigurer.setPipelineProperties(pipelineProperties); + } + pipelineConfigurer.createDataset(conf.referenceName, Constants.EXTERNAL_DATASET_TYPE, DatasetProperties.EMPTY); + } + + @Override + public JavaDStream getStream(StreamingContext context) throws Exception { + FailureCollector collector = context.getFailureCollector(); + conf.validate(collector); + Schema outputSchema = getOutputSchema(collector); + collector.getOrThrowException(); + + context.registerLineage(conf.referenceName); + return ConfluentStreamingSourceUtil.getStructuredRecordJavaDStream(context, conf, outputSchema, collector); + } + + private Schema getOutputSchema(FailureCollector failureCollector) { + if (conf.getSchemaRegistryUrl() == null) { + return conf.getSchema(failureCollector); + } + return inferSchema(failureCollector); + } + + private Schema inferSchema(FailureCollector failureCollector) { + try { + Map options = new HashMap<>(); + 
options.put("basic.auth.credentials.source", "USER_INFO"); + options.put("basic.auth.user.info", conf.getSchemaRegistryApiKey() + ':' + conf.getSchemaRegistryApiSecret()); + CachedSchemaRegistryClient schemaRegistryClient = + new CachedSchemaRegistryClient(conf.getSchemaRegistryUrl(), 2, options); + Schema initialSchema = conf.getSchema(failureCollector); + List newFields = new ArrayList<>(); + boolean keySchemaShouldBeAdded = conf.getKeyField() != null; + boolean messageSchemaShouldBeAdded = conf.getValueField() != null; + for (Schema.Field field : initialSchema.getFields()) { + if (field.getName().equals(conf.getKeyField())) { + Schema keySchema = fetchSchema(schemaRegistryClient, conf.getTopic() + "-key"); + newFields.add(Schema.Field.of(field.getName(), keySchema)); + keySchemaShouldBeAdded = false; + } else if (field.getName().equals(conf.getValueField())) { + Schema valueSchema = fetchSchema(schemaRegistryClient, conf.getTopic() + "-value"); + newFields.add(Schema.Field.of(field.getName(), valueSchema)); + messageSchemaShouldBeAdded = false; + } else { + newFields.add(field); + } + } + if (keySchemaShouldBeAdded) { + Schema keySchema = fetchSchema(schemaRegistryClient, conf.getTopic() + "-key"); + newFields.add(Schema.Field.of(conf.getKeyField(), keySchema)); + } + if (messageSchemaShouldBeAdded) { + Schema valueSchema = fetchSchema(schemaRegistryClient, conf.getTopic() + "-value"); + newFields.add(Schema.Field.of(conf.getValueField(), valueSchema)); + } + return Schema.recordOf(initialSchema.getRecordName(), newFields); + } catch (IOException | RestClientException e) { + failureCollector.addFailure("Failed to infer output schema. 
Reason: " + e.getMessage(), null) + .withStacktrace(e.getStackTrace()); + throw failureCollector.getOrThrowException(); + } + } + + private Schema fetchSchema(CachedSchemaRegistryClient schemaRegistryClient, String subject) + throws IOException, RestClientException { + SchemaMetadata schemaMetadata = schemaRegistryClient.getLatestSchemaMetadata(subject); + if (schemaMetadata.getSchema().startsWith("\"")) { + String typeName = schemaMetadata.getSchema().substring(1, schemaMetadata.getSchema().length() - 1); + return Schema.of(Schema.Type.valueOf(typeName.toUpperCase())); + } + return Schema.parseJson(schemaMetadata.getSchema()); + } +} diff --git a/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/streaming/source/ConfluentStreamingSourceConfig.java b/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/streaming/source/ConfluentStreamingSourceConfig.java new file mode 100644 index 0000000..19baa86 --- /dev/null +++ b/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/streaming/source/ConfluentStreamingSourceConfig.java @@ -0,0 +1,587 @@ +/* + * Copyright © 2019 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package io.cdap.plugin.confluent.streaming.source; + +import com.google.common.base.Splitter; +import com.google.common.base.Strings; +import io.cdap.cdap.api.annotation.Description; +import io.cdap.cdap.api.annotation.Macro; +import io.cdap.cdap.api.annotation.Name; +import io.cdap.cdap.api.data.format.FormatSpecification; +import io.cdap.cdap.api.data.schema.Schema; +import io.cdap.cdap.api.dataset.lib.KeyValue; +import io.cdap.cdap.etl.api.FailureCollector; +import io.cdap.cdap.format.RecordFormats; +import io.cdap.plugin.common.IdUtils; +import io.cdap.plugin.common.KeyValueListParser; +import io.cdap.plugin.common.ReferencePluginConfig; +import io.cdap.plugin.confluent.common.ConfigValidations; +import org.apache.kafka.common.TopicPartition; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import javax.annotation.Nullable; + +/** + * Conf for Confluent Kafka streaming source. 
+ */ +@SuppressWarnings("unused") +public class ConfluentStreamingSourceConfig extends ReferencePluginConfig implements Serializable { + public static final String NAME_SCHEMA = "schema"; + public static final String NAME_BROKERS = "brokers"; + public static final String NAME_TOPIC = "topic"; + public static final String NAME_PARTITIONS = "partitions"; + public static final String NAME_MAX_RATE = "maxRatePerPartition"; + public static final String NAME_INITIAL_PARTITION_OFFSETS = "initialPartitionOffsets"; + public static final String NAME_DEFAULT_INITIAL_OFFSET = "defaultInitialOffset"; + public static final String NAME_TIMEFIELD = "timeField"; + public static final String NAME_KEYFIELD = "keyField"; + public static final String NAME_PARTITION_FIELD = "partitionField"; + public static final String NAME_OFFSET_FIELD = "offsetField"; + public static final String NAME_CLUSTER_API_KEY = "clusterApiKey"; + public static final String NAME_CLUSTER_API_SECRET = "clusterApiSecret"; + public static final String NAME_SR_URL = "schemaRegistryUrl"; + public static final String NAME_SR_API_KEY = "schemaRegistryApiKey"; + public static final String NAME_SR_API_SECRET = "schemaRegistryApiSecret"; + public static final String NAME_VALUE_FIELD = "valueField"; + public static final String NAME_FORMAT = "format"; + public static final String NAME_KAFKA_PROPERTIES = "kafkaProperties"; + + private static final String SEPARATOR = ":"; + + private static final long serialVersionUID = 8069169417140954175L; + + @Name(NAME_BROKERS) + @Description("List of Kafka brokers specified in host1:port1,host2:port2 form. For example, " + + "host1.example.com:9092,host2.example.com:9092.") + @Macro + private final String brokers; + + @Name(NAME_TOPIC) + @Description("Kafka topic to read from.") + @Macro + private final String topic; + + @Name(NAME_PARTITIONS) + @Description("The topic partitions to read from. 
If not specified, all partitions will be read.") + @Nullable + @Macro + private final String partitions; + + @Name(NAME_INITIAL_PARTITION_OFFSETS) + @Description("The initial offset for each topic partition. If this is not specified, " + + "all partitions will have the same initial offset, which is determined by the defaultInitialOffset property. " + + "An offset of -2 means the smallest offset. An offset of -1 means the latest offset. " + + "Offsets are inclusive. If an offset of 5 is used, the message at offset 5 will be read.") + @Nullable + @Macro + private final String initialPartitionOffsets; + + @Name(NAME_DEFAULT_INITIAL_OFFSET) + @Description("The default initial offset for all topic partitions. " + + "An offset of -2 means the smallest offset. An offset of -1 means the latest offset. Defaults to -1. " + + "Offsets are inclusive. If an offset of 5 is used, the message at offset 5 will be read. " + + "If you wish to set different initial offsets for different partitions, use the Initial Partition Offsets" + + " property.") + @Nullable + @Macro + private final Long defaultInitialOffset; + + @Name(NAME_SCHEMA) + @Description("Output schema of the source, including the timeField and keyField. " + + "The fields excluding the timeField and keyField are used in conjunction with the format " + + "to parse Kafka payloads.") + private final String schema; + + @Name(NAME_FORMAT) + @Description("Optional format of the Kafka event. Any format supported by CDAP is supported. " + + "For example, a value of 'csv' will attempt to parse Kafka payloads as comma-separated values. " + + "If no format is given, Kafka message payloads will be treated as bytes.") + @Nullable + private final String format; + + @Name(NAME_TIMEFIELD) + @Description("Optional name of the field containing the read time of the batch. " + + "If this is not set, no time field will be added to output records. 
" + + "If set, this field must be present in the schema property and must be a long.") + @Nullable + private final String timeField; + + @Name(NAME_KEYFIELD) + @Description("Optional name of the field containing the message key. " + + "If this is not set, no key field will be added to output records. " + + "If set, this field must be present in the schema property and must be bytes.") + @Nullable + private final String keyField; + + @Name(NAME_PARTITION_FIELD) + @Description("Optional name of the field containing the kafka partition that was read from. " + + "If this is not set, no partition field will be added to output records. " + + "If set, this field must be present in the schema property and must be an integer.") + @Nullable + private final String partitionField; + + @Name(NAME_OFFSET_FIELD) + @Description("Optional name of the field containing the kafka offset that the message was read from. " + + "If this is not set, no offset field will be added to output records. " + + "If set, this field must be present in the schema property and must be a long.") + @Nullable + private final String offsetField; + + @Name(NAME_MAX_RATE) + @Description("Max number of records to read per second per partition. 0 means there is no limit. 
Defaults to 1000.") + @Nullable + private final Integer maxRatePerPartition; + + @Name(NAME_KAFKA_PROPERTIES) + @Description("Additional kafka consumer properties to set.") + @Macro + @Nullable + private final String kafkaProperties; + + @Name(NAME_CLUSTER_API_KEY) + @Description("The Confluent API Key.") + @Macro + private final String clusterApiKey; + + @Name(NAME_CLUSTER_API_SECRET) + @Description("The Confluent API Secret.") + @Macro + private final String clusterApiSecret; + + @Name(NAME_SR_URL) + @Description("The Schema Registry endpoint URL.") + @Macro + @Nullable + private final String schemaRegistryUrl; + + @Name(NAME_SR_API_KEY) + @Description("The Schema Registry API Key.") + @Macro + @Nullable + private final String schemaRegistryApiKey; + + @Name(NAME_SR_API_SECRET) + @Description("The Schema Registry API Secret.") + @Macro + @Nullable + private final String schemaRegistryApiSecret; + + @Name(NAME_VALUE_FIELD) + @Description("Name of the field containing the message payload. Required when Schema Registry is used." 
+ + "This field will be used to infer schema from Schema Registry.") + @Macro + @Nullable + private final String valueField; + + public ConfluentStreamingSourceConfig( + String referenceName, + String brokers, + String topic, + @Nullable String partitions, + @Nullable String initialPartitionOffsets, + @Nullable Long defaultInitialOffset, + String schema, + @Nullable String format, + @Nullable String timeField, + @Nullable String keyField, + @Nullable String partitionField, + @Nullable String offsetField, + @Nullable Integer maxRatePerPartition, + @Nullable String kafkaProperties, + String clusterApiKey, + String clusterApiSecret, + @Nullable String schemaRegistryUrl, + @Nullable String schemaRegistryApiKey, + @Nullable String schemaRegistryApiSecret, + @Nullable String valueField) { + super(referenceName); + this.brokers = brokers; + this.topic = topic; + this.partitions = partitions; + this.initialPartitionOffsets = initialPartitionOffsets; + this.defaultInitialOffset = defaultInitialOffset; + this.schema = schema; + this.format = format; + this.timeField = timeField; + this.keyField = keyField; + this.partitionField = partitionField; + this.offsetField = offsetField; + this.maxRatePerPartition = maxRatePerPartition; + this.kafkaProperties = kafkaProperties; + this.clusterApiKey = clusterApiKey; + this.clusterApiSecret = clusterApiSecret; + this.schemaRegistryUrl = schemaRegistryUrl; + this.schemaRegistryApiKey = schemaRegistryApiKey; + this.schemaRegistryApiSecret = schemaRegistryApiSecret; + this.valueField = valueField; + } + + public String getTopic() { + return topic; + } + + public String getBrokers() { + return brokers; + } + + @Nullable + public String getTimeField() { + return getNullableProperty(timeField); + } + + @Nullable + public String getKeyField() { + return getNullableProperty(keyField); + } + + @Nullable + public String getPartitionField() { + return getNullableProperty(partitionField); + } + + @Nullable + public String getOffsetField() { + 
return getNullableProperty(offsetField); + } + + @Nullable + public String getFormat() { + return getNullableProperty(format); + } + + @Nullable + public Integer getMaxRatePerPartition() { + return maxRatePerPartition; + } + + @Nullable + public Schema getSchema() { + try { + return Strings.isNullOrEmpty(schema) ? null : Schema.parseJson(schema); + } catch (IOException e) { + throw new IllegalArgumentException("Invalid schema : " + e.getMessage()); + } + } + + @Nullable + public Schema getSchema(FailureCollector collector) { + try { + return Strings.isNullOrEmpty(schema) ? null : Schema.parseJson(schema); + } catch (IOException e) { + collector.addFailure("Invalid schema : " + e.getMessage(), null).withConfigProperty(NAME_SCHEMA); + } + throw collector.getOrThrowException(); + } + + // gets the message schema from the schema field. If the time, key, partition, or offset fields are in the configured + // schema, they will be removed. + public Schema getMessageSchema() { + Schema schema = getSchema(); + List messageFields = schema.getFields() + .stream() + .filter(field -> { + String fieldName = field.getName(); + return !fieldName.equals(timeField) && !fieldName.equals(keyField) && !fieldName.equals(partitionField) + && !fieldName.equals(offsetField); + }) + .collect(Collectors.toList()); + if (messageFields.isEmpty()) { + throw new IllegalArgumentException("Schema must contain at least one message field"); + } + return Schema.recordOf("kafka.message", messageFields); + } + + // gets the message schema from the schema field. If the time, key, partition, or offset fields are in the configured + // schema, they will be removed. 
+  /**
+   * Validates the special fields of the configured output schema and returns the schema of the message payload.
+   * The time, key, partition and offset fields are type-checked and left out of the result; every other field is
+   * treated as a message field. Problems are reported to the collector.
+   *
+   * @param collector failure collector that receives validation failures
+   * @return record schema named 'kafka.message' that contains only the message fields
+   */
+  public Schema getMessageSchema(FailureCollector collector) {
+    Schema schema = getSchema(collector);
+    if (schema == null) {
+      // getSchema(collector) returns null for a null or empty schema property; report it instead of hitting an NPE.
+      collector.addFailure("Output schema must be specified.", null).withConfigProperty(NAME_SCHEMA);
+      throw collector.getOrThrowException();
+    }
+
+    List<Schema.Field> messageFields = new ArrayList<>();
+    boolean timeFieldExists = false;
+    boolean keyFieldExists = false;
+    boolean partitionFieldExists = false;
+    boolean offsetFieldExists = false;
+
+    for (Schema.Field field : schema.getFields()) {
+      String fieldName = field.getName();
+      // Type checks are done on the non-nullable variant so that nullable fields are accepted as well.
+      Schema fieldSchema = field.getSchema().isNullable() ? field.getSchema().getNonNullable() : field.getSchema();
+      Schema.Type fieldType = fieldSchema.getType();
+      // A field that is not the time, key, partition or offset field is a message field.
+      if (fieldName.equals(timeField)) {
+        if (fieldType != Schema.Type.LONG || fieldSchema.getLogicalType() != null) {
+          collector.addFailure("The time field must be of type long or nullable long.", null)
+            .withConfigProperty(NAME_TIMEFIELD).withOutputSchemaField(timeField);
+        }
+        timeFieldExists = true;
+      } else if (fieldName.equals(keyField)) {
+        // The key type is only checked when no Schema Registry URL is configured.
+        if (getSchemaRegistryUrl() == null && ((fieldType != Schema.Type.STRING && fieldType != Schema.Type.BYTES)
+          || fieldSchema.getLogicalType() != null)) {
+          collector.addFailure("The key field must be of type bytes, nullable bytes, string, nullable string.", null)
+            .withConfigProperty(NAME_KEYFIELD).withOutputSchemaField(keyField);
+        }
+        keyFieldExists = true;
+      } else if (fieldName.equals(partitionField)) {
+        if (fieldType != Schema.Type.INT || fieldSchema.getLogicalType() != null) {
+          collector.addFailure("The partition field must be of type int.", null)
+            .withConfigProperty(NAME_PARTITION_FIELD).withOutputSchemaField(partitionField);
+        }
+        partitionFieldExists = true;
+      } else if (fieldName.equals(offsetField)) {
+        if (fieldType != Schema.Type.LONG || fieldSchema.getLogicalType() != null) {
+          collector.addFailure("The offset field must be of type long.", null)
+            .withConfigProperty(NAME_OFFSET_FIELD).withOutputSchemaField(offsetField);
+        }
+        offsetFieldExists = true;
+      } else {
+        messageFields.add(field);
+      }
+    }
+
+    // Every special field that is configured must also be present in the schema.
+    if (getTimeField() != null && !timeFieldExists) {
+      collector.addFailure(String.format("Time field '%s' must exist in schema.", timeField), null)
+        .withConfigProperty(NAME_TIMEFIELD);
+    }
+    if (getKeyField() != null && !keyFieldExists) {
+      collector.addFailure(String.format("Key field '%s' must exist in schema.", keyField), null)
+        .withConfigProperty(NAME_KEYFIELD);
+    }
+    if (getPartitionField() != null && !partitionFieldExists) {
+      collector.addFailure(String.format("Partition field '%s' must exist in schema.", partitionField), null)
+        .withConfigProperty(NAME_PARTITION_FIELD);
+    }
+    if (getOffsetField() != null && !offsetFieldExists) {
+      collector.addFailure(String.format("Offset field '%s' must exist in schema.", offsetField), null)
+        .withConfigProperty(NAME_OFFSET_FIELD);
+    }
+
+    if (messageFields.isEmpty()) {
+      collector.addFailure("Schema must contain at least one message field.", null);
+      throw collector.getOrThrowException();
+    }
+    return Schema.recordOf("kafka.message", messageFields);
+  }
+
+  /**
+   * Get the initial partition offsets for the specified partitions. If an initial offset is specified in the
+   * initialPartitionOffsets property, that value will be used. Otherwise, the defaultInitialOffset will be used,
+   * falling back to -1 (the latest offset) when no default is configured.
+   *
+   * @param partitionsToRead the partitions to read
+   * @param collector failure collector
+   * @return initial partition offsets.
+   */
+  public Map<TopicPartition, Long> getInitialPartitionOffsets(Set<Integer> partitionsToRead,
+                                                              FailureCollector collector) {
+    Map<TopicPartition, Long> partitionOffsets = new HashMap<>();
+
+    // The property is documented as defaulting to -1, so never store a null offset in the map.
+    long defaultOffset = defaultInitialOffset == null ? -1L : defaultInitialOffset;
+    for (Integer partition : partitionsToRead) {
+      partitionOffsets.put(new TopicPartition(topic, partition), defaultOffset);
+    }
+
+    // if initial partition offsets are specified, overwrite the defaults.
+    if (initialPartitionOffsets != null) {
+      for (KeyValue<String, String> partitionAndOffset : KeyValueListParser.DEFAULT.parse(initialPartitionOffsets)) {
+        String partitionStr = partitionAndOffset.getKey();
+        String offsetStr = partitionAndOffset.getValue();
+        int partition;
+        try {
+          partition = Integer.parseInt(partitionStr);
+        } catch (NumberFormatException e) {
+          collector.addFailure(
+            String.format("Invalid partition '%s' in initialPartitionOffsets.", partitionStr),
+            "Partition must be a valid integer.")
+            .withConfigElement(NAME_INITIAL_PARTITION_OFFSETS, partitionStr + SEPARATOR + offsetStr);
+          // Keep going so that every malformed entry is reported in one validation pass.
+          continue;
+        }
+        long offset;
+        try {
+          offset = Long.parseLong(offsetStr);
+        } catch (NumberFormatException e) {
+          collector.addFailure(
+            String.format("Invalid offset '%s' in initialPartitionOffsets for partition %d.", offsetStr, partition),
+            "Offset must be a valid integer.")
+            .withConfigElement(NAME_INITIAL_PARTITION_OFFSETS, partitionStr + SEPARATOR + offsetStr);
+          continue;
+        }
+        partitionOffsets.put(new TopicPartition(topic, partition), offset);
+      }
+    }
+
+    return partitionOffsets;
+  }
+
+  /**
+   * @return set of partitions to read from. Returns an empty set if no partitions were specified.
+   */
+  public Set<Integer> getPartitions(FailureCollector collector) {
+    Set<Integer> partitionSet = new HashSet<>();
+    if (Strings.isNullOrEmpty(partitions)) {
+      return partitionSet;
+    }
+    for (String partition : Splitter.on(',').trimResults().split(partitions)) {
+      try {
+        partitionSet.add(Integer.parseInt(partition));
+      } catch (NumberFormatException e) {
+        // Report the bad element and keep parsing so that all invalid entries surface together.
+        collector.addFailure(String.format("Invalid partition '%s'.", partition), "Partitions must be integers.")
+          .withConfigElement(NAME_PARTITIONS, partition);
+      }
+    }
+    return partitionSet;
+  }
+
+  public String getClusterApiKey() {
+    return clusterApiKey;
+  }
+
+  public String getClusterApiSecret() {
+    return clusterApiSecret;
+  }
+
+  @Nullable
+  public String getSchemaRegistryUrl() {
+    return getNullableProperty(schemaRegistryUrl);
+  }
+
+  @Nullable
+  public String getSchemaRegistryApiKey() {
+    return getNullableProperty(schemaRegistryApiKey);
+  }
+
+  @Nullable
+  public String getSchemaRegistryApiSecret() {
+    return getNullableProperty(schemaRegistryApiSecret);
+  }
+
+  @Nullable
+  public String getValueField() {
+    return getNullableProperty(valueField);
+  }
+
+  /**
+   * Normalizes an optional property so that callers only have to check for null.
+   *
+   * @param property raw property value
+   * @return the value, or null if it is null or empty
+   */
+  @Nullable
+  private String getNullableProperty(String property) {
+    return Strings.isNullOrEmpty(property) ? null : property;
+  }
+
+  /**
+   * Parses the additional Kafka consumer properties, given as comma separated 'key:value' pairs.
+   *
+   * @return the parsed properties; empty if none were configured
+   */
+  public Map<String, String> getKafkaProperties() {
+    Map<String, String> conf = new HashMap<>();
+    if (!Strings.isNullOrEmpty(kafkaProperties)) {
+      KeyValueListParser kvParser = new KeyValueListParser("\\s*,\\s*", ":");
+      for (KeyValue<String, String> keyVal : kvParser.parse(kafkaProperties)) {
+        conf.put(keyVal.getKey(), keyVal.getValue());
+      }
+    }
+    return conf;
+  }
+
+  /**
+   * Validates the configuration and reports every problem found to the collector.
+   *
+   * @param collector failure collector that receives validation failures
+   */
+  public void validate(FailureCollector collector) {
+    IdUtils.validateReferenceName(referenceName, collector);
+    // brokers can be null since it is macro enabled.
+    if (!containsMacro(NAME_BROKERS)) {
+      ConfigValidations.validateBrokers(brokers, NAME_BROKERS, collector);
+    }
+    // Called for their validation side effects; the resulting offsets are not needed here.
+    Set<Integer> partitionsToRead = getPartitions(collector);
+    getInitialPartitionOffsets(partitionsToRead, collector);
+
+    if (maxRatePerPartition == null) {
+      collector.addFailure("Max rate per partition must be provided.", null)
+        .withConfigProperty(NAME_MAX_RATE);
+    } else if (maxRatePerPartition < 0) {
+      collector.addFailure(String.format("Invalid maxRatePerPartition '%d'.", maxRatePerPartition),
+                           "Rate must be 0 or greater.")
+        .withConfigProperty(NAME_MAX_RATE);
+    }
+
+    if (!Strings.isNullOrEmpty(timeField) && !Strings.isNullOrEmpty(keyField) && timeField.equals(keyField)) {
+      collector.addFailure(String.format(
+        "The timeField and keyField cannot both have the same name (%s).", timeField), null)
+        .withConfigProperty(NAME_TIMEFIELD).withConfigProperty(NAME_KEYFIELD);
+    }
+
+    if (!containsMacro(NAME_CLUSTER_API_KEY) && Strings.isNullOrEmpty(clusterApiKey)) {
+      collector.addFailure("Cluster API Key must be provided.", null)
+        .withConfigProperty(NAME_CLUSTER_API_KEY);
+    }
+
+    if (!containsMacro(NAME_CLUSTER_API_SECRET) && Strings.isNullOrEmpty(clusterApiSecret)) {
+      collector.addFailure("Cluster API Secret must be provided.", null)
+        .withConfigProperty(NAME_CLUSTER_API_SECRET);
+    }
+
+    if (!Strings.isNullOrEmpty(schemaRegistryUrl)) {
+      if (!Strings.isNullOrEmpty(format)) {
+        collector.addFailure("Message Format may not be used with Schema Registry.", null)
+          .withConfigProperty(NAME_SR_URL)
+          .withConfigProperty(NAME_FORMAT);
+      }
+      // Check the Schema Registry credentials themselves rather than the cluster credentials.
+      if (!containsMacro(NAME_SR_API_KEY) && Strings.isNullOrEmpty(schemaRegistryApiKey)) {
+        collector.addFailure("Schema Registry API Key must be provided.", null)
+          .withConfigProperty(NAME_SR_API_KEY);
+      }
+      if (!containsMacro(NAME_SR_API_SECRET) && Strings.isNullOrEmpty(schemaRegistryApiSecret)) {
+        collector.addFailure("Schema Registry API Secret must be provided.", null)
+          .withConfigProperty(NAME_SR_API_SECRET);
+      }
+      if (!containsMacro(NAME_VALUE_FIELD) && Strings.isNullOrEmpty(valueField)) {
+        collector.addFailure("Message Field should be provided when Schema Registry is used.", null)
+          .withConfigProperty(NAME_VALUE_FIELD);
+      }
+    } else if (!Strings.isNullOrEmpty(format)) {
+      // it is a format, make sure we can instantiate it.
+      Schema messageSchema = getMessageSchema(collector);
+      FormatSpecification formatSpec = new FormatSpecification(format, messageSchema, new HashMap<>());
+      try {
+        RecordFormats.createInitializedFormat(formatSpec);
+      } catch (Exception e) {
+        collector.addFailure(String.format(
+          "Unable to instantiate a message parser from format '%s': %s",
+          format, e.getMessage()), null).withStacktrace(e.getStackTrace()).withConfigProperty(NAME_FORMAT);
+      }
+    } else if (!containsMacro(NAME_SR_URL) && !containsMacro(NAME_FORMAT)) {
+      // if format is empty, there must be just a single message field of type bytes or nullable bytes.
+      Schema messageSchema = getMessageSchema(collector);
+      List<Schema.Field> messageFields = messageSchema.getFields();
+      if (messageFields.size() > 1) {
+        for (Schema.Field messageField : messageFields) {
+          collector.addFailure(
+            "Without a format, the schema must contain just a single message field of type bytes or nullable bytes.",
+            String.format("Remove field '%s'.", messageField.getName()))
+            .withOutputSchemaField(messageField.getName()).withConfigProperty(NAME_FORMAT);
+        }
+        return;
+      }
+
+      // getMessageSchema(collector) throws when there is no message field, so exactly one field is left here.
+      Schema.Field messageField = messageFields.get(0);
+      Schema messageFieldSchema = messageField.getSchema().isNullable() ? messageField.getSchema().getNonNullable() :
+        messageField.getSchema();
+      Schema.Type messageFieldType = messageFieldSchema.getType();
+      if (messageFieldType != Schema.Type.BYTES || messageFieldSchema.getLogicalType() != null) {
+        collector.addFailure(
+          String.format("Without a format, the message field must be of type bytes or nullable bytes, "
+                          + "but field '%s' is of type '%s'.",
+                        messageField.getName(), messageField.getSchema().getDisplayName()), null)
+          .withOutputSchemaField(messageField.getName()).withConfigProperty(NAME_FORMAT);
+      }
+    }
+  }
+}
diff --git a/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/streaming/source/ConfluentStreamingSourceUtil.java b/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/streaming/source/ConfluentStreamingSourceUtil.java
new file mode 100644
index 0000000..bec9944
--- /dev/null
+++ b/confluent-kafka-plugins/src/main/java/io/cdap/plugin/confluent/streaming/source/ConfluentStreamingSourceUtil.java
@@ -0,0 +1,417 @@
+/*
+ * Copyright © 2019 Cask Data, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */ + +package io.cdap.plugin.confluent.streaming.source; + +import com.google.common.base.Joiner; +import com.google.common.base.Strings; +import com.google.common.collect.Iterables; +import com.google.common.collect.Sets; +import io.cdap.cdap.api.data.format.FormatSpecification; +import io.cdap.cdap.api.data.format.RecordFormat; +import io.cdap.cdap.api.data.format.StructuredRecord; +import io.cdap.cdap.api.data.format.UnexpectedFormatException; +import io.cdap.cdap.api.data.schema.Schema; +import io.cdap.cdap.etl.api.FailureCollector; +import io.cdap.cdap.etl.api.streaming.StreamingContext; +import io.cdap.cdap.format.RecordFormats; +import io.cdap.plugin.confluent.common.KafkaHelpers; +import io.cdap.plugin.format.avro.AvroToStructuredTransformer; +import io.confluent.kafka.serializers.KafkaAvroDeserializer; +import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.clients.CommonClientConfigs; +import org.apache.kafka.clients.consumer.Consumer; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.clients.consumer.KafkaConsumer; +import org.apache.kafka.common.PartitionInfo; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.config.SaslConfigs; +import org.apache.kafka.common.config.SslConfigs; +import org.apache.kafka.common.requests.ListOffsetRequest; +import org.apache.kafka.common.serialization.ByteArrayDeserializer; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.Function2; +import org.apache.spark.streaming.Time; +import org.apache.spark.streaming.api.java.JavaDStream; +import org.apache.spark.streaming.kafka010.ConsumerStrategies; +import org.apache.spark.streaming.kafka010.KafkaUtils; +import org.apache.spark.streaming.kafka010.LocationStrategies; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.ByteBuffer; 
+import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import javax.annotation.Nonnull; + +/** + * Util method for {@link ConfluentStreamingSource}. + *

+ * This class contains methods for {@link ConfluentStreamingSource} that require spark classes because during validation
+ * spark classes are not available. Refer CDAP-15912 for more information.
+ */
+final class ConfluentStreamingSourceUtil {
+  private static final Logger LOG = LoggerFactory.getLogger(ConfluentStreamingSourceUtil.class);
+
+  private ConfluentStreamingSourceUtil() {
+    // no-op
+  }
+
+  /**
+   * Returns {@link JavaDStream} for {@link ConfluentStreamingSource}.
+   *
+   * @param context streaming context
+   * @param conf kafka conf
+   * @param outputSchema source output schema
+   * @param collector failure collector
+   * @return stream of structured records built from the consumed Kafka records
+   */
+  static JavaDStream<StructuredRecord> getStructuredRecordJavaDStream(
+    StreamingContext context, ConfluentStreamingSourceConfig conf, Schema outputSchema, FailureCollector collector) {
+    String pipelineName = context.getPipelineName();
+    Map<String, Object> kafkaParams = getConsumerParams(conf, pipelineName);
+    Properties properties = new Properties();
+    properties.putAll(kafkaParams);
+    // This consumer is only used to look up partitions and offsets; it is closed before the stream is returned.
+    try (Consumer<byte[], byte[]> consumer = new KafkaConsumer<>(properties, new ByteArrayDeserializer(),
+                                                                 new ByteArrayDeserializer())) {
+      Map<TopicPartition, Long> offsets = getOffsets(conf, collector, consumer);
+      LOG.info("Using initial offsets {}", offsets);
+
+      if (conf.getSchemaRegistryUrl() != null) {
+        AvroRecordTransform transform = new AvroRecordTransform(conf, outputSchema);
+        return createKafkaDirectStream(context, conf, kafkaParams, offsets, transform);
+      }
+      return createKafkaDirectStream(context, conf, kafkaParams, offsets, new RecordTransform(conf, outputSchema));
+    }
+  }
+
+  /**
+   * Creates a direct stream subscribed to the configured topic and applies the given transform to every batch.
+   */
+  private static <K, V> JavaDStream<StructuredRecord> createKafkaDirectStream(
+    StreamingContext context,
+    ConfluentStreamingSourceConfig conf,
+    Map<String, Object> kafkaParams,
+    Map<TopicPartition, Long> offsets,
+    Function2<JavaRDD<ConsumerRecord<K, V>>, Time, JavaRDD<StructuredRecord>> transform
+  ) {
+    return KafkaUtils.<K, V>createDirectStream(
+      context.getSparkStreamingContext(), LocationStrategies.PreferConsistent(),
+      ConsumerStrategies.<K, V>Subscribe(Collections.singleton(conf.getTopic()), kafkaParams, offsets)
+    ).transform(transform);
+  }
+
+  /**
+   * Builds the Kafka consumer parameters from the plugin configuration. User supplied Kafka properties are applied
+   * last and therefore override the defaults set here, except for the request timeout, which is recomputed.
+   */
+  @Nonnull
+  private static Map<String, Object> getConsumerParams(ConfluentStreamingSourceConfig conf, String pipelineName) {
+    // Parse the user supplied properties once instead of on every lookup below.
+    Map<String, String> userProperties = conf.getKafkaProperties();
+
+    Map<String, Object> kafkaParams = new HashMap<>();
+    kafkaParams.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, conf.getBrokers());
+    kafkaParams.put(SslConfigs.SSL_ENDPOINT_IDENTIFICATION_ALGORITHM_CONFIG, "https");
+    kafkaParams.put(CommonClientConfigs.RETRY_BACKOFF_MS_CONFIG, "500");
+    kafkaParams.put(CommonClientConfigs.SECURITY_PROTOCOL_CONFIG, "SASL_SSL");
+    kafkaParams.put(SaslConfigs.SASL_MECHANISM, "PLAIN");
+    // JAAS option values are quoted so that keys and secrets containing characters such as '+', '/' or '='
+    // are passed through as a single value.
+    kafkaParams.put(SaslConfigs.SASL_JAAS_CONFIG, "org.apache.kafka.common.security.plain.PlainLoginModule required "
+      + "username=\"" + conf.getClusterApiKey() + "\" password=\"" + conf.getClusterApiSecret() + "\";");
+
+    if (!Strings.isNullOrEmpty(conf.getSchemaRegistryUrl())) {
+      kafkaParams.put("schema.registry.url", conf.getSchemaRegistryUrl());
+      kafkaParams.put("basic.auth.credentials.source", "USER_INFO");
+      kafkaParams.put("schema.registry.basic.auth.user.info",
+                      conf.getSchemaRegistryApiKey() + ":" + conf.getSchemaRegistryApiSecret());
+      kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, KafkaAvroDeserializer.class.getCanonicalName());
+      kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, KafkaAvroDeserializer.class.getCanonicalName());
+    } else {
+      kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getCanonicalName());
+      kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getCanonicalName());
+    }
+    // Spark saves the offsets in checkpoints, no need for Kafka to save them
+    kafkaParams.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
+    // Create a unique string for the group.id using the pipeline name and the topic.
+    // group.id is a Kafka consumer property that uniquely identifies the group of
+    // consumer processes to which this consumer belongs.
+    String groupId = Joiner.on("-")
+      .join(pipelineName.length(), conf.getTopic().length(), pipelineName, conf.getTopic());
+    kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, groupId);
+    kafkaParams.putAll(userProperties);
+    // change the request timeout to fetch the metadata to be 20 seconds or 1 second greater than session time out ms,
+    // since this config has to be greater than the session time out, which is by default 10 seconds
+    // the KafkaConsumer at runtime should still use the default timeout 305 seconds or whatever the user provides in
+    // kafkaConf
+    int requestTimeout =
+      Integer.parseInt(userProperties.getOrDefault(CommonClientConfigs.REQUEST_TIMEOUT_MS_CONFIG, "20000"));
+    String sessionTimeoutMs = userProperties.get(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG);
+    if (sessionTimeoutMs != null) {
+      // Add the one second margin after parsing; appending it to the string would turn "10000" into 100001000.
+      int sessionTimeout = Integer.parseInt(sessionTimeoutMs) + 1000;
+      requestTimeout = Math.max(requestTimeout, sessionTimeout);
+    }
+    kafkaParams.put(CommonClientConfigs.REQUEST_TIMEOUT_MS_CONFIG, requestTimeout);
+    return kafkaParams;
+  }
+
+  /**
+   * Resolves the initial offsets for every partition to read, replacing the earliest/latest markers with the
+   * actual offsets looked up through the consumer.
+   */
+  @Nonnull
+  private static Map<TopicPartition, Long> getOffsets(ConfluentStreamingSourceConfig conf, FailureCollector collector,
+                                                      Consumer<byte[], byte[]> consumer) {
+    Map<TopicPartition, Long> offsets = conf.getInitialPartitionOffsets(
+      getPartitions(consumer, conf, collector), collector);
+    collector.getOrThrowException();
+
+    // KafkaUtils doesn't understand -1 and -2 as smallest offset and latest offset.
+    // so we have to replace them with the actual smallest and latest
+    List<TopicPartition> earliestOffsetRequest = new ArrayList<>();
+    List<TopicPartition> latestOffsetRequest = new ArrayList<>();
+    for (Map.Entry<TopicPartition, Long> entry : offsets.entrySet()) {
+      TopicPartition topicAndPartition = entry.getKey();
+      Long offset = entry.getValue();
+      if (offset == ListOffsetRequest.EARLIEST_TIMESTAMP) {
+        earliestOffsetRequest.add(topicAndPartition);
+      } else if (offset == ListOffsetRequest.LATEST_TIMESTAMP) {
+        latestOffsetRequest.add(topicAndPartition);
+      }
+    }
+
+    Set<TopicPartition> allOffsetRequest =
+      Sets.newHashSet(Iterables.concat(earliestOffsetRequest, latestOffsetRequest));
+    Map<TopicPartition, Long> offsetsFound = new HashMap<>();
+    offsetsFound.putAll(KafkaHelpers.getEarliestOffsets(consumer, earliestOffsetRequest));
+    offsetsFound.putAll(KafkaHelpers.getLatestOffsets(consumer, latestOffsetRequest));
+    for (TopicPartition topicAndPartition : allOffsetRequest) {
+      offsets.put(topicAndPartition, offsetsFound.get(topicAndPartition));
+    }
+
+    Set<TopicPartition> missingOffsets = Sets.difference(allOffsetRequest, offsetsFound.keySet());
+    if (!missingOffsets.isEmpty()) {
+      throw new IllegalStateException(String.format(
+        "Could not find offsets for %s. Please check all brokers were included in the broker list.", missingOffsets));
+    }
+    return offsets;
+  }
+
+  /**
+   * Returns the configured partitions, or all partitions of the topic if none were configured.
+   */
+  private static Set<Integer> getPartitions(Consumer<byte[], byte[]> consumer, ConfluentStreamingSourceConfig conf,
+                                            FailureCollector collector) {
+    Set<Integer> partitions = conf.getPartitions(collector);
+    collector.getOrThrowException();
+
+    if (!partitions.isEmpty()) {
+      return partitions;
+    }
+
+    partitions = new HashSet<>();
+    for (PartitionInfo partitionInfo : consumer.partitionsFor(conf.getTopic())) {
+      partitions.add(partitionInfo.partition());
+    }
+    return partitions;
+  }
+
+  /**
+   * Applies the Avro function to each rdd.
+   */
+  private static class AvroRecordTransform
+    implements Function2<JavaRDD<ConsumerRecord<Object, Object>>, Time, JavaRDD<StructuredRecord>> {
+
+    private final ConfluentStreamingSourceConfig conf;
+    private final Schema outputSchema;
+
+    AvroRecordTransform(ConfluentStreamingSourceConfig conf, Schema outputSchema) {
+      this.conf = conf;
+      this.outputSchema = outputSchema;
+    }
+
+    @Override
+    public JavaRDD<StructuredRecord> call(JavaRDD<ConsumerRecord<Object, Object>> input, Time batchTime) {
+      return input.map(new AvroFunction(batchTime.milliseconds(), conf, outputSchema));
+    }
+  }
+
+  /**
+   * Applies the format function to each rdd.
+   */
+  private static class RecordTransform
+    implements Function2<JavaRDD<ConsumerRecord<byte[], byte[]>>, Time, JavaRDD<StructuredRecord>> {
+
+    private final ConfluentStreamingSourceConfig conf;
+    private final Schema outputSchema;
+
+    RecordTransform(ConfluentStreamingSourceConfig conf, Schema outputSchema) {
+      this.conf = conf;
+      this.outputSchema = outputSchema;
+    }
+
+    @Override
+    public JavaRDD<StructuredRecord> call(JavaRDD<ConsumerRecord<byte[], byte[]>> input, Time batchTime) {
+      // Without a format the payload is passed through as bytes; otherwise it is parsed with the format.
+      Function<ConsumerRecord<byte[], byte[]>, StructuredRecord> recordFunction = conf.getFormat() == null ?
+        new BytesFunction(batchTime.milliseconds(), conf, outputSchema) :
+        new FormatFunction(batchTime.milliseconds(), conf, outputSchema);
+      return input.map(recordFunction);
+    }
+  }
+
+  /**
+   * Common logic for transforming kafka key, message, partition, and offset into a structured record.
+   * Everything here should be serializable, as Spark Streaming will serialize all functions.
+   */
+  private abstract static class BaseFunction<K, V> implements Function<ConsumerRecord<K, V>, StructuredRecord> {
+    protected final ConfluentStreamingSourceConfig conf;
+    private final long ts;
+    private final Schema outputSchema;
+
+    BaseFunction(long ts, ConfluentStreamingSourceConfig conf, Schema outputSchema) {
+      this.ts = ts;
+      this.conf = conf;
+      this.outputSchema = outputSchema;
+    }
+
+    @Override
+    public StructuredRecord call(ConsumerRecord<K, V> in) throws Exception {
+      String timeField = conf.getTimeField();
+      String keyField = conf.getKeyField();
+      String partitionField = conf.getPartitionField();
+      String offsetField = conf.getOffsetField();
+      StructuredRecord.Builder builder = StructuredRecord.builder(outputSchema);
+      // Each optional field is only set when it is configured.
+      if (timeField != null) {
+        builder.set(timeField, ts);
+      }
+      if (keyField != null) {
+        builder.set(keyField, convertKey(in.key()));
+      }
+      if (partitionField != null) {
+        builder.set(partitionField, in.partition());
+      }
+      if (offsetField != null) {
+        builder.set(offsetField, in.offset());
+      }
+      addMessage(builder, in.value());
+      return builder.build();
+    }
+
+    protected abstract Object convertKey(K key);
+
+    protected abstract void addMessage(StructuredRecord.Builder builder, V message) throws Exception;
+  }
+
+  /**
+   * Base for functions that receive the key and the message as raw bytes.
+   */
+  private abstract static class BinaryBaseFunction extends BaseFunction<byte[], byte[]> {
+    // Resolved on first use so the schema is not parsed for every record; transient, so it is not serialized.
+    private transient Schema keySchema;
+
+    BinaryBaseFunction(long ts, ConfluentStreamingSourceConfig conf, Schema outputSchema) {
+      super(ts, conf, outputSchema);
+    }
+
+    @Override
+    protected Object convertKey(byte[] key) {
+      if (key == null) {
+        return null;
+      }
+      if (keySchema == null) {
+        Schema keySchemaNullable = conf.getSchema().getField(conf.getKeyField()).getSchema();
+        keySchema = keySchemaNullable.isNullable() ? keySchemaNullable.getNonNullable() : keySchemaNullable;
+      }
+      if (keySchema.getType() == Schema.Type.STRING) {
+        return new String(key, StandardCharsets.UTF_8);
+      }
+      if (keySchema.getType() == Schema.Type.BYTES) {
+        return key;
+      }
+      throw new IllegalStateException(String.format("Unexpected key type '%s'", keySchema.getDisplayName()));
+    }
+  }
+
+  /**
+   * Transforms kafka key and message into a structured record when message format is not given.
+   * Everything here should be serializable, as Spark Streaming will serialize all functions.
+   */
+  private static class BytesFunction extends BinaryBaseFunction {
+    private transient String messageField;
+
+    BytesFunction(long ts, ConfluentStreamingSourceConfig conf, Schema outputSchema) {
+      super(ts, conf, outputSchema);
+    }
+
+    @Override
+    protected void addMessage(StructuredRecord.Builder builder, byte[] message) {
+      builder.set(getMessageField(), message);
+    }
+
+    // The message field is the first schema field that is not the time, key, offset or partition field.
+    private String getMessageField() {
+      if (messageField == null) {
+        for (Schema.Field field : conf.getSchema().getFields()) {
+          String name = field.getName();
+          if (!name.equals(conf.getTimeField()) && !name.equals(conf.getKeyField())
+            && !name.equals(conf.getOffsetField()) && !name.equals(conf.getPartitionField())) {
+            messageField = name;
+            break;
+          }
+        }
+        if (messageField == null) {
+          throw new IllegalStateException("No message field found in schema");
+        }
+      }
+      return messageField;
+    }
+  }
+
+  /**
+   * Transforms kafka key and message into a structured record when message format and schema are given.
+   * Everything here should be serializable, as Spark Streaming will serialize all functions.
+   */
+  private static class FormatFunction extends BinaryBaseFunction {
+    private transient RecordFormat<ByteBuffer, StructuredRecord> recordFormat;
+
+    FormatFunction(long ts, ConfluentStreamingSourceConfig conf, Schema outputSchema) {
+      super(ts, conf, outputSchema);
+    }
+
+    @Override
+    protected void addMessage(StructuredRecord.Builder builder, byte[] message) throws Exception {
+      // first time this was called, initialize record format
+      if (recordFormat == null) {
+        Schema messageSchema = conf.getMessageSchema();
+        FormatSpecification spec = new FormatSpecification(conf.getFormat(), messageSchema, new HashMap<>());
+        recordFormat = RecordFormats.createInitializedFormat(spec);
+      }
+
+      // Copy every parsed message field into the output record under the same name.
+      StructuredRecord messageRecord = recordFormat.read(ByteBuffer.wrap(message));
+      for (Schema.Field field : messageRecord.getSchema().getFields()) {
+        String fieldName = field.getName();
+        builder.set(fieldName, messageRecord.get(fieldName));
+      }
+    }
+  }
+
+  /**
+   * Transforms kafka key and Avro message into a structured record when Schema Registry is used.
+   * Everything here should be serializable, as Spark Streaming will serialize all functions.
+   */
+  private static class AvroFunction extends BaseFunction<Object, Object> {
+    private transient AvroToStructuredTransformer transformer;
+
+    AvroFunction(long ts, ConfluentStreamingSourceConfig conf, Schema outputSchema) {
+      super(ts, conf, outputSchema);
+    }
+
+    @Override
+    protected Object convertKey(Object key) {
+      return key;
+    }
+
+    @Override
+    protected void addMessage(StructuredRecord.Builder builder, Object message) throws Exception {
+      if (transformer == null) {
+        transformer = new AvroToStructuredTransformer();
+      }
+      if (!(message instanceof GenericRecord)) {
+        // message may be null (a record without a value); do not dereference it while building the error text.
+        String messageClass = message == null ? "null" : message.getClass().getName();
+        throw new UnexpectedFormatException(String.format("Unexpected message class '%s'", messageClass));
+      }
+      GenericRecord genericRecord = (GenericRecord) message;
+      StructuredRecord messageRecord = transformer.transform(genericRecord);
+      builder.set(conf.getValueField(), messageRecord);
+    }
+  }
+}
diff --git a/confluent-kafka-plugins/src/test/java/io/cdap/plugin/confluent/StructuredRecordRepresentation.java
b/confluent-kafka-plugins/src/test/java/io/cdap/plugin/confluent/StructuredRecordRepresentation.java new file mode 100644 index 0000000..53c6a33 --- /dev/null +++ b/confluent-kafka-plugins/src/test/java/io/cdap/plugin/confluent/StructuredRecordRepresentation.java @@ -0,0 +1,37 @@ +/* + * Copyright © 2019 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package io.cdap.plugin.confluent; + +import io.cdap.cdap.api.data.format.StructuredRecord; +import io.cdap.cdap.format.StructuredRecordStringConverter; +import org.assertj.core.presentation.StandardRepresentation; + +import java.io.IOException; + +public class StructuredRecordRepresentation extends StandardRepresentation { + @Override + public String toStringOf(Object object) { + try { + if (object instanceof StructuredRecord) { + return StructuredRecordStringConverter.toJsonString((StructuredRecord) object); + } + return super.toStringOf(object); + } catch (IOException e) { + throw new IllegalArgumentException(e); + } + } +} diff --git a/confluent-kafka-plugins/src/test/java/io/cdap/plugin/confluent/integration/KafkaTestUtils.java b/confluent-kafka-plugins/src/test/java/io/cdap/plugin/confluent/integration/KafkaTestUtils.java new file mode 100644 index 0000000..06b1be2 --- /dev/null +++ b/confluent-kafka-plugins/src/test/java/io/cdap/plugin/confluent/integration/KafkaTestUtils.java @@ -0,0 +1,177 @@ +/* + * Copyright © 2019 Cask Data, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package io.cdap.plugin.confluent.integration; + +import io.cdap.cdap.api.data.format.StructuredRecord; +import io.cdap.cdap.api.data.schema.Schema; +import io.cdap.plugin.format.avro.StructuredToAvroTransformer; +import io.confluent.kafka.serializers.KafkaAvroDeserializer; +import io.confluent.kafka.serializers.KafkaAvroSerializer; +import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.clients.CommonClientConfigs; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.admin.NewTopic; +import org.apache.kafka.clients.consumer.Consumer; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.clients.consumer.KafkaConsumer; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerConfig; +import org.apache.kafka.common.config.SaslConfigs; +import org.apache.kafka.common.config.SslConfigs; +import org.apache.kafka.common.errors.TopicExistsException; +import org.apache.kafka.common.errors.UnknownTopicOrPartitionException; +import org.apache.kafka.common.serialization.ByteArraySerializer; +import org.apache.kafka.common.serialization.StringDeserializer; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.UUID; +import 
java.util.concurrent.ExecutionException; + +public class KafkaTestUtils { + public static final String KAFKA_SERVER = requireProperty("test.kafka_server"); + public static final String CLUSTER_API_KEY = requireProperty("test.cluster_api_key"); + public static final String CLUSTER_API_SECRET = requireProperty("test.cluster_api_secret"); + public static final String SR_URL = requireProperty("test.schema_registry_url"); + public static final String SR_API_KEY = requireProperty("test.schema_registry_api_key"); + public static final String SR_API_SECRET = requireProperty("test.schema_registry_api_secret"); + + private KafkaTestUtils() { + throw new AssertionError("Can not be initialized"); + } + + public static Consumer createConsumer() { + Map props = new HashMap<>(); + props.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, KAFKA_SERVER); + props.put(CommonClientConfigs.RETRY_BACKOFF_MS_CONFIG, "500"); + props.put(ConsumerConfig.GROUP_ID_CONFIG, UUID.randomUUID().toString()); + props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); + props.putAll(getSecurityProps()); + return new KafkaConsumer<>(props, new StringDeserializer(), new StringDeserializer()); + } + + public static Consumer createConsumerForSchemaRegistry() { + Map props = new HashMap<>(); + props.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, KAFKA_SERVER); + props.put(CommonClientConfigs.RETRY_BACKOFF_MS_CONFIG, "500"); + props.put(ConsumerConfig.GROUP_ID_CONFIG, UUID.randomUUID().toString()); + props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); + props.putAll(getSecurityProps()); + props.put("schema.registry.url", SR_URL); + props.put("basic.auth.credentials.source", "USER_INFO"); + props.put("schema.registry.basic.auth.user.info", SR_API_KEY + ":" + SR_API_SECRET); + props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, KafkaAvroDeserializer.class.getCanonicalName()); + props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, 
KafkaAvroDeserializer.class.getCanonicalName()); + return new KafkaConsumer<>(props); + } + + public static KafkaProducer createProducer() { + Map props = new HashMap<>(); + props.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, KAFKA_SERVER); + props.put(CommonClientConfigs.RETRY_BACKOFF_MS_CONFIG, "500"); + props.put(ProducerConfig.ACKS_CONFIG, "all"); + props.putAll(getSecurityProps()); + return new KafkaProducer<>(props, new ByteArraySerializer(), new ByteArraySerializer()); + } + + public static KafkaProducer createProducerForSchemaRegistry() { + Map props = new HashMap<>(); + props.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, KAFKA_SERVER); + props.put(CommonClientConfigs.RETRY_BACKOFF_MS_CONFIG, "500"); + props.put(ProducerConfig.ACKS_CONFIG, "all"); + props.putAll(getSecurityProps()); + props.put("schema.registry.url", SR_URL); + props.put("basic.auth.credentials.source", "USER_INFO"); + props.put("schema.registry.basic.auth.user.info", SR_API_KEY + ":" + SR_API_SECRET); + props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, KafkaAvroSerializer.class.getCanonicalName()); + props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, KafkaAvroSerializer.class.getCanonicalName()); + return new KafkaProducer<>(props); + } + + // Create topic in Confluent Cloud + public static void createTopic(String topic, int partitions, int replication) { + NewTopic newTopic = new NewTopic(topic, partitions, (short) replication); + try (AdminClient adminClient = createAdminClient()) { + adminClient.createTopics(Collections.singletonList(newTopic)).all().get(); + } catch (InterruptedException | ExecutionException e) { + // Ignore if TopicExistsException, which may be valid if topic exists + if (!(e.getCause() instanceof TopicExistsException)) { + throw new RuntimeException(e); + } + } + } + + public static void deleteTopic(String topic) { + try (AdminClient adminClient = createAdminClient()) { + adminClient.deleteTopics(Collections.singletonList(topic)).all().get(); + } 
catch (InterruptedException | ExecutionException e) { + // Ignore if UnknownTopicOrPartitionException, which may be valid if topic does not exist + if (!(e.getCause() instanceof UnknownTopicOrPartitionException)) { + throw new RuntimeException(e); + } + } + } + + public static GenericRecord toGenericRecord(StructuredRecord structuredRecord, Schema schema) { + StructuredToAvroTransformer transformer = new StructuredToAvroTransformer(schema); + try { + return transformer.transform(structuredRecord); + } catch (IOException e) { + throw new IllegalStateException("Failed to convert records", e); + } + } + + public static List toGenericRecords(List structuredRecords, Schema schema) { + StructuredToAvroTransformer transformer = new StructuredToAvroTransformer(schema); + try { + List genericRecords = new ArrayList<>(); + for (StructuredRecord structuredRecord : structuredRecords) { + genericRecords.add(transformer.transform(structuredRecord)); + } + return genericRecords; + } catch (IOException e) { + throw new IllegalStateException("Failed to convert records", e); + } + } + + private static String requireProperty(String propertyName) { + return Objects.requireNonNull(System.getProperty(propertyName), + String.format("System property '%s' should be provided", propertyName)); + } + + private static AdminClient createAdminClient() { + Map props = new HashMap<>(); + props.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, KAFKA_SERVER); + props.putAll(getSecurityProps()); + return AdminClient.create(props); + } + + private static Map getSecurityProps() { + Map props = new HashMap<>(); + props.put(CommonClientConfigs.SECURITY_PROTOCOL_CONFIG, "SASL_SSL"); + props.put(SslConfigs.SSL_ENDPOINT_IDENTIFICATION_ALGORITHM_CONFIG, "https"); + props.put(SaslConfigs.SASL_MECHANISM, "PLAIN"); + props.put(SaslConfigs.SASL_JAAS_CONFIG, "org.apache.kafka.common.security.plain.PlainLoginModule required " + + "username=\"" + CLUSTER_API_KEY + "\" password=\"" + CLUSTER_API_SECRET + "\";"); // quoted JAAS values + return 
props; + } +} diff --git a/confluent-kafka-plugins/src/test/java/io/cdap/plugin/confluent/integration/streaming/ConfluentStreamingTestBase.java b/confluent-kafka-plugins/src/test/java/io/cdap/plugin/confluent/integration/streaming/ConfluentStreamingTestBase.java new file mode 100644 index 0000000..a76bdb3 --- /dev/null +++ b/confluent-kafka-plugins/src/test/java/io/cdap/plugin/confluent/integration/streaming/ConfluentStreamingTestBase.java @@ -0,0 +1,136 @@ +/* + * Copyright © 2019 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package io.cdap.plugin.confluent.integration.streaming; + +import io.cdap.cdap.api.artifact.ArtifactSummary; +import io.cdap.cdap.api.data.format.StructuredRecord; +import io.cdap.cdap.api.dataset.table.Table; +import io.cdap.cdap.common.conf.Constants; +import io.cdap.cdap.datastreams.DataStreamsApp; +import io.cdap.cdap.datastreams.DataStreamsSparkLauncher; +import io.cdap.cdap.etl.mock.batch.MockSink; +import io.cdap.cdap.etl.mock.test.HydratorTestBase; +import io.cdap.cdap.etl.proto.v2.DataStreamsConfig; +import io.cdap.cdap.etl.proto.v2.ETLPlugin; +import io.cdap.cdap.etl.proto.v2.ETLStage; +import io.cdap.cdap.etl.spark.Compat; +import io.cdap.cdap.proto.artifact.AppRequest; +import io.cdap.cdap.proto.id.ApplicationId; +import io.cdap.cdap.proto.id.ArtifactId; +import io.cdap.cdap.proto.id.NamespaceId; +import io.cdap.cdap.test.ApplicationManager; +import io.cdap.cdap.test.DataSetManager; +import io.cdap.cdap.test.SparkManager; +import io.cdap.cdap.test.TestConfiguration; +import io.cdap.plugin.confluent.StructuredRecordRepresentation; +import io.cdap.plugin.confluent.streaming.sink.ConfluentStreamingSink; +import io.cdap.plugin.confluent.streaming.source.ConfluentStreamingSource; +import io.confluent.kafka.serializers.KafkaAvroDeserializer; +import io.confluent.kafka.serializers.KafkaAvroSerializer; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.serialization.ByteArrayDeserializer; +import org.apache.kafka.common.serialization.ByteArraySerializer; +import org.apache.spark.streaming.kafka010.KafkaUtils; +import org.assertj.core.api.Assertions; +import org.awaitility.Awaitility; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.concurrent.TimeUnit; + +public abstract class ConfluentStreamingTestBase extends HydratorTestBase { + @ClassRule + public static final TestConfiguration CONFIG = + new 
TestConfiguration(Constants.Explore.EXPLORE_ENABLED, false, + Constants.AppFabric.SPARK_COMPAT, Compat.SPARK_COMPAT); + private static final Logger LOG = LoggerFactory.getLogger(ConfluentStreamingTestBase.class); + private static final ArtifactId APP_ARTIFACT_ID = NamespaceId.DEFAULT.artifact("data-streams", "1.0.0"); + private static final ArtifactSummary APP_ARTIFACT = new ArtifactSummary("data-streams", "1.0.0"); + + @BeforeClass + public static void setupBasic() throws Exception { + LOG.info("Setting up application"); + + setupStreamingArtifacts(APP_ARTIFACT_ID, DataStreamsApp.class); + + LOG.info("Setting up plugins"); + addPluginArtifact( + NamespaceId.DEFAULT.artifact("confluent-kafka-plugins", "1.0.0"), + APP_ARTIFACT_ID, + ConfluentStreamingSource.class, ConfluentStreamingSink.class, + KafkaUtils.class, TopicPartition.class, + ByteArrayDeserializer.class, ByteArraySerializer.class, + KafkaAvroDeserializer.class, KafkaAvroSerializer.class + ); + } + + protected SparkManager deployETL(ETLPlugin sourcePlugin, ETLPlugin sinkPlugin, String appName) throws Exception { + ETLStage source = new ETLStage("source", sourcePlugin); + ETLStage sink = new ETLStage("sink", sinkPlugin); + DataStreamsConfig etlConfig = DataStreamsConfig.builder() + .addStage(source) + .addStage(sink) + .addConnection(source.getName(), sink.getName()) + .setBatchInterval("1s") + .setStopGracefully(true) + .build(); + + AppRequest appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig); + ApplicationId appId = NamespaceId.DEFAULT.app(appName); + ApplicationManager applicationManager = deployApplication(appId, appRequest); + return getProgramManager(applicationManager); + } + + protected List waitForRecords(String outputTable, int messageCount) throws Exception { + DataSetManager outputManager = getDataset(outputTable); + Awaitility.await().atMost(60, TimeUnit.SECONDS).untilAsserted(() -> { + List output = MockSink.readOutput(outputManager); + Assertions.assertThat(output) + 
.withRepresentation(new StructuredRecordRepresentation()) + .hasSizeGreaterThanOrEqualTo(messageCount); + }); + + List output = MockSink.readOutput(outputManager); + Assertions.assertThat(output) + .withRepresentation(new StructuredRecordRepresentation()) + .hasSize(messageCount); + return output; + } + + protected void waitForRecords(String outputTable, List expectedRecords) throws Exception { + DataSetManager
outputManager = getDataset(outputTable); + Awaitility.await().atMost(60, TimeUnit.SECONDS).untilAsserted(() -> { + List output = MockSink.readOutput(outputManager); + Assertions.assertThat(output) + .withRepresentation(new StructuredRecordRepresentation()) + .hasSizeGreaterThanOrEqualTo(expectedRecords.size()); + }); + + List output = MockSink.readOutput(outputManager); + + Assertions.assertThat(output) + .withRepresentation(new StructuredRecordRepresentation()) + .containsExactlyInAnyOrderElementsOf(expectedRecords); + } + + private SparkManager getProgramManager(ApplicationManager appManager) { + return appManager.getSparkManager(DataStreamsSparkLauncher.NAME); + } +} diff --git a/confluent-kafka-plugins/src/test/java/io/cdap/plugin/confluent/integration/streaming/sink/ConfluentStreamingSinkTest.java b/confluent-kafka-plugins/src/test/java/io/cdap/plugin/confluent/integration/streaming/sink/ConfluentStreamingSinkTest.java new file mode 100644 index 0000000..16b33f5 --- /dev/null +++ b/confluent-kafka-plugins/src/test/java/io/cdap/plugin/confluent/integration/streaming/sink/ConfluentStreamingSinkTest.java @@ -0,0 +1,291 @@ +/* + * Copyright © 2019 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package io.cdap.plugin.confluent.integration.streaming.sink; + +import com.google.common.base.Stopwatch; +import io.cdap.cdap.api.data.format.StructuredRecord; +import io.cdap.cdap.api.data.schema.Schema; +import io.cdap.cdap.etl.api.batch.SparkSink; +import io.cdap.cdap.etl.mock.spark.streaming.MockSource; +import io.cdap.cdap.etl.proto.v2.ETLPlugin; +import io.cdap.cdap.proto.ProgramRunStatus; +import io.cdap.cdap.test.SparkManager; +import io.cdap.plugin.common.Constants; +import io.cdap.plugin.confluent.integration.KafkaTestUtils; +import io.cdap.plugin.confluent.integration.streaming.ConfluentStreamingTestBase; +import io.cdap.plugin.confluent.integration.streaming.source.ConfluentStreamingSourceTest; +import io.cdap.plugin.confluent.streaming.sink.ConfluentStreamingSink; +import io.cdap.plugin.confluent.streaming.sink.ConfluentStreamingSinkConfig; +import io.cdap.plugin.confluent.streaming.source.ConfluentStreamingSourceConfig; +import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.clients.consumer.Consumer; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.clients.consumer.ConsumerRecords; +import org.assertj.core.api.Assertions; +import org.assertj.core.api.SoftAssertions; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TestName; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.concurrent.TimeUnit; + +/** + * Tests for Confluent Streaming Sink plugin. 
+ */ +public class ConfluentStreamingSinkTest extends ConfluentStreamingTestBase { + + private static Consumer kafkaConsumer; + private static Consumer kafkaAvroConsumer; + + @Rule + public TestName testName = new TestName(); + + private String topic; + private SparkManager programManager; + + @BeforeClass + public static void setupTestClass() { + kafkaConsumer = KafkaTestUtils.createConsumer(); + kafkaAvroConsumer = KafkaTestUtils.createConsumerForSchemaRegistry(); + } + + @AfterClass + public static void cleanupTestClass() { + kafkaConsumer.close(); + kafkaAvroConsumer.close(); + } + + @Before + public void setUp() { + topic = ConfluentStreamingSourceTest.class.getSimpleName() + "_" + testName.getMethodName(); + KafkaTestUtils.deleteTopic(topic); + KafkaTestUtils.createTopic(topic, 2, 3); + kafkaConsumer.subscribe(Collections.singletonList(topic)); + kafkaAvroConsumer.subscribe(Collections.singletonList(topic)); + } + + @After + public void tearDown() throws Exception { + KafkaTestUtils.deleteTopic(topic); + if (programManager != null) { + programManager.stop(); + programManager.waitForStopped(10, TimeUnit.SECONDS); + programManager.waitForRun(ProgramRunStatus.KILLED, 10, TimeUnit.SECONDS); + } + } + + @Test + public void testWritesWithFormat() throws Exception { + String keyField = "key"; + String partitionField = "partition"; + + Schema schema = Schema.recordOf( + "user", + Schema.Field.of("id", Schema.of(Schema.Type.LONG)), + Schema.Field.of("first", Schema.of(Schema.Type.STRING)), + Schema.Field.of("last", Schema.of(Schema.Type.STRING)), + Schema.Field.of(keyField, Schema.nullableOf(Schema.of(Schema.Type.STRING))), + Schema.Field.of(partitionField, Schema.of(Schema.Type.INT)) + ); + List records = Arrays.asList( + StructuredRecord.builder(schema) + .set("id", 1L) + .set("first", "samuel") + .set("last", "jackson") + .set(keyField, "a") + .set(partitionField, 0) + .build(), + StructuredRecord.builder(schema) + .set("id", 2L) + .set("first", "dwayne") + 
.set("last", "johnson") + .set(keyField, "b") + .set(partitionField, 1) + .build(), + StructuredRecord.builder(schema) + .set("id", 3L) + .set("first", "christopher") + .set("last", "walken") + .set(keyField, "c") + .set(partitionField, 1) + .build() + ); + ETLPlugin sourcePlugin = MockSource.getPlugin(schema, records); + Map properties = getConfigProperties(); + properties.put(ConfluentStreamingSourceConfig.NAME_FORMAT, "csv"); + properties.put(ConfluentStreamingSourceConfig.NAME_KEYFIELD, keyField); + properties.put(ConfluentStreamingSourceConfig.NAME_PARTITION_FIELD, partitionField); + programManager = deploySourcePlugin(sourcePlugin, properties); + programManager.startAndWaitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS); + + List> actualRecords = waitForRecordsInKafka(kafkaConsumer, 3); + SoftAssertions.assertSoftly(softly -> { + ConsumerRecord record1 = findRecordWithKey(actualRecords, "a"); + softly.assertThat(record1.value()).isEqualTo("1,samuel,jackson"); + softly.assertThat(record1.partition()).isEqualTo(0); + softly.assertThat(record1.offset()).isEqualTo(0); + + ConsumerRecord record2 = findRecordWithKey(actualRecords, "b"); + softly.assertThat(record2.value()).isEqualTo("2,dwayne,johnson"); + softly.assertThat(record2.partition()).isEqualTo(1); + softly.assertThat(record2.offset()).isEqualTo(0); + + ConsumerRecord record3 = findRecordWithKey(actualRecords, "c"); + softly.assertThat(record3.value()).isEqualTo("3,christopher,walken"); + softly.assertThat(record3.partition()).isEqualTo(1); + softly.assertThat(record3.offset()).isEqualTo(1); + }); + } + + @Test + public void testWritesWithSchemaRegistry() throws Exception { + String messageField = "message"; + String keyField = "key"; + String partitionField = "partition"; + + Schema valueSchema = Schema.recordOf( + "user", + Schema.Field.of("id", Schema.of(Schema.Type.LONG)), + Schema.Field.of("first", Schema.of(Schema.Type.STRING)), + Schema.Field.of("last", Schema.of(Schema.Type.STRING)) + ); + 
Schema schema = Schema.recordOf( + "confluent", + Schema.Field.of(messageField, valueSchema), + Schema.Field.of(keyField, Schema.nullableOf(Schema.of(Schema.Type.STRING))), + Schema.Field.of(partitionField, Schema.of(Schema.Type.INT)) + ); + StructuredRecord value1 = StructuredRecord.builder(valueSchema) + .set("id", 1L) + .set("first", "samuel") + .set("last", "jackson") + .build(); + StructuredRecord value2 = StructuredRecord.builder(valueSchema) + .set("id", 2L) + .set("first", "dwayne") + .set("last", "johnson") + .build(); + StructuredRecord value3 = StructuredRecord.builder(valueSchema) + .set("id", 3L) + .set("first", "christopher") + .set("last", "walken") + .build(); + List records = Arrays.asList( + StructuredRecord.builder(schema) + .set(messageField, value1) + .set(keyField, "a") + .set(partitionField, 0) + .build(), + StructuredRecord.builder(schema) + .set(messageField, value2) + .set(keyField, "b") + .set(partitionField, 1) + .build(), + StructuredRecord.builder(schema) + .set(messageField, value3) + .set(keyField, "c") + .set(partitionField, 1) + .build() + ); + ETLPlugin sourcePlugin = MockSource.getPlugin(schema, records); + Map properties = getConfigProperties(); + properties.put(ConfluentStreamingSourceConfig.NAME_KEYFIELD, keyField); + properties.put(ConfluentStreamingSourceConfig.NAME_PARTITION_FIELD, partitionField); + properties.put(ConfluentStreamingSourceConfig.NAME_SR_URL, KafkaTestUtils.SR_URL); + properties.put(ConfluentStreamingSourceConfig.NAME_SR_API_KEY, KafkaTestUtils.SR_API_KEY); + properties.put(ConfluentStreamingSourceConfig.NAME_SR_API_SECRET, KafkaTestUtils.SR_API_SECRET); + programManager = deploySourcePlugin(sourcePlugin, properties); + programManager.startAndWaitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS); + + GenericRecord expectedValue1 = KafkaTestUtils.toGenericRecord(value1, value1.getSchema()); + GenericRecord expectedValue2 = KafkaTestUtils.toGenericRecord(value2, value2.getSchema()); + GenericRecord 
expectedValue3 = KafkaTestUtils.toGenericRecord(value3, value3.getSchema()); + + List> actualRecords = waitForRecordsInKafka(kafkaAvroConsumer, 3); + SoftAssertions.assertSoftly(softly -> { + ConsumerRecord record1 = findRecordWithKey(actualRecords, "a"); + softly.assertThat(record1.value()).isEqualTo(expectedValue1); + softly.assertThat(record1.partition()).isEqualTo(0); + softly.assertThat(record1.offset()).isEqualTo(0); + + ConsumerRecord record2 = findRecordWithKey(actualRecords, "b"); + softly.assertThat(record2.value()).isEqualTo(expectedValue2); + softly.assertThat(record2.partition()).isEqualTo(1); + softly.assertThat(record2.offset()).isEqualTo(0); + + ConsumerRecord record3 = findRecordWithKey(actualRecords, "c"); + softly.assertThat(record3.value()).isEqualTo(expectedValue3); + softly.assertThat(record3.partition()).isEqualTo(1); + softly.assertThat(record3.offset()).isEqualTo(1); + }); + } + + private ConsumerRecord findRecordWithKey(List> records, Object key) { + Optional> recordOptional = records.stream() + .filter(record -> Objects.equals(record.key(), key)) + .findAny(); + Assertions.assertThat(recordOptional).isPresent(); + return recordOptional.get(); + } + + private Map getConfigProperties() { + Map properties = new HashMap<>(); + properties.put(Constants.Reference.REFERENCE_NAME, "confluent"); + properties.put(ConfluentStreamingSinkConfig.NAME_BROKERS, KafkaTestUtils.KAFKA_SERVER); + properties.put(ConfluentStreamingSinkConfig.NAME_TOPIC, topic); + properties.put(ConfluentStreamingSinkConfig.NAME_ASYNC, "false"); + properties.put(ConfluentStreamingSinkConfig.NAME_COMPRESSION_TYPE, "none"); + properties.put(ConfluentStreamingSinkConfig.NAME_CLUSTER_API_KEY, KafkaTestUtils.CLUSTER_API_KEY); + properties.put(ConfluentStreamingSinkConfig.NAME_CLUSTER_API_SECRET, KafkaTestUtils.CLUSTER_API_SECRET); + return properties; + } + + private SparkManager deploySourcePlugin(ETLPlugin sourcePlugin, Map properties) throws Exception { + return deployETL( + 
sourcePlugin, + new ETLPlugin(ConfluentStreamingSink.PLUGIN_NAME, SparkSink.PLUGIN_TYPE, properties, null), + "KafkaSinkApp" + ); + } + + private List> waitForRecordsInKafka(Consumer consumer, int expectedMessages) + throws Exception { + List> result = new ArrayList<>(); + Stopwatch stopwatch = new Stopwatch().start(); // an unstarted stopwatch never advances + while (result.size() < expectedMessages && stopwatch.elapsed(TimeUnit.SECONDS) < 10) { + ConsumerRecords records = consumer.poll(Duration.ofMillis(100)); + for (ConsumerRecord record : records) { + result.add(record); + } + } + Assertions.assertThat(result).hasSize(expectedMessages); + return result; + } +} diff --git a/confluent-kafka-plugins/src/test/java/io/cdap/plugin/confluent/integration/streaming/source/ConfluentStreamingSourceTest.java b/confluent-kafka-plugins/src/test/java/io/cdap/plugin/confluent/integration/streaming/source/ConfluentStreamingSourceTest.java new file mode 100644 index 0000000..fcc3d8f --- /dev/null +++ b/confluent-kafka-plugins/src/test/java/io/cdap/plugin/confluent/integration/streaming/source/ConfluentStreamingSourceTest.java @@ -0,0 +1,453 @@ +/* + * Copyright © 2019 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package io.cdap.plugin.confluent.integration.streaming.source; + +import io.cdap.cdap.api.data.format.StructuredRecord; +import io.cdap.cdap.api.data.schema.Schema; +import io.cdap.cdap.api.dataset.table.Table; +import io.cdap.cdap.etl.api.streaming.StreamingSource; +import io.cdap.cdap.etl.mock.batch.MockSink; +import io.cdap.cdap.etl.proto.v2.ETLPlugin; +import io.cdap.cdap.proto.ProgramRunStatus; +import io.cdap.cdap.test.DataSetManager; +import io.cdap.cdap.test.SparkManager; +import io.cdap.plugin.common.Constants; +import io.cdap.plugin.confluent.integration.KafkaTestUtils; +import io.cdap.plugin.confluent.integration.streaming.ConfluentStreamingTestBase; +import io.cdap.plugin.confluent.streaming.source.ConfluentStreamingSource; +import io.cdap.plugin.confluent.streaming.source.ConfluentStreamingSourceConfig; +import io.cdap.plugin.format.avro.StructuredToAvroTransformer; +import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.common.requests.ListOffsetRequest; +import org.assertj.core.api.Assertions; +import org.assertj.core.api.SoftAssertions; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TestName; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; +import javax.annotation.Nullable; + +/** + * Tests for Confluent Streaming Source plugin. 
+ */
+public class ConfluentStreamingSourceTest extends ConfluentStreamingTestBase {
+
+  // NOTE(review): producer type parameters are inferred from the arguments passed to send() below;
+  // confirm against the return types declared in KafkaTestUtils.
+  private static KafkaProducer<byte[], byte[]> kafkaProducer;
+  private static KafkaProducer<Object, Object> kafkaAvroProducer;
+
+  @Rule
+  public TestName testName = new TestName();
+
+  private String topic;
+  private String outputTable;
+  private SparkManager programManager;
+
+  /**
+   * Creates the two producers shared by every test: one for raw byte payloads and one for Schema Registry.
+   */
+  @BeforeClass
+  public static void setupTestClass() {
+    kafkaProducer = KafkaTestUtils.createProducer();
+    kafkaAvroProducer = KafkaTestUtils.createProducerForSchemaRegistry();
+  }
+
+  /**
+   * Closes the shared producers.
+   */
+  @AfterClass
+  public static void cleanupTestClass() {
+    kafkaProducer.close();
+    kafkaAvroProducer.close();
+  }
+
+  /**
+   * Derives the output table and topic names from the running test method and recreates the topic,
+   * so that a topic left over from an earlier failed run cannot leak records into this one.
+   */
+  @Before
+  public void setUp() {
+    outputTable = testName.getMethodName() + "_out";
+    topic = ConfluentStreamingSourceTest.class.getSimpleName() + "_" + testName.getMethodName();
+    KafkaTestUtils.deleteTopic(topic);
+    // presumably 2 partitions with replication factor 3 - the tests write to partitions 0 and 1;
+    // confirm the argument order against KafkaTestUtils.createTopic
+    KafkaTestUtils.createTopic(topic, 2, 3);
+  }
+
+  /**
+   * Stops the pipeline (if one was deployed) and then deletes the test topic.
+   *
+   * @throws Exception if the pipeline does not reach the expected state in time
+   */
+  @After
+  public void tearDown() throws Exception {
+    try {
+      // Stop the consumer before its topic disappears; deleting the topic first can fail the running
+      // program, which then never reaches the KILLED state awaited below.
+      if (programManager != null) {
+        programManager.stop();
+        programManager.waitForStopped(10, TimeUnit.SECONDS);
+        programManager.waitForRun(ProgramRunStatus.KILLED, 10, TimeUnit.SECONDS);
+      }
+    } finally {
+      // Always remove the topic, even when stopping the program failed.
+      KafkaTestUtils.deleteTopic(topic);
+    }
+  }
+
+  /**
+   * Reads a CSV message, restarts the pipeline and checks that a message written while it was stopped
+   * is the only one delivered after the restart.
+   */
+  @Test
+  public void testConfluentStreamingSource() throws Exception {
+    Schema schema = Schema.recordOf(
+      "user",
+      Schema.Field.of("id", Schema.of(Schema.Type.LONG)),
+      Schema.Field.of("first", Schema.of(Schema.Type.STRING)),
+      Schema.Field.of("last", Schema.of(Schema.Type.STRING))
+    );
+    Map<String, String> properties = getConfigProperties(schema);
+    properties.put(ConfluentStreamingSourceConfig.NAME_FORMAT, "csv");
+    programManager = deploySourcePlugin(properties);
+    programManager.startAndWaitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
+
+    sendKafkaMessage(topic, 0, "a", "1,samuel,jackson");
+
+    List<StructuredRecord> expectedRecords = Collections.singletonList(
+      StructuredRecord.builder(schema)
+        .set("id", 1L)
+        .set("first", "samuel")
+        .set("last", "jackson")
+        .build()
+    );
+    waitForRecords(outputTable, expectedRecords);
+
+    programManager.stop();
+    programManager.waitForStopped(10, TimeUnit.SECONDS);
+
+    // clear the output table
+    DataSetManager<Table> outputManager = getDataset(outputTable);
+    MockSink.clear(outputManager);
+
+    // now write some more messages to kafka and start the program again to make sure it picks up where it left off
+    sendKafkaMessage(topic, 1, "b", "2,dwayne,johnson");
+
+    programManager.startAndWaitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
+
+    List<StructuredRecord> expectedRecords2 = Collections.singletonList(
+      StructuredRecord.builder(schema)
+        .set("id", 2L)
+        .set("first", "dwayne")
+        .set("last", "johnson")
+        .build()
+    );
+    waitForRecords(outputTable, expectedRecords2);
+  }
+
+  /**
+   * Without a message format the payload is emitted as bytes; checks that key, time, partition and
+   * offset fields are populated alongside it.
+   */
+  @Test
+  public void testConfluentStreamingSourceAdditionalFieldsWithoutFormat() throws Exception {
+    String keyField = "key";
+    String timeField = "time";
+    String partitionField = "partition";
+    String offsetField = "offset";
+    Schema schema = Schema.recordOf(
+      "user",
+      Schema.Field.of("message", Schema.of(Schema.Type.BYTES)),
+      Schema.Field.of(keyField, Schema.of(Schema.Type.BYTES)),
+      Schema.Field.of(timeField, Schema.of(Schema.Type.LONG)),
+      Schema.Field.of(partitionField, Schema.of(Schema.Type.INT)),
+      Schema.Field.of(offsetField, Schema.of(Schema.Type.LONG))
+    );
+    Map<String, String> properties = getConfigProperties(schema);
+    properties.put(ConfluentStreamingSourceConfig.NAME_KEYFIELD, keyField);
+    properties.put(ConfluentStreamingSourceConfig.NAME_TIMEFIELD, timeField);
+    properties.put(ConfluentStreamingSourceConfig.NAME_PARTITION_FIELD, partitionField);
+    properties.put(ConfluentStreamingSourceConfig.NAME_OFFSET_FIELD, offsetField);
+    programManager = deploySourcePlugin(properties);
+    programManager.startAndWaitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
+
+    sendKafkaMessage(topic, 1, "a", "payload_1");
+    sendKafkaMessage(topic, 0, "b", "payload_2");
+    sendKafkaMessage(topic, 0, "c", "payload_3");
+
+    List<StructuredRecord> actualRecords = waitForRecords(outputTable, 3);
+
+    // Explicit type witnesses on get(..) select a single assertThat overload; without them the call is ambiguous.
+    SoftAssertions.assertSoftly(softly -> {
+      StructuredRecord output1 =
+        findRecordWithField(actualRecords, keyField, ByteBuffer.wrap("a".getBytes(StandardCharsets.UTF_8)));
+      softly.assertThat(output1.<ByteBuffer>get("message"))
+        .isEqualTo(ByteBuffer.wrap("payload_1".getBytes(StandardCharsets.UTF_8)));
+      softly.assertThat(output1.<ByteBuffer>get(keyField))
+        .isEqualTo(ByteBuffer.wrap("a".getBytes(StandardCharsets.UTF_8)));
+      softly.assertThat(output1.<Long>get(timeField)).isPositive();
+      softly.assertThat(output1.<Integer>get(partitionField)).isEqualTo(1);
+      softly.assertThat(output1.<Long>get(offsetField)).isEqualTo(0L);
+
+      StructuredRecord output2 =
+        findRecordWithField(actualRecords, keyField, ByteBuffer.wrap("b".getBytes(StandardCharsets.UTF_8)));
+      softly.assertThat(output2.<ByteBuffer>get("message"))
+        .isEqualTo(ByteBuffer.wrap("payload_2".getBytes(StandardCharsets.UTF_8)));
+      softly.assertThat(output2.<ByteBuffer>get(keyField))
+        .isEqualTo(ByteBuffer.wrap("b".getBytes(StandardCharsets.UTF_8)));
+      softly.assertThat(output2.<Long>get(timeField)).isPositive();
+      softly.assertThat(output2.<Integer>get(partitionField)).isEqualTo(0);
+      softly.assertThat(output2.<Long>get(offsetField)).isEqualTo(0L);
+
+      StructuredRecord output3 =
+        findRecordWithField(actualRecords, keyField, ByteBuffer.wrap("c".getBytes(StandardCharsets.UTF_8)));
+      softly.assertThat(output3.<ByteBuffer>get("message"))
+        .isEqualTo(ByteBuffer.wrap("payload_3".getBytes(StandardCharsets.UTF_8)));
+      softly.assertThat(output3.<ByteBuffer>get(keyField))
+        .isEqualTo(ByteBuffer.wrap("c".getBytes(StandardCharsets.UTF_8)));
+      softly.assertThat(output3.<Long>get(timeField)).isPositive();
+      softly.assertThat(output3.<Integer>get(partitionField)).isEqualTo(0);
+      softly.assertThat(output3.<Long>get(offsetField)).isEqualTo(1L);
+    });
+  }
+
+  /**
+   * With the csv format the payload is parsed into typed fields; checks those together with the key,
+   * time, partition and offset fields.
+   */
+  @Test
+  public void testConfluentStreamingSourceAdditionalFieldsWithFormat() throws Exception {
+    String keyField = "key";
+    String timeField = "time";
+    String partitionField = "partition";
+    String offsetField = "offset";
+    Schema schema = Schema.recordOf(
+      "user",
+      Schema.Field.of("id", Schema.of(Schema.Type.LONG)),
+      Schema.Field.of("first", Schema.of(Schema.Type.STRING)),
+      Schema.Field.of("last", Schema.of(Schema.Type.STRING)),
+      Schema.Field.of(keyField, Schema.of(Schema.Type.BYTES)),
+      Schema.Field.of(timeField, Schema.of(Schema.Type.LONG)),
+      Schema.Field.of(partitionField, Schema.of(Schema.Type.INT)),
+      Schema.Field.of(offsetField, Schema.of(Schema.Type.LONG))
+    );
+    Map<String, String> properties = getConfigProperties(schema);
+    properties.put(ConfluentStreamingSourceConfig.NAME_FORMAT, "csv");
+    properties.put(ConfluentStreamingSourceConfig.NAME_KEYFIELD, keyField);
+    properties.put(ConfluentStreamingSourceConfig.NAME_TIMEFIELD, timeField);
+    properties.put(ConfluentStreamingSourceConfig.NAME_PARTITION_FIELD, partitionField);
+    properties.put(ConfluentStreamingSourceConfig.NAME_OFFSET_FIELD, offsetField);
+    programManager = deploySourcePlugin(properties);
+    programManager.startAndWaitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
+
+    sendKafkaMessage(topic, 1, "a", "1,samuel,jackson");
+    sendKafkaMessage(topic, 0, "b", "2,dwayne,johnson");
+    sendKafkaMessage(topic, 0, "c", "3,christopher,walken");
+
+    List<StructuredRecord> actualRecords = waitForRecords(outputTable, 3);
+
+    SoftAssertions.assertSoftly(softly -> {
+      StructuredRecord output1 = findRecordWithField(actualRecords, "id", 1L);
+      softly.assertThat(output1.<Long>get("id")).isEqualTo(1L);
+      softly.assertThat(output1.<String>get("first")).isEqualTo("samuel");
+      softly.assertThat(output1.<String>get("last")).isEqualTo("jackson");
+      softly.assertThat(output1.<ByteBuffer>get(keyField))
+        .isEqualTo(ByteBuffer.wrap("a".getBytes(StandardCharsets.UTF_8)));
+      softly.assertThat(output1.<Long>get(timeField)).isPositive();
+      softly.assertThat(output1.<Integer>get(partitionField)).isEqualTo(1);
+      softly.assertThat(output1.<Long>get(offsetField)).isEqualTo(0L);
+
+      StructuredRecord output2 = findRecordWithField(actualRecords, "id", 2L);
+      softly.assertThat(output2.<Long>get("id")).isEqualTo(2L);
+      softly.assertThat(output2.<String>get("first")).isEqualTo("dwayne");
+      softly.assertThat(output2.<String>get("last")).isEqualTo("johnson");
+      softly.assertThat(output2.<ByteBuffer>get(keyField))
+        .isEqualTo(ByteBuffer.wrap("b".getBytes(StandardCharsets.UTF_8)));
+      softly.assertThat(output2.<Long>get(timeField)).isPositive();
+      softly.assertThat(output2.<Integer>get(partitionField)).isEqualTo(0);
+      softly.assertThat(output2.<Long>get(offsetField)).isEqualTo(0L);
+
+      StructuredRecord output3 = findRecordWithField(actualRecords, "id", 3L);
+      softly.assertThat(output3.<Long>get("id")).isEqualTo(3L);
+      softly.assertThat(output3.<String>get("first")).isEqualTo("christopher");
+      softly.assertThat(output3.<String>get("last")).isEqualTo("walken");
+      softly.assertThat(output3.<ByteBuffer>get(keyField))
+        .isEqualTo(ByteBuffer.wrap("c".getBytes(StandardCharsets.UTF_8)));
+      softly.assertThat(output3.<Long>get(timeField)).isPositive();
+      softly.assertThat(output3.<Integer>get(partitionField)).isEqualTo(0);
+      softly.assertThat(output3.<Long>get(offsetField)).isEqualTo(1L);
+    });
+  }
+
+  /**
+   * Sends Avro records through Schema Registry and checks the value, key, time, partition and offset
+   * fields of every emitted record.
+   */
+  @Test
+  public void testConfluentStreamingSourceAdditionalFieldsWithSchemaRegistry() throws Exception {
+    String messageField = "message";
+    String keyField = "key";
+    String timeField = "time";
+    String partitionField = "partition";
+    String offsetField = "offset";
+    Schema schema = Schema.recordOf(
+      "confluent",
+      Schema.Field.of(timeField, Schema.of(Schema.Type.LONG)),
+      Schema.Field.of(partitionField, Schema.of(Schema.Type.INT)),
+      Schema.Field.of(offsetField, Schema.of(Schema.Type.LONG))
+    );
+    Map<String, String> properties = getConfigProperties(schema);
+    properties.put(ConfluentStreamingSourceConfig.NAME_SR_URL, KafkaTestUtils.SR_URL);
+    properties.put(ConfluentStreamingSourceConfig.NAME_SR_API_KEY, KafkaTestUtils.SR_API_KEY);
+    properties.put(ConfluentStreamingSourceConfig.NAME_SR_API_SECRET, KafkaTestUtils.SR_API_SECRET);
+    properties.put(ConfluentStreamingSourceConfig.NAME_VALUE_FIELD, messageField);
+    properties.put(ConfluentStreamingSourceConfig.NAME_KEYFIELD, keyField);
+    properties.put(ConfluentStreamingSourceConfig.NAME_TIMEFIELD, timeField);
+    properties.put(ConfluentStreamingSourceConfig.NAME_PARTITION_FIELD, partitionField);
+    properties.put(ConfluentStreamingSourceConfig.NAME_OFFSET_FIELD, offsetField);
+    programManager = deploySourcePlugin(properties);
+    programManager.startAndWaitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
+
+    Schema inputSchema = Schema.recordOf(
+      "user",
+      Schema.Field.of("id", Schema.of(Schema.Type.LONG)),
+      Schema.Field.of("first", Schema.of(Schema.Type.STRING)),
+      Schema.Field.of("last", Schema.of(Schema.Type.STRING))
+    );
+    StructuredRecord inputRecord1 = StructuredRecord.builder(inputSchema)
+      .set("id", 1L)
+      .set("first", "samuel")
+      .set("last", "jackson")
+      .build();
+    StructuredRecord inputRecord2 = StructuredRecord.builder(inputSchema)
+      .set("id", 2L)
+      .set("first", "dwayne")
+      .set("last", "johnson")
+      .build();
+    StructuredRecord inputRecord3 = StructuredRecord.builder(inputSchema)
+      .set("id", 3L)
+      .set("first", "christopher")
+      .set("last", "walken")
+      .build();
+    StructuredToAvroTransformer transformer = new StructuredToAvroTransformer(inputSchema);
+    sendKafkaAvroMessage(topic, 1, "a", transformer.transform(inputRecord1));
+    sendKafkaAvroMessage(topic, 0, "b", transformer.transform(inputRecord2));
+    sendKafkaAvroMessage(topic, 0, "c", transformer.transform(inputRecord3));
+
+    List<StructuredRecord> actualRecords = waitForRecords(outputTable, 3);
+
+    SoftAssertions.assertSoftly(softly -> {
+      StructuredRecord output1 = findRecordWithField(actualRecords, keyField, "a");
+      softly.assertThat(output1.<StructuredRecord>get(messageField)).isEqualTo(inputRecord1);
+      softly.assertThat(output1.<String>get(keyField)).isEqualTo("a");
+      softly.assertThat(output1.<Long>get(timeField)).isPositive();
+      softly.assertThat(output1.<Integer>get(partitionField)).isEqualTo(1);
+      softly.assertThat(output1.<Long>get(offsetField)).isEqualTo(0L);
+
+      StructuredRecord output2 = findRecordWithField(actualRecords, keyField, "b");
+      softly.assertThat(output2.<StructuredRecord>get(messageField)).isEqualTo(inputRecord2);
+      softly.assertThat(output2.<String>get(keyField)).isEqualTo("b");
+      softly.assertThat(output2.<Long>get(timeField)).isPositive();
+      softly.assertThat(output2.<Integer>get(partitionField)).isEqualTo(0);
+      softly.assertThat(output2.<Long>get(offsetField)).isEqualTo(0L);
+
+      StructuredRecord output3 = findRecordWithField(actualRecords, keyField, "c");
+      softly.assertThat(output3.<StructuredRecord>get(messageField)).isEqualTo(inputRecord3);
+      softly.assertThat(output3.<String>get(keyField)).isEqualTo("c");
+      softly.assertThat(output3.<Long>get(timeField)).isPositive();
+      softly.assertThat(output3.<Integer>get(partitionField)).isEqualTo(0);
+      softly.assertThat(output3.<Long>get(offsetField)).isEqualTo(1L);
+    });
+  }
+
+  /**
+   * Sends Avro records through Schema Registry and checks that each is emitted wrapped in the
+   * configured value field.
+   */
+  @Test
+  public void testConfluentStreamingSourceWithSchemaRegistry() throws Exception {
+    String messageField = "message";
+    Schema valueSchema = Schema.recordOf(
+      "user",
+      Schema.Field.of("id", Schema.of(Schema.Type.LONG)),
+      Schema.Field.of("first", Schema.of(Schema.Type.STRING)),
+      Schema.Field.of("last", Schema.of(Schema.Type.STRING))
+    );
+    Schema schema = Schema.recordOf(
+      "confluent",
+      Schema.Field.of(messageField, valueSchema)
+    );
+    Map<String, String> properties = getConfigProperties(schema);
+    properties.put(ConfluentStreamingSourceConfig.NAME_SR_URL, KafkaTestUtils.SR_URL);
+    properties.put(ConfluentStreamingSourceConfig.NAME_SR_API_KEY, KafkaTestUtils.SR_API_KEY);
+    properties.put(ConfluentStreamingSourceConfig.NAME_SR_API_SECRET, KafkaTestUtils.SR_API_SECRET);
+    properties.put(ConfluentStreamingSourceConfig.NAME_VALUE_FIELD, messageField);
+    programManager = deploySourcePlugin(properties);
+    programManager.startAndWaitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
+
+    List<StructuredRecord> records = Arrays.asList(
+      StructuredRecord.builder(valueSchema)
+        .set("id", 1L)
+        .set("first", "samuel")
+        .set("last", "jackson")
+        .build(),
+      StructuredRecord.builder(valueSchema)
+        .set("id", 2L)
+        .set("first", "dwayne")
+        .set("last", "johnson")
+        .build(),
+      StructuredRecord.builder(valueSchema)
+        .set("id", 3L)
+        .set("first", "christopher")
+        .set("last", "walken")
+        .build()
+    );
+    List<GenericRecord> genericRecords = KafkaTestUtils.toGenericRecords(records, valueSchema);
+    for (GenericRecord genericRecord : genericRecords) {
+      sendKafkaAvroMessage(topic, 0, null, genericRecord);
+    }
+
+    List<StructuredRecord> expectedRecords = records.stream()
+      .map(record -> StructuredRecord.builder(schema)
+        .set(messageField, record)
+        .build())
+      .collect(Collectors.toList());
+    waitForRecords(outputTable, expectedRecords);
+  }
+
+  /**
+   * Returns any record whose {@code fieldName} value equals {@code value}; fails the test if none matches.
+   */
+  private StructuredRecord findRecordWithField(List<StructuredRecord> records, String fieldName, Object value) {
+    Optional<StructuredRecord> recordOptional = records.stream()
+      .filter(record -> Objects.equals(record.get(fieldName), value))
+      .findAny();
+    Assertions.assertThat(recordOptional).isPresent();
+    return recordOptional.get();
+  }
+
+  /**
+   * Builds the source properties common to every test: broker, topic, credentials, output schema,
+   * earliest initial offset and a max rate of 1000.
+   */
+  private Map<String, String> getConfigProperties(Schema schema) {
+    Map<String, String> properties = new HashMap<>();
+    properties.put(Constants.Reference.REFERENCE_NAME, "confluent");
+    properties.put(ConfluentStreamingSourceConfig.NAME_BROKERS, KafkaTestUtils.KAFKA_SERVER);
+    properties.put(ConfluentStreamingSourceConfig.NAME_TOPIC, topic);
+    properties.put(ConfluentStreamingSourceConfig.NAME_DEFAULT_INITIAL_OFFSET,
+                   String.valueOf(ListOffsetRequest.EARLIEST_TIMESTAMP));
+    properties.put(ConfluentStreamingSourceConfig.NAME_CLUSTER_API_KEY, KafkaTestUtils.CLUSTER_API_KEY);
+    properties.put(ConfluentStreamingSourceConfig.NAME_CLUSTER_API_SECRET, KafkaTestUtils.CLUSTER_API_SECRET);
+    properties.put(ConfluentStreamingSourceConfig.NAME_SCHEMA, schema.toString());
+    properties.put(ConfluentStreamingSourceConfig.NAME_MAX_RATE, "1000");
+    return properties;
+  }
+
+  /**
+   * Deploys a pipeline that reads with the Confluent streaming source and writes to a mock sink
+   * backed by {@code outputTable}.
+   */
+  private SparkManager deploySourcePlugin(Map<String, String> properties) throws Exception {
+    return deployETL(
+      new ETLPlugin(ConfluentStreamingSource.PLUGIN_NAME, StreamingSource.PLUGIN_TYPE, properties, null),
+      MockSink.getPlugin(outputTable),
+      "KafkaSourceApp"
+    );
+  }
+
+  /**
+   * Sends one message through the Schema Registry producer and blocks until the send completes.
+   *
+   * @throws IllegalStateException if the send fails or the calling thread is interrupted
+   */
+  private void sendKafkaAvroMessage(String topic, @Nullable Integer partition, @Nullable Object key, Object value) {
+    try {
+      kafkaAvroProducer.send(new ProducerRecord<>(topic, partition, key, value)).get();
+    } catch (InterruptedException e) {
+      // keep the interrupt visible to the test runner instead of swallowing it
+      Thread.currentThread().interrupt();
+      throw new IllegalStateException(e);
+    } catch (ExecutionException e) {
+      throw new IllegalStateException(e);
+    }
+  }
+
+  /**
+   * Sends one UTF-8 encoded message through the plain producer and blocks until the send completes.
+   *
+   * @throws IllegalStateException if the send fails or the calling thread is interrupted
+   */
+  private void sendKafkaMessage(String topic, @Nullable Integer partition, @Nullable String key, String value) {
+    byte[] valueBytes = value.getBytes(StandardCharsets.UTF_8);
+    byte[] keyBytes = key != null ? key.getBytes(StandardCharsets.UTF_8) : null;
+    try {
+      kafkaProducer.send(new ProducerRecord<>(topic, partition, keyBytes, valueBytes)).get();
+    } catch (InterruptedException e) {
+      // keep the interrupt visible to the test runner instead of swallowing it
+      Thread.currentThread().interrupt();
+      throw new IllegalStateException(e);
+    } catch (ExecutionException e) {
+      throw new IllegalStateException(e);
+    }
+  }
+}
diff --git a/confluent-kafka-plugins/widgets/Confluent-sparksink.json b/confluent-kafka-plugins/widgets/Confluent-sparksink.json
new file mode 100644
index 0000000..5ca921d
--- /dev/null
+++ b/confluent-kafka-plugins/widgets/Confluent-sparksink.json
@@ -0,0 +1,146 @@
+{
+  "metadata": {
+    "spec-version": "1.5"
+  },
+  "display-name": "Confluent Kafka",
+  "configuration-groups": [
+    {
+      "label": "Kafka Configuration",
+      "properties": [
+        {
+          "widget-type": "textbox",
+          "label": "Reference Name",
+          "name": "referenceName"
+        },
+        {
+          "widget-type": "csv",
+          "label": "Kafka Brokers",
+          "name": "brokers",
+          "widget-attributes": {
+            "delimiter": ","
+          }
+        },
+        {
+          "widget-type": "textbox",
+          "label": "Kafka Topic",
+          "name": "topic"
+        },
+        {
+          "widget-type": "toggle",
+          "label": "Async",
+          "name": "async",
+          "widget-attributes": {
+            "on": {
+              "value": "true",
+              "label": "Yes"
+            },
+            "off": {
+              "value": "false",
+              "label": "No"
+            },
+            "default": "false"
+          }
+        },
+        {
+          "widget-type": "select",
+          "label": "Compression Type",
+          "name": "compressionType",
+          "widget-attributes": {
+            "values": [
+              "none",
+              "gzip",
+              "snappy"
+            ],
+            "default": "none"
+          }
+        },
+        {
+          "widget-type": "textbox",
+          "label": "Time Field",
+          "name": "timeField"
+        },
+        {
+          "widget-type": "textbox",
+          "label": "Key Field",
+          "name": "keyField"
+        },
+        {
+          "widget-type": "textbox",
+          "label": "Partition Field",
+          "name": "partitionField"
+        },
+        {
+          "widget-type": "keyvalue",
+          "label": "Additional Kafka Producer Properties",
+          "name": "kafkaProperties",
+          "widget-attributes": {
+            "showDelimiter": "false",
+            "key-placeholder": "Kafka producer property",
+            "value-placeholder": "Kafka producer property value"
+          }
+        }
+      ]
+    },
+    {
+      "label": "Authentication",
+      "properties": [
+        {
+          "widget-type": "textbox",
+          "label": "Cluster API Key",
+          "name": "clusterApiKey"
+        },
+        {
+          "widget-type": "password",
+          "label": "Cluster API Secret",
+          "name": "clusterApiSecret"
+        }
+      ]
+    },
+    {
+      "label": "Schema Registry",
+      "properties": [
+        {
+          "widget-type": "textbox",
+          "label": "Schema Registry URL",
+          "name": "schemaRegistryUrl"
+        },
+        {
+          "widget-type": "textbox",
+          "label": "Schema Registry API Key",
+          "name": "schemaRegistryApiKey"
+        },
+        {
+          "widget-type": "password",
+          "label": "Schema Registry API Secret",
+          "name": "schemaRegistryApiSecret"
+        }
+      ]
+    },
+    {
+      "label": "Message Configuration",
+      "properties": [
+        {
+          "widget-type": "select",
+          "label": "Message Format",
+          "name": "format",
+          "widget-attributes": {
+            "values": [
+              "",
+              "CSV",
+              "JSON"
+            ],
+            "default": ""
+          }
+        }
+      ]
+    }
+  ],
+  "outputs": [],
+  "jump-config": {
+    "datasets": [
+      {
+        "ref-property-name": "referenceName"
+      }
+    ]
+  }
+}
diff --git a/confluent-kafka-plugins/widgets/Confluent-streamingsource.json b/confluent-kafka-plugins/widgets/Confluent-streamingsource.json
new file mode 100644
index 0000000..c6ae7c9
--- /dev/null
+++ b/confluent-kafka-plugins/widgets/Confluent-streamingsource.json
@@ -0,0 +1,201 @@
+{
+  "metadata": {
+    "spec-version": "1.5"
+  },
+  "display-name": "Confluent Kafka",
+  "configuration-groups": [
+    {
+      "label": "Kafka Configuration",
+      "properties": [
+        {
+          "widget-type": "textbox",
+          "label": "Reference Name",
+          "name": "referenceName"
+        },
+        {
+          "widget-type": "csv",
+          "label": "Kafka Brokers",
+          "name": "brokers",
+          "widget-attributes": {
+            "delimiter": ","
+          }
+        },
+        {
+          "widget-type": "connection-browser",
+          "widget-category": "plugin",
+          "widget-attributes": {
+            "connectionType": "KAFKA",
+            "label": "Browse"
+          }
+        },
+        {
+          "widget-type": "textbox",
+          "label": "Kafka Topic",
+          "name": "topic"
+        },
+        {
+          "widget-type": "csv",
+          "label": "Topic Partitions",
+          "name": "partitions",
+          "widget-attributes": {
+            "delimiter": ","
+          }
+        },
+        {
+          "widget-type": "number",
+          "label": "Default Initial Offset",
+          "name": "defaultInitialOffset",
+          "widget-attributes": {
+            "default": -1
+          }
+        },
+        {
+          "widget-type": "keyvalue",
+          "label": "Initial Partition Offsets",
+          "name": "initialPartitionOffsets",
+          "widget-attributes": {
+            "showDelimiter": "false",
+            "key-placeholder": "Partition",
+            "value-placeholder": "Offset"
+          }
+        },
+        {
+          "widget-type": "textbox",
+          "label": "Time Field",
+          "name": "timeField"
+        },
+        {
+          "widget-type": "textbox",
+          "label": "Key Field",
+          "name": "keyField"
+        },
+        {
+          "widget-type": "textbox",
+          "label": "Partition Field",
+          "name": "partitionField"
+        },
+        {
+          "widget-type": "textbox",
+          "label": "Offset Field",
+          "name": "offsetField"
+        },
+        {
+          "widget-type": "number",
+          "label": "Max Rate Per Partition",
+          "name": "maxRatePerPartition",
+          "widget-attributes": {
+            "default": 1000
+          }
+        },
+        {
+          "widget-type": "keyvalue",
+          "label": "Additional Kafka Consumer Properties",
+          "name": "kafkaProperties",
+          "widget-attributes": {
+            "showDelimiter": "false",
+            "key-placeholder": "Kafka consumer property",
+            "value-placeholder": "Kafka consumer property value"
+          }
+        }
+      ]
+    },
+    {
+      "label": "Authentication",
+      "properties": [
+        {
+          "widget-type": "textbox",
+          "label": "Cluster API Key",
+          "name": "clusterApiKey"
+        },
+        {
+          "widget-type": "password",
+          "label": "Cluster API Secret",
+          "name": "clusterApiSecret"
+        }
+      ]
+    },
+    {
+      "label": "Schema Registry",
+      "properties": [
+        {
+          "widget-type": "textbox",
+          "label": "Schema Registry URL",
+          "name": "schemaRegistryUrl"
+        },
+        {
+          "widget-type": "textbox",
+          "label": "Schema Registry API Key",
+          "name": "schemaRegistryApiKey"
+        },
+        {
+          "widget-type": "password",
+          "label": "Schema Registry API Secret",
+          "name": "schemaRegistryApiSecret"
+        },
+        {
+          "widget-type": "textbox",
+          "label": "Value Field",
+          "name": "valueField",
+          "plugin-function": {
+            "label": "Get Schema",
+            "widget": "outputSchema",
+            "position": "bottom",
+            "multiple-inputs": false,
+            "button-class": "btn-hydrator"
+          }
+        }
+      ]
+    },
+    {
+      "label": "Message Configuration",
+      "properties": [
+        {
+          "widget-type": "select",
+          "label": "Message Format",
+          "name": "format",
+          "widget-attributes": {
+            "values": [
+              "",
+              "avro",
+              "binary",
+              "clf",
+              "csv",
+              "grok",
+              "syslog",
+              "text",
+              "tsv"
+            ],
+            "default": ""
+          }
+        }
+      ]
+    }
+  ],
+  "outputs": [
+    {
+      "name": "schema",
+      "widget-type": "schema",
+      "widget-attributes": {
+        "default-schema": {
+          "name": "etlSchemaBody",
+          "type": "record",
+          "fields": [
+            {
+              "name": "message",
+              "type": "string"
+            }
+          ]
+        },
+        "schema-default-type": "string",
+        "property-watch": "format"
+      }
+    }
+  ],
+  "jump-config": {
+    "datasets": [
+      {
+        "ref-property-name": "referenceName"
+      }
+    ]
+  }
+}
diff --git a/pom.xml b/pom.xml index cecadfa..89db105 100644 --- a/pom.xml +++ b/pom.xml @@ -21,6 +21,7 @@ kafka-plugins-0.8 kafka-plugins-0.10 + confluent-kafka-plugins kafka-plugins-common