diff --git a/site2/docs/admin-api-overview.md b/site2/docs/admin-api-overview.md index 37463156ebd7a..1767f66b643f7 100644 --- a/site2/docs/admin-api-overview.md +++ b/site2/docs/admin-api-overview.md @@ -142,8 +142,8 @@ Kubernetes requires a name that can be used as a DNS subdomain name as defined i :::tip -- If you get an error in translating Pulsar object names into Kubernetes resource labels (for example, you may have a naming collision if your Pulsar object name is too long) or want to customize the translating rules, see [customize Kubernetes runtime](/functions-runtime.md#customize-kubernetes-runtime). -- For how to configure Kubernetes runtime, see [here](/functions-runtime.md#configure-kubernetes-runtime). +- If you get an error in translating Pulsar object names into Kubernetes resource labels (for example, you may have a naming collision if your Pulsar object name is too long) or want to customize the translating rules, see [customize Kubernetes runtime](functions-runtime-kubernetes.md#customize-kubernetes-runtime). +- For how to configure Kubernetes runtime, see [instructions](functions-runtime-kubernetes.md). 
::: diff --git a/site2/docs/assets/function-count-based-tumbling-window.png b/site2/docs/assets/function-count-based-tumbling-window.png new file mode 100644 index 0000000000000..c5ad6b613c5a8 Binary files /dev/null and b/site2/docs/assets/function-count-based-tumbling-window.png differ diff --git a/site2/docs/assets/function-data-window.png b/site2/docs/assets/function-data-window.png new file mode 100644 index 0000000000000..45c3cfca89411 Binary files /dev/null and b/site2/docs/assets/function-data-window.png differ diff --git a/site2/docs/assets/function-instance.svg b/site2/docs/assets/function-instance.svg new file mode 100644 index 0000000000000..16c1b3b3d72bb --- /dev/null +++ b/site2/docs/assets/function-instance.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/site2/docs/assets/function-overview.svg b/site2/docs/assets/function-overview.svg new file mode 100644 index 0000000000000..3d391ae6a783a --- /dev/null +++ b/site2/docs/assets/function-overview.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/site2/docs/assets/function-sliding-window.png b/site2/docs/assets/function-sliding-window.png new file mode 100644 index 0000000000000..bf66a761819e4 Binary files /dev/null and b/site2/docs/assets/function-sliding-window.png differ diff --git a/site2/docs/assets/function-time-based-tumbling-window.png b/site2/docs/assets/function-time-based-tumbling-window.png new file mode 100644 index 0000000000000..610347e2e5bbf Binary files /dev/null and b/site2/docs/assets/function-time-based-tumbling-window.png differ diff --git a/site2/docs/assets/function-worker-workflow.svg b/site2/docs/assets/function-worker-workflow.svg new file mode 100644 index 0000000000000..66b4d78791afc --- /dev/null +++ b/site2/docs/assets/function-worker-workflow.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/site2/docs/assets/function-workers-corun.svg b/site2/docs/assets/function-workers-corun.svg new file mode 100644 index 0000000000000..d11e15cdf8690 
--- /dev/null +++ b/site2/docs/assets/function-workers-corun.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/site2/docs/assets/function-workers-separated-with-proxy.svg b/site2/docs/assets/function-workers-separated-with-proxy.svg new file mode 100644 index 0000000000000..627ba47643961 --- /dev/null +++ b/site2/docs/assets/function-workers-separated-with-proxy.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/site2/docs/assets/function-workers-separated.svg b/site2/docs/assets/function-workers-separated.svg new file mode 100644 index 0000000000000..d28df3bccaa41 --- /dev/null +++ b/site2/docs/assets/function-workers-separated.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/site2/docs/assets/functions-worker-corun-proxy.png b/site2/docs/assets/functions-worker-corun-proxy.png deleted file mode 100644 index 39429685f3890..0000000000000 Binary files a/site2/docs/assets/functions-worker-corun-proxy.png and /dev/null differ diff --git a/site2/docs/assets/functions-worker-corun.png b/site2/docs/assets/functions-worker-corun.png deleted file mode 100644 index 224d114fc48c6..0000000000000 Binary files a/site2/docs/assets/functions-worker-corun.png and /dev/null differ diff --git a/site2/docs/assets/functions-worker-separated-proxy.png b/site2/docs/assets/functions-worker-separated-proxy.png deleted file mode 100644 index b2f27140d9a1a..0000000000000 Binary files a/site2/docs/assets/functions-worker-separated-proxy.png and /dev/null differ diff --git a/site2/docs/assets/functions-worker-separated.png b/site2/docs/assets/functions-worker-separated.png deleted file mode 100644 index b710c34ce1a9a..0000000000000 Binary files a/site2/docs/assets/functions-worker-separated.png and /dev/null differ diff --git a/site2/docs/deploy-bare-metal.md b/site2/docs/deploy-bare-metal.md index 1c46febe15850..34aeca57531a5 100644 --- a/site2/docs/deploy-bare-metal.md +++ b/site2/docs/deploy-bare-metal.md @@ -547,7 +547,7 @@ bin/pulsar-admin functions 
create \ ``` -Check whether the function runs as expected by [triggering](functions-deploying.md#triggering-pulsar-functions) the function. +Check whether the function runs as expected by [triggering](functions-deploy-trigger.md) the function. ```bash diff --git a/site2/docs/functions-concepts.md b/site2/docs/functions-concepts.md new file mode 100644 index 0000000000000..a22bc9f97678f --- /dev/null +++ b/site2/docs/functions-concepts.md @@ -0,0 +1,196 @@ +--- +id: functions-concepts +title: Pulsar Functions concepts +sidebar_label: "Concepts" +--- + + +## Fully Qualified Function Name + +Each function has a Fully Qualified Function Name (FQFN) with a specified tenant, namespace, and function name. With FQFN, you can create multiple functions in different namespaces with the same function name. + +An FQFN looks like this: + +```text + +tenant/namespace/name + +``` + +## Function instance + +Function instance is the core element of the function execution framework, consisting of the following elements: +* A collection of consumers consuming messages from different input topics. +* An executor that invokes the function. +* A producer that sends the result of a function to an output topic. + +The following figure illustrates the internal workflow of a function instance. + +![Function instance](/assets/function-instance.svg) + +A function can have multiple instances, and each instance executes one copy of a function. You can specify the number of instances in the configuration file. + +The consumers inside a function instance use FQFN as subscriber names to enable load balancing between multiple instances based on subscription types. The subscription type can be specified at the function level. + +Each function has a separate state store with FQFN. You can specify a state interface to persist intermediate results in the BookKeeper. Other users can query the state of the function and extract these results. 
+ + +## Function worker + +Function worker is a logical component to monitor, orchestrate, and execute individual functions in [cluster-mode](functions-deploy.md#deploy-a-function-in-cluster-mode) deployment of Pulsar Functions. + +Within function workers, each [function instance](#function-instance) can be executed as a thread or process, depending on the selected configurations. Alternatively, if a Kubernetes cluster is available, functions can be spawned as StatefulSets within Kubernetes. See [Set up function workers](functions-worker.md) for more details. + +The following figure illustrates the internal architecture and workflow of function workers. + +![Function worker workflow](/assets/function-worker-workflow.svg) + +Function workers form a cluster of worker nodes and the workflow is described as follows. +1. User sends a request to the REST server to execute a function instance. +2. The REST server responds to the request and passes the request to the function metadata manager. +3. The function metadata manager writes the request updates to the function metadata topic. It also keeps track of all the metadata-related messages and uses the function metadata topic to persist the state updates of functions. +4. The function metadata manager reads updates from the function metadata topic and triggers the schedule manager to compute an assignment. +5. The schedule manager writes the assignment updates to the assignment topic. +6. The function runtime manager listens to the assignment topic, reads the assignment updates, and updates its internal state that contains a global view of all assignments for all workers. If the update changes the assignment on a worker, the function runtime manager materializes the new assignment by starting or stopping the execution of function instances. +7. The membership manager requests the coordination topic to elect a lead worker. 
All workers subscribe to the coordination topic in a failover subscription, but the active worker becomes the leader and performs the assignment, guaranteeing only one active consumer for this topic. +8. The membership manager reads updates from the coordination topic. + + +## Function runtime + +A [function instance](#function-instance) is invoked inside a runtime, and a number of instances can run in parallel. Pulsar supports three types of function runtime with different costs and isolation guarantees to maximize deployment flexibility. You can use one of them to run functions based on your needs. See [Configure function runtime](functions-runtime.md) for more details. + +The following table outlines the three types of function runtime. + +| Type | Description | +|--------|-----------------| +| Thread runtime | Each instance runs as a thread.

Since the code for thread mode is written in Java, it is **only** applicable to Java instances. When a function runs in thread mode, it runs on the same Java virtual machine (JVM) with a function worker. | +| Process runtime | Each instance runs as a process.

When a function runs in process mode, it runs on the same machine that the function worker runs.| +| Kubernetes runtime | Function is submitted as Kubernetes StatefulSet by workers and each function instance runs as a pod. Pulsar supports adding labels to the Kubernetes StatefulSets and services while launching functions, which facilitates selecting the target Kubernetes objects. | + + +## Processing guarantees and subscription types + +Pulsar provides three different messaging delivery semantics that you can apply to a function. + +| Delivery semantics | Description | Adopted subscription type | +|--------------------|-------------|---------------------------| +| **At-most-once** delivery | Each message sent to a function is processed at its best effort. There’s no guarantee that the message will be processed or not. | Shared | +| **At-least-once** delivery (default) | Each message sent to the function can be processed more than once (in case of a processing failure or redelivery).

If you create a function without specifying the `--processing-guarantees` flag, the function provides `at-least-once` delivery guarantee. | Shared | +| **Effectively-once** delivery | Each message sent to the function can be processed more than once but it has only one output. Duplicated messages are ignored.

`Effectively once` is achieved on top of `at-least-once` processing and guaranteed server-side deduplication. This means a state update can happen twice, but the same state update is only applied once; the other duplicated state update is discarded on the server-side. | Failover | + +:::tip + +* By default, Pulsar Functions provide `at-least-once` delivery guarantees. If you create a function without supplying a value for the `--processing-guarantees` flag, the function provides `at-least-once` guarantees. +* The `Exclusive` subscription type is **not** available in Pulsar Functions because: + * If there is only one instance, `exclusive` equals `failover`. + * If there are multiple instances, `exclusive` may crash and restart when functions restart. In this case, `exclusive` does not equal `failover`, because when the master consumer disconnects, all non-acknowledged and subsequent messages are delivered to the next consumer in line. +* To change the subscription type from `shared` to `key_shared`, you can use the `--retain-key-ordering` option in [`pulsar-admin`](/tools/pulsar-admin/). + +::: + +You can set the processing guarantees for a function when you create the function. The following command creates a function with effectively-once guarantees applied. + +```bash + +bin/pulsar-admin functions create \ + --name my-effectively-once-function \ + --processing-guarantees EFFECTIVELY_ONCE \ + # Other function configs + +``` + +You can change the processing guarantees applied to a function using the `update` command. + +```bash + +bin/pulsar-admin functions update \ + --processing-guarantees ATMOST_ONCE \ + # Other function configs + +``` + +## Context + +Java, Python, and Go SDKs provide access to a **context object** that can be used by a function. This context object provides a wide variety of information and functionality to the function including: +* The name and ID of a function. +* The message ID of a message. Each message is automatically assigned with an ID. 
+* The key, event time, properties, and partition key of a message. +* The name of the topic that a message is sent to. +* The names of all input topics as well as the output topic associated with the function. +* The name of the class used for [SerDe](functions-develop-serde). +* The tenant and namespace associated with the function. +* The ID of the function instance running the function. +* The version of the function. +* The [logger object](functions-develop-log) used by the function, which is used to create log messages. +* Access to arbitrary [user configuration](functions-develop-user-defined-configs) values supplied via the CLI. +* An interface for recording [metrics](functions-develop-metrics). +* An interface for storing and retrieving state in [state storage](functions-develop-state). +* A function to publish new messages onto arbitrary topics. +* A function to acknowledge the message being processed (if auto-ack is disabled). +* (Java) get Pulsar admin client. + +:::tip + +For more information about code examples, refer to [Java](https://github.com/apache/pulsar/blob/master/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/BaseContext.java), [Python](https://github.com/apache/pulsar/blob/master/pulsar-functions/instance/src/main/python/contextimpl.py) and [Go](https://github.com/apache/pulsar/blob/master/pulsar-function-go/pf/context.go). + +::: + +## Function message types + +Pulsar Functions take byte arrays as inputs and spit out byte arrays as output. You can write typed functions and bind messages to types by using either of the following ways: +* [Schema Registry](functions-develop-schema-registry.md) +* [SerDe](functions-develop-serde.md) + + +## Window function + +:::note + +Currently, window function is only available in Java. + +::: + +Window function is a function that performs computation across a data window, that is, a finite subset of the event stream. 
As illustrated below, the stream is split into “buckets” where functions can be applied. + +![A window of data within an event stream](/assets/function-data-window.png) + +The definition of a data window for a function involves two policies: +* Eviction policy: Controls the amount of data collected in a window. +* Trigger policy: Controls when a function is triggered and executed to process all of the data collected in a window based on eviction policy. + +Both trigger policy and eviction policy are driven by either time or count. + +:::tip + +Both processing time and event time are supported. + * Processing time is defined based on the wall time when the function instance builds and processes a window. The judging of window completeness is straightforward and you don’t have to worry about data arrival disorder. + * Event time is defined based on the timestamps that come with the event record. It guarantees event time correctness but also offers more data buffering and a limited completeness guarantee. + +::: + +### Types of window + +Based on whether two adjacent windows can share common events or not, windows can be divided into the following two types: +* [Tumbling window](#tumbling-window) +* [Sliding window](#sliding-window) + +#### Tumbling window + +Tumbling window assigns elements to a window of a specified time length or count. The eviction policy for tumbling windows is always based on the window being full. So you only need to specify the trigger policy, either count-based or time-based. + +In a tumbling window with a count-based trigger policy, as illustrated in the following example, the trigger policy is set to 2. Each function is triggered and executed when two items are in the window, regardless of the time. 
+ +![A tumbling window with a count-based trigger policy](/assets/function-count-based-tumbling-window.png) + +In contrast, as illustrated in the following example, the window length of the tumbling window is 10 seconds, which means the function is triggered when the 10-second time interval has elapsed, regardless of how many events are in the window. + +![A tumbling window with a time-based trigger policy](/assets/function-time-based-tumbling-window.png) + +#### Sliding window + +The sliding window method defines a fixed window length by setting the eviction policy to limit the amount of data retained for processing and setting the trigger policy with a sliding interval. If the sliding interval is smaller than the window length, there is data overlapping, which means the data simultaneously falling into adjacent windows is used for computation more than once. + +As illustrated in the following example, the window length is 2 seconds, which means that any data older than 2 seconds will be evicted and not used in the computation. The sliding interval is configured to be 1 second, which means that function is executed every second to process the data within the entire window length. + +![Sliding window with an overlap](/assets/function-sliding-window.png) \ No newline at end of file diff --git a/site2/docs/functions-debug-cli.md b/site2/docs/functions-debug-cli.md new file mode 100644 index 0000000000000..c0db546592192 --- /dev/null +++ b/site2/docs/functions-debug-cli.md @@ -0,0 +1,212 @@ +--- +id: functions-debug-cli +title: Debug with Functions CLI +sidebar_label: "Debug with Functions CLI" +--- + +With [Pulsar Functions CLI](/tools/pulsar-admin/), you can debug Pulsar Functions with the following subcommands: +* `get` +* `status` +* `stats` +* `list` +* `trigger` + +## `get` + +To get information about a function, you can specify `--fqfn` as follows. 
+ +```bash + + ./bin/pulsar-admin functions get --fqfn public/default/ExclamationFunctio6 + +``` + +Alternatively, you can specify `--name`, `--namespace` and `--tenant` as follows. + +```bash + + ./bin/pulsar-admin functions get \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 + +``` + +As shown below, the `get` command shows input, output, runtime, and other information about the `ExclamationFunctio6` function. + +```json + +{ + "tenant": "public", + "namespace": "default", + "name": "ExclamationFunctio6", + "className": "org.example.test.ExclamationFunction", + "inputSpecs": { + "persistent://public/default/my-topic-1": { + "isRegexPattern": false + } + }, + "output": "persistent://public/default/test-1", + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "userConfig": {}, + "runtime": "JAVA", + "autoAck": true, + "parallelism": 1 +} + +``` + +## `list` + +To list all Pulsar Functions running under a specific tenant and namespace: + +```bash + +bin/pulsar-admin functions list \ + --tenant public \ + --namespace default + +``` + +As shown below, the `list` command returns three functions running under the `public` tenant and the `default` namespace. + +```text + +ExclamationFunctio1 +ExclamationFunctio2 +ExclamationFunctio3 + +``` + +## `status` + +To check the current status of a function: + +```bash + + ./bin/pulsar-admin functions status \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 + +``` + +As shown below, the `status` command shows the number of instances, running instances, the instance running under the `ExclamationFunctio6` function, received messages, successfully processed messages, system exceptions, the average latency and so on. 
+ +```json + +{ + "numInstances" : 1, + "numRunning" : 1, + "instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReceived" : 1, + "numSuccessfullyProcessed" : 1, + "numUserExceptions" : 0, + "latestUserExceptions" : [ ], + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "averageLatency" : 0.8385, + "lastInvocationTime" : 1557734137987, + "workerId" : "c-standalone-fw-23ccc88ef29b-8080" + } + } ] +} + +``` + +## `stats` + +To get the current stats of a function: + +```bash + +bin/pulsar-admin functions stats \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 \ + +``` + +The output is shown as follows: + +```json + +{ + "receivedTotal" : 1, + "processedSuccessfullyTotal" : 1, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : 0.8385, + "1min" : { + "receivedTotal" : 0, + "processedSuccessfullyTotal" : 0, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : null + }, + "lastInvocation" : 1557734137987, + "instances" : [ { + "instanceId" : 0, + "metrics" : { + "receivedTotal" : 1, + "processedSuccessfullyTotal" : 1, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : 0.8385, + "1min" : { + "receivedTotal" : 0, + "processedSuccessfullyTotal" : 0, + "systemExceptionsTotal" : 0, + "userExceptionsTotal" : 0, + "avgProcessLatency" : null + }, + "lastInvocation" : 1557734137987, + "userMetrics" : { } + } + } ] +} + +``` + +## `trigger` + +To trigger a specified function with a supplied value: + +```bash + + ./bin/pulsar-admin functions trigger \ + --tenant public \ + --namespace default \ + --name ExclamationFunctio6 \ + --topic persistent://public/default/my-topic-1 \ + --trigger-value "hello pulsar functions" + +``` + +This command simulates the execution process of a function and verifies it. 
As shown below, the `trigger` command returns the following result: + +```text + +This is my function! + +``` + +:::note + +When using the `--topic` option, you must specify the [entire topic name](getting-started-pulsar.md#topic-names). Otherwise, the following error occurs. + + ```text + + Function in trigger function has unidentified topic + + Reason: Function in trigger function has unidentified topic + + ``` + +::: diff --git a/site2/docs/functions-debug-localrun.md b/site2/docs/functions-debug-localrun.md new file mode 100644 index 0000000000000..5d218fec81802 --- /dev/null +++ b/site2/docs/functions-debug-localrun.md @@ -0,0 +1,68 @@ +--- +id: functions-debug-localrun +title: Debug with localrun mode +sidebar_label: "Debug with localrun mode" +--- + +In localrun mode, a function consumes and produces actual data to a Pulsar cluster, and mirrors how the function actually runs in a Pulsar cluster. This provides a way to test your function and allows you to launch a function instance on your local machine as a thread for easy debugging. + +:::note + +Debugging with localrun mode is only available for Java functions in Pulsar 2.4.0 or later versions. + +::: + +Before using localrun mode, you need to add the following dependency. + +```xml + +<dependency> + <groupId>org.apache.pulsar</groupId> + <artifactId>pulsar-functions-local-runner</artifactId> + <version>${pulsar.version}</version> +</dependency> + +``` + +For example, you can run your function in the following manner. + +```java + +FunctionConfig functionConfig = new FunctionConfig(); +functionConfig.setName(functionName); +functionConfig.setInputs(Collections.singleton(sourceTopic)); +functionConfig.setClassName(ExclamationFunction.class.getName()); +functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); +functionConfig.setOutput(sinkTopic); + +LocalRunner localRunner = LocalRunner.builder().functionConfig(functionConfig).build(); +localRunner.start(true); + +``` + +You can debug functions using an IDE. Set breakpoints and manually step through a function to debug with real data. 
+ +The following code example shows how to run a function in localrun mode. + +```java + +public class ExclamationFunction implements Function<String, String> { + + @Override + public String process(String s, Context context) throws Exception { + return s + "!"; + } + +public static void main(String[] args) throws Exception { + FunctionConfig functionConfig = new FunctionConfig(); + functionConfig.setName("exclamation"); + functionConfig.setInputs(Collections.singleton("input")); + functionConfig.setClassName(ExclamationFunction.class.getName()); + functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); + functionConfig.setOutput("output"); + + LocalRunner localRunner = LocalRunner.builder().functionConfig(functionConfig).build(); + localRunner.start(false); +} + +``` diff --git a/site2/docs/functions-debug-log-topic.md b/site2/docs/functions-debug-log-topic.md new file mode 100644 index 0000000000000..8434a0a5cabfc --- /dev/null +++ b/site2/docs/functions-debug-log-topic.md @@ -0,0 +1,50 @@ +--- +id: functions-debug-log-topic +title: Debug with log topic +sidebar_label: "Debug with log topic" +--- + +When using Pulsar Functions, you can generate logs predefined in functions to a specified log topic and configure consumers to consume messages from the log topic. + +For example, the following function logs either a WARNING-level or INFO-level log based on whether the incoming string contains the word `danger` or not. 
+ +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +public class LoggingFunction implements Function<String, Void> { + @Override + public Void process(String input, Context context) { + Logger LOG = context.getLogger(); + String messageId = new String(context.getMessageId()); + + if (input.contains("danger")) { + LOG.warn("A warning was received in message {}", messageId); + } else { + LOG.info("Message {} received\nContent: {}", messageId, input); + } + + return null; + } +} + +``` + +As shown in the example, you can get the logger via `context.getLogger()` and assign the logger to the `LOG` variable of `slf4j`, so you can define your desired logs in a function using the `LOG` variable. + +Meanwhile, you need to specify the topic that the logs can be produced to. The following is an example. + +```bash + +bin/pulsar-admin functions create \ + --log-topic persistent://public/default/logging-function-logs \ + # Other function configs + +``` + +The message published to a log topic contains several properties: +- `loglevel`: the level of the log message. +- `fqn`: the fully qualified function name that pushes this log message. +- `instance`: the ID of the function instance that pushes this log message. diff --git a/site2/docs/functions-debug-stderr.md b/site2/docs/functions-debug-stderr.md new file mode 100644 index 0000000000000..a5659ad2b211a --- /dev/null +++ b/site2/docs/functions-debug-stderr.md @@ -0,0 +1,7 @@ +--- +id: functions-debug-stderr +title: Debug with captured stderr +sidebar_label: "Debug with captured stderr" +--- + +To debug why a function fails to start, you can find function startup information and captured stderr output in the `logs/functions/<tenant>/<namespace>/<function>/<function>-<instance>.log` file. 
diff --git a/site2/docs/functions-debug-unit-test.md b/site2/docs/functions-debug-unit-test.md new file mode 100644 index 0000000000000..c6a6afebea0f1 --- /dev/null +++ b/site2/docs/functions-debug-unit-test.md @@ -0,0 +1,71 @@ +--- +id: functions-debug-unit-test +title: Debug with unit test +sidebar_label: "Debug with unit test" +--- + + +Like any function with inputs and outputs, you can test Pulsar Functions in a similar way as you test any other function. + +:::note + +Pulsar uses TestNG for testing. + +::: + +For example, if you have the following function written through the language-native interface for Java: + +```java + +import java.util.function.Function; + +public class JavaNativeExclamationFunction implements Function<String, String> { + @Override + public String apply(String input) { + return String.format("%s!", input); + } +} + +``` + +You can write a simple unit test to test the function. + +```java + +@Test +public void testJavaNativeExclamationFunction() { + JavaNativeExclamationFunction exclamation = new JavaNativeExclamationFunction(); + String output = exclamation.apply("foo"); + Assert.assertEquals(output, "foo!"); +} + +``` + +The following example is written through the Java SDK. + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +public class ExclamationFunction implements Function<String, String> { + @Override + public String process(String input, Context context) { + return String.format("%s!", input); + } +} + +``` + +You can write a unit test to test this function and mock the `Context` parameter as follows. 
+ +```java + +@Test +public void testExclamationFunction() { + ExclamationFunction exclamation = new ExclamationFunction(); + String output = exclamation.process("foo", mock(Context.class)); + Assert.assertEquals(output, "foo!"); +} + +``` diff --git a/site2/docs/functions-debug.md b/site2/docs/functions-debug.md index 6e80b1a60d646..40217cd770266 100644 --- a/site2/docs/functions-debug.md +++ b/site2/docs/functions-debug.md @@ -1,537 +1,12 @@ --- id: functions-debug title: Debug Pulsar Functions -sidebar_label: "How-to: Debug" +sidebar_label: "How to debug" --- You can use the following methods to debug Pulsar Functions: - -* [Captured stderr](functions-debug.md#captured-stderr) -* [Use unit test](functions-debug.md#use-unit-test) -* [Debug with localrun mode](functions-debug.md#debug-with-localrun-mode) -* [Use log topic](functions-debug.md#use-log-topic) -* [Use Functions CLI](functions-debug.md#use-functions-cli) - -## Captured stderr - -Function startup information and captured stderr output is written to `logs/functions////-.log` - -This is useful for debugging why a function fails to start. - -## Use unit test - -A Pulsar Function is a function with inputs and outputs, you can test a Pulsar Function in a similar way as you test any function. - -For example, if you have the following Pulsar Function: - -```java - -import java.util.function.Function; - -public class JavaNativeExclamationFunction implements Function { - @Override - public String apply(String input) { - return String.format("%s!", input); - } -} - -``` - -You can write a simple unit test to test Pulsar Function. - -:::tip - -Pulsar uses testng for testing. 
- -::: - -```java - -@Test -public void testJavaNativeExclamationFunction() { - JavaNativeExclamationFunction exclamation = new JavaNativeExclamationFunction(); - String output = exclamation.apply("foo"); - Assert.assertEquals(output, "foo!"); -} - -``` - -The following Pulsar Function implements the `org.apache.pulsar.functions.api.Function` interface. - -```java - -import org.apache.pulsar.functions.api.Context; -import org.apache.pulsar.functions.api.Function; - -public class ExclamationFunction implements Function { - @Override - public String process(String input, Context context) { - return String.format("%s!", input); - } -} - -``` - -In this situation, you can write a unit test for this function as well. Remember to mock the `Context` parameter. The following is an example. - -:::tip - -Pulsar uses testng for testing. - -::: - -```java - -@Test -public void testExclamationFunction() { - ExclamationFunction exclamation = new ExclamationFunction(); - String output = exclamation.process("foo", mock(Context.class)); - Assert.assertEquals(output, "foo!"); -} - -``` - -## Debug with localrun mode -When you run a Pulsar Function in localrun mode, it launches an instance of the Function on your local machine as a thread. - -In this mode, a Pulsar Function consumes and produces actual data to a Pulsar cluster, and mirrors how the function actually runs in a Pulsar cluster. - -:::note - -Currently, debugging with localrun mode is only supported by Pulsar Functions written in Java. You need Pulsar version 2.4.0 or later to do the following. Even though localrun is available in versions earlier than Pulsar 2.4.0, you cannot debug with localrun mode programmatically or run Functions as threads. - -::: - -You can launch your function in the following manner. 
- -```java - -FunctionConfig functionConfig = new FunctionConfig(); -functionConfig.setName(functionName); -functionConfig.setInputs(Collections.singleton(sourceTopic)); -functionConfig.setClassName(ExclamationFunction.class.getName()); -functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); -functionConfig.setOutput(sinkTopic); - -LocalRunner localRunner = LocalRunner.builder().functionConfig(functionConfig).build(); -localRunner.start(true); - -``` - -So you can debug functions using an IDE easily. Set breakpoints and manually step through a function to debug with real data. - -The following example illustrates how to programmatically launch a function in localrun mode. - -```java - -public class ExclamationFunction implements Function { - - @Override - public String process(String s, Context context) throws Exception { - return s + "!"; - } - -public static void main(String[] args) throws Exception { - FunctionConfig functionConfig = new FunctionConfig(); - functionConfig.setName("exclamation"); - functionConfig.setInputs(Collections.singleton("input")); - functionConfig.setClassName(ExclamationFunction.class.getName()); - functionConfig.setRuntime(FunctionConfig.Runtime.JAVA); - functionConfig.setOutput("output"); - - LocalRunner localRunner = LocalRunner.builder().functionConfig(functionConfig).build(); - localRunner.start(false); -} - -``` - -To use localrun mode programmatically, add the following dependency. - -```xml - - - org.apache.pulsar - pulsar-functions-local-runner - ${pulsar.version} - - -``` - -For complete code samples, see [here](https://github.com/jerrypeng/pulsar-functions-demos/tree/master/debugging). - -:::note - -Debugging with localrun mode for Pulsar Functions written in other languages will be supported soon. - -::: - -## Use log topic - -In Pulsar Functions, you can generate log information defined in functions to a specified log topic. 
You can configure consumers to consume messages from a specified log topic to check the log information. - -![Pulsar Functions core programming model](/assets/pulsar-functions-overview.png) - -**Example** - -```java - -import org.apache.pulsar.functions.api.Context; -import org.apache.pulsar.functions.api.Function; -import org.slf4j.Logger; - -public class LoggingFunction implements Function { - @Override - public void apply(String input, Context context) { - Logger LOG = context.getLogger(); - String messageId = new String(context.getMessageId()); - - if (input.contains("danger")) { - LOG.warn("A warning was received in message {}", messageId); - } else { - LOG.info("Message {} received\nContent: {}", messageId, input); - } - - return null; - } -} - -``` - -As shown in the example above, you can get the logger via `context.getLogger()` and assign the logger to the `LOG` variable of `slf4j`, so you can define your desired log information in a function using the `LOG` variable. Meanwhile, you need to specify the topic to which the log information is produced. - -**Example** - -```bash - -$ bin/pulsar-admin functions create \ - --log-topic persistent://public/default/logging-function-logs \ - # Other function configs - -``` - -The message published to log topic contains several properties for better reasoning: -- `loglevel` -- the level of the log message. -- `fqn` -- fully qualified function name pushes this log message. -- `instance` -- the ID of the function instance pushes this log message. - -## Use Functions CLI - -With [Pulsar Functions CLI](/tools/pulsar-admin/), you can debug Pulsar Functions with the following subcommands: - -* `get` -* `status` -* `stats` -* `list` -* `trigger` - -:::tip - -For complete commands of **Pulsar Functions CLI**, see [here](/tools/pulsar-admin/)。 - -::: - -### `get` - -Get information about a Pulsar Function. 
- -**Usage** - -```bash - -$ pulsar-admin functions get options - -``` - -**Options** - -|Flag|Description -|---|--- -|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. -|`--name`|The name of a Pulsar Function. -|`--namespace`|The namespace of a Pulsar Function. -|`--tenant`|The tenant of a Pulsar Function. - -:::tip - -`--fqfn` consists of `--name`, `--namespace` and `--tenant`, so you can specify either `--fqfn` or `--name`, `--namespace` and `--tenant`. - -::: - -**Example** - -You can specify `--fqfn` to get information about a Pulsar Function. - -```bash - -$ ./bin/pulsar-admin functions get public/default/ExclamationFunctio6 - -``` - -Optionally, you can specify `--name`, `--namespace` and `--tenant` to get information about a Pulsar Function. - -```bash - -$ ./bin/pulsar-admin functions get \ - --tenant public \ - --namespace default \ - --name ExclamationFunctio6 - -``` - -As shown below, the `get` command shows input, output, runtime, and other information about the _ExclamationFunctio6_ function. - -```json - -{ - "tenant": "public", - "namespace": "default", - "name": "ExclamationFunctio6", - "className": "org.example.test.ExclamationFunction", - "inputSpecs": { - "persistent://public/default/my-topic-1": { - "isRegexPattern": false - } - }, - "output": "persistent://public/default/test-1", - "processingGuarantees": "ATLEAST_ONCE", - "retainOrdering": false, - "userConfig": {}, - "runtime": "JAVA", - "autoAck": true, - "parallelism": 1 -} - -``` - -### `status` - -Check the current status of a Pulsar Function. - -**Usage** - -```bash - -$ pulsar-admin functions status options - -``` - -**Options** - -|Flag|Description -|---|--- -|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. -|`--instance-id`|The instance ID of a Pulsar Function
If the `--instance-id` is not specified, it gets the IDs of all instances.
-|`--name`|The name of a Pulsar Function. -|`--namespace`|The namespace of a Pulsar Function. -|`--tenant`|The tenant of a Pulsar Function. - -**Example** - -```bash - -$ ./bin/pulsar-admin functions status \ - --tenant public \ - --namespace default \ - --name ExclamationFunctio6 \ - -``` - -As shown below, the `status` command shows the number of instances, running instances, the instance running under the _ExclamationFunctio6_ function, received messages, successfully processed messages, system exceptions, the average latency and so on. - -```json - -{ - "numInstances" : 1, - "numRunning" : 1, - "instances" : [ { - "instanceId" : 0, - "status" : { - "running" : true, - "error" : "", - "numRestarts" : 0, - "numReceived" : 1, - "numSuccessfullyProcessed" : 1, - "numUserExceptions" : 0, - "latestUserExceptions" : [ ], - "numSystemExceptions" : 0, - "latestSystemExceptions" : [ ], - "averageLatency" : 0.8385, - "lastInvocationTime" : 1557734137987, - "workerId" : "c-standalone-fw-23ccc88ef29b-8080" - } - } ] -} - -``` - -### `stats` - -Get the current stats of a Pulsar Function. - -**Usage** - -```bash - -$ pulsar-admin functions stats options - -``` - -**Options** - -|Flag|Description -|---|--- -|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. -|`--instance-id`|The instance ID of a Pulsar Function.
If the `--instance-id` is not specified, it gets the IDs of all instances.
-|`--name`|The name of a Pulsar Function. -|`--namespace`|The namespace of a Pulsar Function. -|`--tenant`|The tenant of a Pulsar Function. - -**Example** - -```bash - -$ ./bin/pulsar-admin functions stats \ - --tenant public \ - --namespace default \ - --name ExclamationFunctio6 \ - -``` - -The output is shown as follows: - -```json - -{ - "receivedTotal" : 1, - "processedSuccessfullyTotal" : 1, - "systemExceptionsTotal" : 0, - "userExceptionsTotal" : 0, - "avgProcessLatency" : 0.8385, - "1min" : { - "receivedTotal" : 0, - "processedSuccessfullyTotal" : 0, - "systemExceptionsTotal" : 0, - "userExceptionsTotal" : 0, - "avgProcessLatency" : null - }, - "lastInvocation" : 1557734137987, - "instances" : [ { - "instanceId" : 0, - "metrics" : { - "receivedTotal" : 1, - "processedSuccessfullyTotal" : 1, - "systemExceptionsTotal" : 0, - "userExceptionsTotal" : 0, - "avgProcessLatency" : 0.8385, - "1min" : { - "receivedTotal" : 0, - "processedSuccessfullyTotal" : 0, - "systemExceptionsTotal" : 0, - "userExceptionsTotal" : 0, - "avgProcessLatency" : null - }, - "lastInvocation" : 1557734137987, - "userMetrics" : { } - } - } ] -} - -``` - -### `list` - -List all Pulsar Functions running under a specific tenant and namespace. - -**Usage** - -```bash - -$ pulsar-admin functions list options - -``` - -**Options** - -|Flag|Description -|---|--- -|`--namespace`|The namespace of a Pulsar Function. -|`--tenant`|The tenant of a Pulsar Function. - -**Example** - -```bash - -$ ./bin/pulsar-admin functions list \ - --tenant public \ - --namespace default - -``` - -As shown below, the `list` command returns three functions running under the _public_ tenant and the _default_ namespace. - -```text - -ExclamationFunctio1 -ExclamationFunctio2 -ExclamationFunctio3 - -``` - -### `trigger` - -Trigger a specified Pulsar Function with a supplied value. This command simulates the execution process of a Pulsar Function and verifies it. 
- -**Usage** - -```bash - -$ pulsar-admin functions trigger options - -``` - -**Options** - -|Flag|Description -|---|--- -|`--fqfn`|The Fully Qualified Function Name (FQFN) of a Pulsar Function. -|`--name`|The name of a Pulsar Function. -|`--namespace`|The namespace of a Pulsar Function. -|`--tenant`|The tenant of a Pulsar Function. -|`--topic`|The topic name that a Pulsar Function consumes from. -|`--trigger-file`|The path to a file that contains the data to trigger a Pulsar Function. -|`--trigger-value`|The value to trigger a Pulsar Function. - -**Example** - -```bash - -$ ./bin/pulsar-admin functions trigger \ - --tenant public \ - --namespace default \ - --name ExclamationFunctio6 \ - --topic persistent://public/default/my-topic-1 \ - --trigger-value "hello pulsar functions" - -``` - -As shown below, the `trigger` command returns the following result: - -```text - -This is my function! - -``` - -:::note - -You must specify the [entire topic name](getting-started-pulsar.md#topic-names) when using the `--topic` option. Otherwise, the following error occurs. - -```text - -Function in trigger function has unidentified topic -Reason: Function in trigger function has unidentified topic - -``` - -::: - +* [Debug with captured stderr](functions-debug-stderr) +* [Debug with unit test](functions-debug-unit-test) +* [Debug with localrun mode](functions-debug-localrun) +* [Debug with log topic](functions-debug-log-topic) +* [Debug with Functions CLI](functions-debug-cli) \ No newline at end of file diff --git a/site2/docs/functions-deploy-arguments.md b/site2/docs/functions-deploy-arguments.md new file mode 100644 index 0000000000000..9329cf7908d4f --- /dev/null +++ b/site2/docs/functions-deploy-arguments.md @@ -0,0 +1,34 @@ +--- +id: functions-deploy-arguments +title: Default arguments of CLI +sidebar_label: "Default arguments of CLI" +--- + +You can use function-related commands in the [`pulsar-admin`](/tools/pulsar-admin/) CLI to deploy functions. 
Pulsar provides a variety of commands, such as: +* `create` command for deploying functions in [cluster mode](functions-deploy-cluster.md) +* `trigger` command for [triggering](functions-deploy-trigger.md) functions + +The following table lists the parameters required in CLI and their default values. + +| Parameter | Default value| +|----------|----------------| +| Function name | N/A
You can specify any value for the function name (except org, library, or similar class names). | +| Tenant | N/A
The value is derived from the name of the input topics. For example, if the input topic form is `persistent://marketing/{namespace}/{topicName}`, the tenant name is `marketing`.| +| Namespace | N/A
The value is derived from the input topic name. If the input topic form is `persistent://marketing/asia/{topicName}`, the namespace is `asia`.| +| Output topic | `{input topic}-{function name}-output`. For example, if an input topic name of a function is `incoming` and the function name is `exclamation`, the output topic name is `incoming-exclamation-output`.| +| [Processing guarantees](functions-concepts.md#processing-guarantees-and-subscription-types) | `ATLEAST_ONCE` | +| Pulsar service URL | `pulsar://localhost:6650`| + + +Take the `create` command for example. The following function has default values for the function name (`MyFunction`), tenant (`public`), namespace (`default`), subscription type (`SHARED`), processing guarantees (`ATLEAST_ONCE`), and Pulsar service URL (`pulsar://localhost:6650`). + +```bash + +bin/pulsar-admin functions create \ + --jar my-pulsar-functions.jar \ + --classname org.example.MyFunction \ + --inputs my-function-input-topic1,my-function-input-topic2 + +``` + + diff --git a/site2/docs/functions-deploy-cluster-encryption.md b/site2/docs/functions-deploy-cluster-encryption.md new file mode 100644 index 0000000000000..ecf843fec996d --- /dev/null +++ b/site2/docs/functions-deploy-cluster-encryption.md @@ -0,0 +1,28 @@ +--- +id: functions-deploy-cluster-encryption +title: Enable end-to-end-encryption +sidebar_label: "Enable end-to-end-encryption" +--- + +To perform end-to-end [encryption](security-encryption.md), you can specify `--producer-config` and `--input-specs` in the [`pulsar-admin`](/tools/pulsar-admin/) CLI with the public and private key pair configured by the application. Only the consumers with a valid key can decrypt the encrypted messages. + +The encryption/decryption relevant configuration [`CryptoConfig`](functions-cli.md) is included in both `ProducerConfig` and `inputSpecs`. 
The specific configurable fields about `CryptoConfig` are as follows: + +```text + +public class CryptoConfig { + private String cryptoKeyReaderClassName; + private Map cryptoKeyReaderConfig; + + private String[] encryptionKeys; + private ProducerCryptoFailureAction producerCryptoFailureAction; + + private ConsumerCryptoFailureAction consumerCryptoFailureAction; +} + +``` + +- `producerCryptoFailureAction` defines the action that a producer takes if it fails to encrypt the data. Available options are `FAIL` or `SEND`. +- `consumerCryptoFailureAction` defines the action that a consumer takes if it fails to decrypt the received data. Available options are `FAIL`, `DISCARD`, or `CONSUME`. + +For more information about these options, refer to [producer configurations](client-libraries-java.md#configure-producer) and [consumer configurations](client-libraries-java.md#configure-consumer). diff --git a/site2/docs/functions-deploy-cluster-package.md b/site2/docs/functions-deploy-cluster-package.md new file mode 100644 index 0000000000000..0295a785e6cca --- /dev/null +++ b/site2/docs/functions-deploy-cluster-package.md @@ -0,0 +1,26 @@ +--- +id: functions-deploy-cluster-package +title: Enable package management service +sidebar_label: "Enable package management service" +--- + +[Package management service](admin-api-packages) enables both version management and simplified upgrade/rollback processes for functions, sinks, and sources. When using the same function, sink, and source in different namespaces, you can upload them to a common package management system. + +With the package management service enabled, you can [upload your function package](/tools/pulsar-admin/) to the service and get the package URL. Thus you can create the function by setting `--jar`, `--py`, or `--go` to the package URL. + +By default, the package management service is disabled. To enable it in your cluster, set the properties in the `conf/broker.conf` file as follows.
+ +```conf + +enablePackagesManagement=true +packagesManagementStorageProvider=org.apache.pulsar.packages.management.storage.bookkeeper.BookKeeperPackagesStorageProvider +packagesReplicas=1 +packagesManagementLedgerRootPath=/ledgers + +``` + +:::tip + +To ensure high availability in a production deployment (a cluster with multiple brokers), set `packagesReplicas` to equal the number of bookies. The default value `1` is only for one-node cluster deployment. + +::: diff --git a/site2/docs/functions-deploy-cluster-parallelism.md b/site2/docs/functions-deploy-cluster-parallelism.md new file mode 100644 index 0000000000000..f0b1fce19891f --- /dev/null +++ b/site2/docs/functions-deploy-cluster-parallelism.md @@ -0,0 +1,51 @@ +--- +id: functions-deploy-cluster-parallelism +title: Enable parallel processing +sidebar_label: "Enable parallel processing" +--- + +In cluster mode, you can specify the **parallelism** (the number of instances to run) to enable parallel processing for a function. + +**Example 1** + +Specify the `--parallelism` flag of the `create` command when deploying a function. + +```bash + +bin/pulsar-admin functions create \ + --parallelism 3 \ + # Other function info + +``` + +:::tip + +For an existing function, you can adjust the parallelism by using the `update` command. + +::: + + +**Example 2** + +Specify the `parallelism` parameter when deploying a function configuration through YAML. + +```yaml + +# function-config.yaml +parallelism: 3 +inputs: +- persistent://public/default/input-1 +output: persistent://public/default/output-1 +# other parameters + +``` + +For an existing function, you can adjust the parallelism by using the `update` command as follows.
+ +```bash + +bin/pulsar-admin functions update \ + --function-config-file function-config.yaml + +``` + diff --git a/site2/docs/functions-deploy-cluster-resource.md b/site2/docs/functions-deploy-cluster-resource.md new file mode 100644 index 0000000000000..0c55066b15132 --- /dev/null +++ b/site2/docs/functions-deploy-cluster-resource.md @@ -0,0 +1,34 @@ +--- +id: functions-deploy-cluster-resource +title: Allocate resources to function instance +sidebar_label: "Allocate resources to function instance" +--- + +When running functions in cluster mode, you can specify the resources that can be allocated to each function instance. + +The following table outlines the resources that can be allocated to function instances. + +| Resource | Specified as | Supported runtime | +|------------|---------------------|-------------------| +| CPU | The number of cores | Kubernetes | +| RAM | The number of bytes | Kubernetes | +| Disk space | The number of bytes | Kubernetes | + +For example, the following command allocates 8 cores, 8GB of RAM, and 10GB of disk space to a function. + +```bash + +bin/pulsar-admin functions create \ + --jar target/my-functions.jar \ + --classname org.example.functions.MyFunction \ + --cpu 8 \ + --ram 8589934592 \ + --disk 10737418240 + +``` + +:::note + +The resources allocated to a given function are applied to each instance of the function. For example, if you apply 8GB of RAM to a function with a [parallelism](functions-deploy-cluster-parallelism.md) of 5, you are applying 40GB of RAM for the function in total. 
+ +::: diff --git a/site2/docs/functions-deploy-cluster.md b/site2/docs/functions-deploy-cluster.md new file mode 100644 index 0000000000000..b7476cc31d97b --- /dev/null +++ b/site2/docs/functions-deploy-cluster.md @@ -0,0 +1,38 @@ +--- +id: functions-deploy-cluster +title: Deploy a function in cluster mode +sidebar_label: "Deploy a function in cluster mode" +--- + +Deploying a function in cluster mode uploads the function to a function worker, which means the function is scheduled by the worker. + +To deploy a function in cluster mode, use the `create` command. + +```bash + +bin/pulsar-admin functions create \ + --py myfunc.py \ + --classname myfunc.SomeFunction \ + --inputs persistent://public/default/input-1 \ + --output persistent://public/default/output-1 + +``` + +To update a function running in cluster mode, you can use the `update` command. + +```bash + +bin/pulsar-admin functions update \ + --py myfunc.py \ + --classname myfunc.SomeFunction \ + --inputs persistent://public/default/new-input-topic \ + --output persistent://public/default/new-output-topic + +``` + +**More options** +* [Allocate resources to function instance](functions-deploy-cluster-resource.md) +* [Enable parallel processing](functions-deploy-cluster-parallelism.md) +* [Enable end-to-end encryption](functions-deploy-cluster-encryption.md) +* [Enable package management service](functions-deploy-cluster-package.md) + diff --git a/site2/docs/functions-deploy-localrun.md b/site2/docs/functions-deploy-localrun.md new file mode 100644 index 0000000000000..7ba87867af421 --- /dev/null +++ b/site2/docs/functions-deploy-localrun.md @@ -0,0 +1,37 @@ +--- +id: functions-deploy-localrun +title: Deploy a function in localrun mode +sidebar_label: "Deploy a function in localrun mode" +--- + +When you deploy a function in localrun mode, it runs on the machine where you enter the commands – on your laptop, for example, or in an [AWS EC2](https://aws.amazon.com/ec2/) instance. 
+ +You can use the `localrun` command to run a single instance of a function. To run multiple instances, you can use the `localrun` command multiple times. + +The following is an example of how to use the `localrun` command. + +```bash + +bin/pulsar-admin functions localrun \ + --py myfunc.py \ + --classname myfunc.SomeFunction \ + --inputs persistent://public/default/input-1 \ + --output persistent://public/default/output-1 + +``` + +:::note + +In localrun mode, Java functions use thread runtime; Python and Go functions use process runtime. + +::: + +By default, the function connects with a Pulsar cluster running on the same machine via a local broker service URL. If you want to connect it to a non-local Pulsar cluster, you can specify a different broker service URL using the `--broker-service-url` flag. + +```bash + +bin/pulsar-admin functions localrun \ + --broker-service-url pulsar://my-cluster-host:6650 \ + # Other function parameters + +``` diff --git a/site2/docs/functions-deploy-trigger.md b/site2/docs/functions-deploy-trigger.md new file mode 100644 index 0000000000000..790f528986315 --- /dev/null +++ b/site2/docs/functions-deploy-trigger.md @@ -0,0 +1,76 @@ +--- +id: functions-deploy-trigger +title: Trigger a function +sidebar_label: "Trigger a function" +--- + +Triggering a function means that you invoke a function by producing a message to one of the input topics via the CLI. You can use the `trigger` command to trigger a function at any time. + +:::tip + +With the [`pulsar-admin`](/tools/pulsar-admin/) CLI, you can send messages to functions without using the [`pulsar-client`](reference-cli-tools.md#pulsar-client) tool or a language-specific client library. + +::: + +To learn how to trigger a function, you can start with a Python function that returns a simple string based on the input as follows. + +```python + +# myfunc.py +def process(input): + return "This function has been triggered with a value of {0}".format(input) + +``` + +1.
Run the function in cluster mode. + + ```bash + + bin/pulsar-admin functions create \ + --tenant public \ + --namespace default \ + --name myfunc \ + --py myfunc.py \ + --classname myfunc \ + --inputs persistent://public/default/in \ + --output persistent://public/default/out + + ``` + +2. Assign a consumer to listen on the output topic for messages from the `myfunc` function with the [`pulsar-client consume`](reference-cli-tools.md#consume) command. + + ```bash + + bin/pulsar-client consume persistent://public/default/out \ + --subscription-name my-subscription \ + --num-messages 0 # Listen indefinitely + + ``` + +3. Trigger the function. + + ```bash + + bin/pulsar-admin functions trigger \ + --tenant public \ + --namespace default \ + --name myfunc \ + --trigger-value "hello world" + + ``` + + :::tip + + In the `trigger` command, topic info is not required. You only need to specify basic information about the function, such as tenant, namespace, and function name. + + ::: + +The consumer listening on the output topic produces something as follows in the log. + +```text + +----- got message ----- +This function has been triggered with a value of hello world + +``` + diff --git a/site2/docs/functions-deploy.md b/site2/docs/functions-deploy.md index 66895225d84c6..19c4a91ccb5e3 100644 --- a/site2/docs/functions-deploy.md +++ b/site2/docs/functions-deploy.md @@ -1,261 +1,21 @@ --- id: functions-deploy title: Deploy Pulsar Functions -sidebar_label: "How-to: Deploy" +sidebar_label: "How to deploy" --- -## Requirements +Pulsar provides two modes to deploy a function: +* [cluster mode (for production)](functions-deploy-cluster.md) - you can submit a function to a Pulsar cluster and the cluster will take charge of running the function. +* [localrun mode](functions-deploy-localrun.md) - you can determine where a function runs, for example, on your local machine.
-To deploy and manage Pulsar Functions, you need to have a Pulsar cluster running. There are several options for this: +## Prerequisites -* You can run a [standalone cluster](getting-started-standalone) locally on your own machine. -* You can deploy a Pulsar cluster on [Kubernetes](deploy-kubernetes.md), [Amazon Web Services](deploy-aws.md), [bare metal](deploy-bare-metal), DC/OS, and more. +Before deploying a function, you need to have a Pulsar cluster running first. You have the following options: +* Run a [standalone cluster](getting-started-standalone.md) locally on your own machine. +* Run a Pulsar cluster on [Kubernetes](deploy-kubernetes.md), [Amazon Web Services](deploy-aws.md), [bare metal](deploy-bare-metal.md), and so on. -If you run a non-[standalone](reference-terminology.md#standalone) cluster, you need to obtain the service URL for the cluster. How you obtain the service URL depends on how you deploy your Pulsar cluster. +:::note -If you want to deploy and trigger Python user-defined functions, you need to install [the pulsar python client](/docs/en/client-libraries-python/) on all the machines running [functions workers](functions-worker). +If you want to deploy user-defined functions in Python, you need to install the [python client](client-libraries-python.md) on all the machines running [function workers](functions-concepts.md#function-worker). -## Command-line interface - -Pulsar Functions are deployed and managed using the [`pulsar-admin functions`](/tools/pulsar-admin/) interface, which contains commands such as [`create`](/tools/pulsar-admin/) for deploying functions in [cluster mode](#cluster-mode), [`trigger`](/tools/pulsar-admin/) for [triggering](#triggering-pulsar-functions) functions, [`list`](/tools/pulsar-admin/) for listing deployed functions. - -To learn more commands, refer to [`pulsar-admin functions`](/tools/pulsar-admin/). 
- -### Default arguments - -When managing Pulsar Functions, you need to specify a variety of information about functions, including tenant, namespace, input and output topics, and so on. However, some parameters have default values if you do not specify values for them. The following table lists the default values. - -Parameter | Default -:---------|:------- -Function name | You can specify any value for the class name (except org, library, or similar class names). For example, when you specify the flag `--classname org.example.MyFunction`, the function name is `MyFunction`. -Tenant | Derived from names of the input topics. If the input topics are under the `marketing` tenant, which means the topic names have the form `persistent://marketing/{namespace}/{topicName}`, the tenant is `marketing`. -Namespace | Derived from names of the input topics. If the input topics are under the `asia` namespace under the `marketing` tenant, which means the topic names have the form `persistent://marketing/asia/{topicName}`, then the namespace is `asia`. -Output topic | `{input topic}-{function name}-output`. For example, if an input topic name of a function is `incoming`, and the function name is `exclamation`, then the name of the output topic is `incoming-exclamation-output`. -Subscription type | For `at-least-once` and `at-most-once` [processing guarantees](functions-overview.md#processing-guarantees), the [`SHARED`](concepts-messaging.md#shared) mode is applied by default; for `effectively-once` guarantees, the [`FAILOVER`](concepts-messaging.md#failover) mode is applied. -Processing guarantees | [`ATLEAST_ONCE`](functions-overview.md#processing-guarantees) -Pulsar service URL | `pulsar://localhost:6650` - -### Example of default arguments - -Take the `create` command as an example. 
- -```bash - -$ bin/pulsar-admin functions create \ - --jar my-pulsar-functions.jar \ - --classname org.example.MyFunction \ - --inputs my-function-input-topic1,my-function-input-topic2 - -``` - -The function has default values for the function name (`MyFunction`), tenant (`public`), namespace (`default`), subscription type (`SHARED`), processing guarantees (`ATLEAST_ONCE`), and Pulsar service URL (`pulsar://localhost:6650`). - -## Local run mode - -If you run a Pulsar Function in **local run** mode, it runs on the machine from which you enter the commands (on your laptop, an [AWS EC2](https://aws.amazon.com/ec2/) instance, and so on). The following is a [`localrun`](/tools/pulsar-admin/) command example. - -```bash - -$ bin/pulsar-admin functions localrun \ - --py myfunc.py \ - --classname myfunc.SomeFunction \ - --inputs persistent://public/default/input-1 \ - --output persistent://public/default/output-1 - -``` - -By default, the function connects to a Pulsar cluster running on the same machine, via a local [broker](reference-terminology.md#broker) service URL of `pulsar://localhost:6650`. If you use local run mode to run a function but connect it to a non-local Pulsar cluster, you can specify a different broker URL using the `--brokerServiceUrl` flag. The following is an example. - -```bash - -$ bin/pulsar-admin functions localrun \ - --broker-service-url pulsar://my-cluster-host:6650 \ - # Other function parameters - -``` - -## Cluster mode - -When you run a Pulsar Function in **cluster** mode, the function code is uploaded to a Pulsar broker and runs *alongside the broker* rather than in your [local environment](#local-run-mode). You can run a function in cluster mode using the [`create`](/tools/pulsar-admin/) command. 
- -```bash - -$ bin/pulsar-admin functions create \ - --py myfunc.py \ - --classname myfunc.SomeFunction \ - --inputs persistent://public/default/input-1 \ - --output persistent://public/default/output-1 - -``` - -### Update functions in cluster mode - -You can use the [`update`](/tools/pulsar-admin/) command to update a Pulsar Function running in cluster mode. The following command updates the function created in the [cluster mode](#cluster-mode) section. - -```bash - -$ bin/pulsar-admin functions update \ - --py myfunc.py \ - --classname myfunc.SomeFunction \ - --inputs persistent://public/default/new-input-topic \ - --output persistent://public/default/new-output-topic - -``` - -### Parallelism - -Pulsar Functions run as processes or threads, which are called **instances**. When you run a Pulsar Function, it runs as a single instance by default. With one localrun command, you can only run a single instance of a function. If you want to run multiple instances, you can use localrun command multiple times. - -When you create a function, you can specify the *parallelism* of a function (the number of instances to run). You can set the parallelism factor using the `--parallelism` flag of the [`create`](r/tools/pulsar-admin/) command. - -```bash - -$ bin/pulsar-admin functions create \ - --parallelism 3 \ - # Other function info - -``` - -You can adjust the parallelism of an already created function using the [`update`](/tools/pulsar-admin/) interface. - -```bash - -$ bin/pulsar-admin functions update \ - --parallelism 5 \ - # Other function - -``` - -If you specify a function configuration via YAML, use the `parallelism` parameter. The following is a config file example. - -```yaml - -# function-config.yaml -parallelism: 3 -inputs: -- persistent://public/default/input-1 -output: persistent://public/default/output-1 -# other parameters - -``` - -The following is corresponding update command. 
- -```bash - -$ bin/pulsar-admin functions update \ - --function-config-file function-config.yaml - -``` - -### Function instance resources - -When you run Pulsar Functions in [cluster mode](#cluster-mode), you can specify the resources that are assigned to each function [instance](#parallelism). - -Resource | Specified as | Runtimes -:--------|:----------------|:-------- -CPU | The number of cores | Kubernetes -RAM | The number of bytes | Process, Docker -Disk space | The number of bytes | Docker - -The following function creation command allocates 8 cores, 8 GB of RAM, and 10 GB of disk space to a function. - -```bash - -$ bin/pulsar-admin functions create \ - --jar target/my-functions.jar \ - --classname org.example.functions.MyFunction \ - --cpu 8 \ - --ram 8589934592 \ - --disk 10737418240 - -``` - -> #### Resources are *per instance* -> The resources that you apply to a given Pulsar Function are applied to each instance of the function. For example, if you apply 8 GB of RAM to a function with a parallelism of 5, you are applying 40 GB of RAM for the function in total. Make sure that you take the parallelism (the number of instances) factor into your resource calculations. - -### Use Package management service - -Package management enables version management and simplifies the upgrade and rollback processes for Functions, Sinks, and Sources. When you use the same function, sink and source in different namespaces, you can upload them to a common package management system. - -To use [Package management service](admin-api-packages), ensure that the package management service has been enabled in your cluster by setting the following properties in `broker.conf`. - -> Note: Package management service is not enabled by default. 
- -```yaml - -enablePackagesManagement=true -packagesManagementStorageProvider=org.apache.pulsar.packages.management.storage.bookkeeper.BookKeeperPackagesStorageProvider -packagesReplicas=1 -packagesManagementLedgerRootPath=/ledgers - -``` - -With Package management service enabled, you can upload your function packages by [upload a package](admin-api-packages.md#upload-a-package) to the service and get the [package URL](admin-api-packages.md#package-url). - -When you have a ready to use package URL, you can create the function with package URL by setting `--jar`, `--py`, or `--go` to the package URL with `pulsar-admin functions create`. - -## Trigger Pulsar Functions - -If a Pulsar Function is running in [cluster mode](#cluster-mode), you can **trigger** it at any time using the command line. Triggering a function means that you send a message with a specific value to the function and get the function output (if any) via the command line. - -> Triggering a function is to invoke a function by producing a message on one of the input topics. With the [`pulsar-admin functions trigger`](/tools/pulsar-admin/) command, you can send messages to functions without using the [`pulsar-client`](reference-cli-tools.md#pulsar-client) tool or a language-specific client library. - -To learn how to trigger a function, you can start with Python function that returns a simple string based on the input. - -```python - -# myfunc.py -def process(input): - return "This function has been triggered with a value of {0}".format(input) - -``` - -You can run the function in [local run mode](functions-deploy.md#local-run-mode). 
- -```bash - -$ bin/pulsar-admin functions create \ - --tenant public \ - --namespace default \ - --name myfunc \ - --py myfunc.py \ - --classname myfunc \ - --inputs persistent://public/default/in \ - --output persistent://public/default/out - -``` - -Then assign a consumer to listen on the output topic for messages from the `myfunc` function with the [`pulsar-client consume`](reference-cli-tools.md#consume) command. - -```bash - -$ bin/pulsar-client consume persistent://public/default/out \ - --subscription-name my-subscription - --num-messages 0 # Listen indefinitely - -``` - -And then you can trigger the function. - -```bash - -$ bin/pulsar-admin functions trigger \ - --tenant public \ - --namespace default \ - --name myfunc \ - --trigger-value "hello world" - -``` - -The consumer listening on the output topic produces something as follows in the log. - -``` - ------ got message ----- -This function has been triggered with a value of hello world - -``` - -> #### Topic info is not required -> In the `trigger` command, you only need to specify basic information about the function (tenant, namespace, and name). To trigger the function, you do not need to know the function input topics. +::: diff --git a/site2/docs/functions-develop-admin-api.md b/site2/docs/functions-develop-admin-api.md new file mode 100644 index 0000000000000..5d24db7c7f13e --- /dev/null +++ b/site2/docs/functions-develop-admin-api.md @@ -0,0 +1,53 @@ +--- +id: functions-develop-admin-api +title: Call Pulsar admin APIs +sidebar_label: "Call Pulsar admin APIs" +--- + +Pulsar Functions that use the Java SDK have access to the Pulsar admin client, which allows the Pulsar admin client to manage API calls to your Pulsar clusters. + +Below is an example of how to use the Pulsar admin client exposed from the function `context`. 
+ +```java + +import org.apache.pulsar.client.admin.PulsarAdmin; +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +/** + * In this particular example, for every input message, + * the function resets the cursor of the current function's subscription to a + * specified timestamp. + */ +public class CursorManagementFunction implements Function { + + @Override + public String process(String input, Context context) throws Exception { + PulsarAdmin adminClient = context.getPulsarAdmin(); + if (adminClient != null) { + String topic = context.getCurrentRecord().getTopicName().isPresent() ? + context.getCurrentRecord().getTopicName().get() : null; + String subName = context.getTenant() + "/" + context.getNamespace() + "/" + context.getFunctionName(); + if (topic != null) { + // 1578188166 below is a random-pick timestamp + adminClient.topics().resetCursor(topic, subName, 1578188166); + return "reset cursor successfully"; + } + } + return null; + } +} + +``` + +To enable your function to get access to the Pulsar admin client, you need to set `exposeAdminClientEnabled=true` in the `conf/functions_worker.yml` file. To test whether it is enabled or not, you can use the command `pulsar-admin functions localrun` with the flag `--web-service-url` as follows. 
+ +```bash + +bin/pulsar-admin functions localrun \ + --jar my-functions.jar \ + --classname my.package.CursorManagementFunction \ + --web-service-url http://pulsar-web-service:8080 \ + # Other function configs + +``` diff --git a/site2/docs/functions-develop-api.md b/site2/docs/functions-develop-api.md new file mode 100644 index 0000000000000..baa21f5fbe652 --- /dev/null +++ b/site2/docs/functions-develop-api.md @@ -0,0 +1,214 @@ +--- +id: functions-develop-api +title: Use APIs +sidebar_label: "Use APIs" +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + +The following table outlines the APIs that you can use to develop Pulsar Functions in Java, Python, and Go. + +| Interface | Description | Use case| +|---------|------------|---------| +| [Language-native interface for Java/Python](#use-language-native-interface-for-javapython) | No Pulsar-specific libraries or special dependencies required (only core libraries). | Functions that do not require access to the [context](functions-concepts.md#context).| +| [Pulsar Functions SDK for Java/Python/Go](#use-sdk-for-javapythongo) | Pulsar-specific libraries that provide a range of functionality not available in the language-native interfaces, such as state management or user configuration. | Functions that require access to the [context](functions-concepts.md#context).| +| [Extended Pulsar Functions SDK for Java](#use-extended-sdk-for-java) | An extension to Pulsar-specific libraries, providing the initialization and close interfaces in Java. | Functions that require initializing and releasing external resources.| + + +## Use language-native interface for Java/Python + +The language-native interface provides a simple and clean approach to write Java/Python functions, by adding an exclamation point to all incoming strings and publishing the output string to a topic. It has no external dependencies. + +The following examples are language-native functions. 
+ +````mdx-code-block + + + +To use a piece of Java code as a “language-native” function, you need to implement the `java.util.Function` interface. You can include any sort of complex logic inside the `apply` method to provide more processing capabilities. + +```java + +import java.util.function.Function; + +public class JavaNativeExclamationFunction implements Function { + @Override + public String apply(String input) { + return String.format("%s!", input); + } +} + +``` + +For more details, see [code example](https://github.com/apache/pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/JavaNativeExclamationFunction.java). + + + + +To use a piece of Python code as a “language-native” function, you must have a method named `process` as follows. It appends an exclamation point to any string value it receives. + +```python + +def process(input): + return "{}!".format(input) + +``` + +For more details, see [code example](https://github.com/apache/pulsar/blob/master/pulsar-functions/python-examples/native_exclamation_function.py). + +:::note + +Write Pulsar Functions in Python 3. To make sure your functions can run, you need to have Python 3 installed for functions workers and set Python 3 as the default interpreter. + +::: + + + +```` + + +## Use SDK for Java/Python/Go + +The implementation of Pulsar Functions SDK specifies a functional interface that includes the [context](functions-concepts.md#context) object as a parameter. + +The following examples use Pulsar Functions SDK for different languages. + +````mdx-code-block + + + +When developing a function using the Java SDK, you need to implement the `org.apache.pulsar.functions.api.Function` interface. It specifies only one method that you need to implement called `process`. 
+ +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +public class ExclamationFunction implements Function { + @Override + public String process(String input, Context context) { + return String.format("%s!", input); + } +} + +``` + +For more details, see [code example](https://github.com/apache/pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/ExclamationFunction.java). + + + + +To develop a function using the Python SDK, you need to add the pulsar client dependency to your Python installation. + +```python + +from pulsar import Function + +class ExclamationFunction(Function): + def __init__(self): + pass + + def process(self, input, context): + return input + '!' + +``` + +For more details, see [code example](https://github.com/apache/pulsar/blob/master/pulsar-functions/python-examples/exclamation_function.py). + + + + +To develop a function using the Go SDK, you need to add the pulsar client dependency to your Go installation and provide the name of the function to the `pf.Start()` method inside the `main()` method. This registers the function with the Pulsar Functions framework and ensures that the specified function can be invoked when a new message arrives. + +```go + +package main + +import ( + "context" + "fmt" + + "github.com/apache/pulsar/pulsar-function-go/pf" +) + +func HandleRequest(ctx context.Context, in []byte) error{ + fmt.Println(string(in) + "!") + return nil +} + +func main() { + pf.Start(HandleRequest) +} + +``` + +For more details, see [code example](https://github.com/apache/pulsar/blob/77cf09eafa4f1626a53a1fe2e65dd25f377c1127/pulsar-function-go/examples/inputFunc/inputFunc.go#L20-L36). + + + +```` + + +## Use extended SDK for Java + +This extended Pulsar Functions SDK provides two additional interfaces to initialize and release external resources. 
+- By using the `initialize` interface, you can initialize external resources which only need one-time initialization when the function instance starts. +- By using the `close` interface, you can close the referenced external resources when the function instance closes. + +:::note + +The extended Pulsar Functions SDK for Java is only available in Pulsar 2.10.0 or later versions. Before using it, you need to [set up function workers](functions-worker.md) in Pulsar 2.10.0 or later versions. + +::: + +The following example uses the extended interface of Pulsar Functions SDK for Java to initialize RedisClient when the function instance starts and release it when the function instance closes. + +````mdx-code-block + + + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import io.lettuce.core.RedisClient; + +public class InitializableFunction implements Function { + private RedisClient redisClient; + + private void initRedisClient(Map connectInfo) { + redisClient = RedisClient.create(connectInfo.get("redisURI")); + } + + @Override + public void initialize(Context context) { + Map connectInfo = context.getUserConfigMap(); + redisClient = initRedisClient(connectInfo); + } + + @Override + public String process(String input, Context context) { + String value = client.get(key); + return String.format("%s-%s", input, value); + } + + @Override + public void close() { + redisClient.close(); + } +} + +``` + + + +```` diff --git a/site2/docs/functions-develop-log.md b/site2/docs/functions-develop-log.md new file mode 100644 index 0000000000000..9ae2c2745eb6d --- /dev/null +++ b/site2/docs/functions-develop-log.md @@ -0,0 +1,193 @@ +--- +id: functions-develop-log +title: Produce function logs +sidebar_label: "Produce function logs" +--- + +## Produce logs for Java functions + +Pulsar Functions that use the Java SDK have access to an [SLF4j `Logger`](https://www.slf4j.org/api/org/apache/log4j/Logger.html) object. 
The logger object can be used to produce logs at a specified log level. + +For example, the following function logs either a `WARNING`- or `INFO`-level log based on whether the incoming string contains the word `danger`. + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +public class LoggingFunction implements Function { + @Override + public void apply(String input, Context context) { + Logger LOG = context.getLogger(); + String messageId = new String(context.getMessageId()); + + if (input.contains("danger")) { + LOG.warn("A warning was received in message {}", messageId); + } else { + LOG.info("Message {} received\nContent: {}", messageId, input); + } + + return null; + } +} + +``` + +To enable your function to produce logs, you need to specify a log topic when creating or running the function. The following is an example. + +```bash + +bin/pulsar-admin functions create \ + --jar my-functions.jar \ + --classname my.package.LoggingFunction \ + --log-topic persistent://public/default/logging-function-logs \ + # Other function configs + +``` + +You can access all the logs produced by `LoggingFunction` via the `persistent://public/default/logging-function-logs` topic. + +### Customize log levels for Java functions + +By default, the log level for Java functions is `info`. If you want to customize the log level of your Java functions, for example, change it to `debug`, you can update the [`functions_log4j2.xml`](https://github.com/apache/pulsar/blob/master/conf/functions_log4j2.xml) file. + +:::tip + +The `functions_log4j2.xml` file is under your Pulsar configuration directory, for example, `/etc/pulsar/` on bare-metal, or `/pulsar/conf` on Kubernetes. + +::: + +1. Set the value of `property`. + + ```xml + + + pulsar.log.level + debug + + + ``` + +2. Apply the log level to places where they are referenced. In the following example, `debug` applies to all function logs. 
+ + ```xml + + + ${sys:pulsar.log.level} + + ${sys:pulsar.log.appender} + ${sys:pulsar.log.level} + + + + ``` + + To be more selective, you can apply different log levels to different classes or modules. For example: + + ```xml + + + com.example.module + info + false + + ${sys:pulsar.log.appender} + + + + ``` + + To apply a more verbose log level to a class in the module, you can reference the following example: + + ```xml + + + com.example.module.className + debug + false + + Console + + + + ``` + + * `additivity` indicates whether log messages will be duplicated if multiple `` entries overlap. Disabling additivity (`false`) prevents duplication of log messages when one or more `` entries contain classes or modules that overlap. + * `AppenderRef` allows you to output the log to a target specified in the definition of the `Appender` section. For example: + + ```xml + + + Console + SYSTEM_OUT + + %d{ISO8601_OFFSET_DATE_TIME_HHMM} [%t] %-5level %logger{36} - %msg%n + + + + ``` + +## Produce logs for Python functions + +Pulsar Functions that use the Python SDK have access to a logger object. The logger object can be used to produce logs at a specified log level. + +For example, the following function logs either a `WARNING`- or `INFO`-level log based on whether the incoming string contains the word `danger`. + +```python + +from pulsar import Function + +class LoggingFunction(Function): + def process(self, input, context): + logger = context.get_logger() + msg_id = context.get_message_id() + if 'danger' in input: + logger.warn("A warning was received in message {0}".format(context.get_message_id())) + else: + logger.info("Message {0} received\nContent: {1}".format(msg_id, input)) + +``` + +To enable your function to produce logs, you need to specify a log topic when creating or running the function. The following is an example. 
+ +```bash + +bin/pulsar-admin functions create \ + --py logging_function.py \ + --classname logging_function.LoggingFunction \ + --log-topic logging-function-logs \ + # Other function configs + +``` + +All logs produced by `LoggingFunction` can be accessed via the `logging-function-logs` topic. Additionally, you can specify the function log levels through `context.get_logger().setLevel(level)`. For more information, refer to [Logging facility for Python](https://docs.python.org/3/library/logging.html#logging.Logger.setLevel) . + +## Produce logs for Go functions + +When you use `logTopic` related functionalities in Go functions, you can import `github.com/apache/pulsar/pulsar-function-go/logutil` rather than using the `getLogger()` context object. + +The following function shows different log levels based on the function input. + +```go +import ( + "context" + + "github.com/apache/pulsar/pulsar-function-go/pf" + + log "github.com/apache/pulsar/pulsar-function-go/logutil" +) + +func loggerFunc(ctx context.Context, input []byte) { + if len(input) <= 100 { + log.Infof("This input has a length of: %d", len(input)) + } else { + log.Warnf("This input is getting too long! It has {%d} characters", len(input)) + } +} + +func main() { + pf.Start(loggerFunc) +} + +``` diff --git a/site2/docs/functions-develop-metrics.md b/site2/docs/functions-develop-metrics.md new file mode 100644 index 0000000000000..fbf31c124b28c --- /dev/null +++ b/site2/docs/functions-develop-metrics.md @@ -0,0 +1,95 @@ +--- +id: functions-develop-metrics +title: Use metrics to monitor functions +sidebar_label: "Use metrics to monitor functions" +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + +To ensure that running functions are healthy at any time, you can configure functions to publish arbitrary metrics to the `metrics` interface that can be queried. 
+ +:::note + +Functions that use the language-native interface for Java or Python are **not** able to publish metrics and stats to Pulsar. + +::: + +You can use both built-in metrics and customized metrics to monitor functions. +- Use the built-in [function metrics](reference-metrics.md#pulsar-functions). + Pulsar Functions expose the metrics that can be collected and used for monitoring the health of Java, Python, and Go functions. You can check the metrics by following the [monitoring](deploy-monitoring.md#function-and-connector-stats) guide. +- Set your customized metrics. + In addition to the built-in metrics, Pulsar allows you to customize metrics for Java, Python, and Go functions. Function workers collect user-defined metrics to Prometheus automatically and you can check them in Grafana. + +Here is an example of how to customize metrics for Java, Python, and Go functions by using the [`Context object`](functions-concepts.md#context) on a per-key basis. For example, you can set a metric for the `hit-count` key every time the function processes a message and set another one for the `elevens-count` key only when the incoming value is eleven. 
+ + +````mdx-code-block + + + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; + +public class MetricRecorderFunction implements Function { + @Override + public void apply(Integer input, Context context) { + // Records the metric 1 every time a message arrives + context.recordMetric("hit-count", 1); + + // Records the metric only if the arriving number equals 11 + if (input == 11) { + context.recordMetric("elevens-count", 1); + } + + return null; + } +} + +``` + + + + +```python + +from pulsar import Function + +class MetricRecorderFunction(Function): + def process(self, input, context): + context.record_metric('hit-count', 1) + + if input == 11: + context.record_metric('elevens-count', 1) + +``` + + + + +```go + +func metricRecorderFunction(ctx context.Context, in []byte) error { + inputstr := string(in) + fctx, ok := pf.FromContext(ctx) + if !ok { + return errors.New("get Go Functions Context error") + } + fctx.RecordMetric("hit-count", 1) + if inputstr == "eleven" { + fctx.RecordMetric("elevens-count", 1) + } + return nil +} + +``` + + + +```` diff --git a/site2/docs/functions-develop-schema-registry.md b/site2/docs/functions-develop-schema-registry.md new file mode 100644 index 0000000000000..dab0505c11340 --- /dev/null +++ b/site2/docs/functions-develop-schema-registry.md @@ -0,0 +1,9 @@ +--- +id: functions-develop-schema-registry +title: Use schema registry +sidebar_label: "Use schema registry" +--- + +Pulsar has a built-in schema registry and is bundled with popular schema types, such as Avro, JSON and Protobuf. Pulsar Functions can leverage the existing schema information from input topics and derive the input type. The schema registry applies to output topics as well. + +For more details, refer to [code example](https://github.com/apache/pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/AutoSchemaFunction.java). 
diff --git a/site2/docs/functions-develop-security.md b/site2/docs/functions-develop-security.md new file mode 100644 index 0000000000000..4b0eb19766c30 --- /dev/null +++ b/site2/docs/functions-develop-security.md @@ -0,0 +1,96 @@ +--- +id: functions-develop-security +title: Enable security on functions +sidebar_label: "Enable security on functions" +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + +## Prerequisites + +If you want to enable security on functions, you need to [enable security settings](functions-worker.md#enable-security-settings) on function workers first. + + +## Configure function workers + +To use the secret APIs from the context, you need to set the following two parameters for function workers. +* `secretsProviderConfiguratorClassName` +* `secretsProviderConfiguratorConfig` + +Pulsar Functions provide two types of `SecretsProviderConfigurator` implementations, and both can be used as the value of `secretsProviderConfiguratorClassName` directly: +* `org.apache.pulsar.functions.secretsproviderconfigurator.DefaultSecretsProviderConfigurator`: This is a barebones version of a secrets provider which wires in `ClearTextSecretsProvider` to the function instances. +* `org.apache.pulsar.functions.secretsproviderconfigurator.KubernetesSecretsProviderConfigurator`: This is used by default for running in Kubernetes. It uses Kubernetes built-in secrets and binds them as environment variables (via `EnvironmentBasedSecretsProvider`) within the function container to ensure that the secrets are available to the function at runtime. + +Function workers use the `org.apache.pulsar.functions.secretsproviderconfigurator.SecretsProviderConfigurator` interface to choose the `SecretsProvider` class name and its associated configurations at the time of starting the function instances. + +Function instances use the `org.apache.pulsar.functions.secretsprovider.SecretsProvider` interface to fetch the secrets. 
The implementation that `SecretsProvider` uses is determined by `SecretsProviderConfigurator`. + +You can also implement your own `SecretsProviderConfigurator` if you want to use a different `SecretsProvider` for function instances. + +:::note + +Currently, only the Java and Python runtimes support `SecretsProvider`. The Java and Python runtimes have the following two providers: +- ClearTextSecretsProvider (default for `DefaultSecretsProviderConfigurator`) +- EnvironmentBasedSecretsProvider (default for `KubernetesSecretsProviderConfigurator`) + +::: + +## Get the secret + +Once `SecretsProviderConfigurator` is set, you can get the secret using the [`Context`](functions-concepts.md#context) object as follows. + +````mdx-code-block + + + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +public class GetSecretValueFunction implements Function { + + @Override + public Void process(String input, Context context) throws Exception { + Logger LOG = context.getLogger(); + String secretValue = context.getSecret(input); + + if (!secretValue.isEmpty()) { + LOG.info("The secret {} has value {}", input, secretValue); + } else { + LOG.warn("No secret with key {}", input); + } + + return null; + } +} + +``` + + + + +```python + +from pulsar import Function + +class GetSecretValueFunction(Function): + def process(self, input, context): + logger = context.get_logger() + secret_value = context.get_secret(input) + if secret_value is None: + logger.warn('No secret with key {0} '.format(input)) + else: + logger.info("The secret {0} has value {1}".format(input, secret_value)) + +``` + + + +```` diff --git a/site2/docs/functions-develop-serde.md b/site2/docs/functions-develop-serde.md new file mode 100644 index 0000000000000..dd48a3f7272eb --- /dev/null +++ b/site2/docs/functions-develop-serde.md @@ -0,0 +1,160 @@ +--- +id: functions-develop-serde +title: Use SerDe +sidebar_label: "Use SerDe" +--- + 
+Pulsar Functions use SerDe (**Ser**ialization and **De**serialization) when publishing data to or consuming data from Pulsar topics. How SerDe works by default depends on the language you use (Java or Python) for a particular function. In both languages, however, you can write custom SerDe logic for more complex, application-specific types. + +## Use SerDe for Java functions + +The following basic Java types are built-in and supported by default for Java functions: `string`, `double`, `integer`, `float`, `long`, `short`, and `byte`. + +To customize Java types, you need to implement the following interface. + +```java + +public interface SerDe { + T deserialize(byte[] input); + byte[] serialize(T input); +} + +``` + +SerDe works in the following ways for Java functions. +- If the input and output topics have a schema, Pulsar Functions use the schema for SerDe. +- If the input or output topics do not exist, Pulsar Functions adopt the following rules to determine SerDe: + - If the schema type is specified, Pulsar Functions use the specified schema type. + - If SerDe is specified, Pulsar Functions use the specified SerDe, and the schema type for input and output topics is `byte`. + - If neither the schema type nor SerDe is specified, Pulsar Functions use the built-in SerDe. For non-primitive schema types, the built-in SerDe serializes and deserializes objects in the `JSON` format. + +For example, imagine that you're writing a function that processes tweet objects. You can refer to the following example of the `Tweet` class in Java. + +```java + +public class Tweet { + private String username; + private String tweetContent; + + public Tweet(String username, String tweetContent) { + this.username = username; + this.tweetContent = tweetContent; + } + + // Standard setters and getters +} + +``` + +To pass `Tweet` objects directly between functions, you need to provide a custom SerDe class. 
In the example below, `Tweet` objects are basically strings, and username and tweet content are separated by `|`. + +```java + +package com.example.serde; + +import org.apache.pulsar.functions.api.SerDe; + +import java.util.regex.Pattern; + +public class TweetSerde implements SerDe { + public Tweet deserialize(byte[] input) { + String s = new String(input); + String[] fields = s.split(Pattern.quote("|")); + return new Tweet(fields[0], fields[1]); + } + + public byte[] serialize(Tweet input) { + return "%s|%s".format(input.getUsername(), input.getTweetContent()).getBytes(); + } +} + +``` + +To apply a customized SerDe to a particular function, you need to: +* Package the `Tweet` and `TweetSerde` classes into a JAR. +* Specify a path to the JAR and SerDe class name when deploying the function. + +The following is an example of using the `create` command to deploy a function by applying a customized SerDe. + +```bash + + bin/pulsar-admin functions create \ + --jar /path/to/your.jar \ + --output-serde-classname com.example.serde.TweetSerde \ + # Other function attributes + +``` + +:::note + +Custom SerDe classes must be packaged with your function JARs. + +::: + +## Use SerDe for Python functions + +In Python, the default SerDe is an identity, meaning that the type is serialized as whatever type the function returns. + +For example, you can specify the SerDe as follows when deploying a function in [cluster mode](functions-deploy-cluster.md). + +```bash + +bin/pulsar-admin functions create \ + --tenant public \ + --namespace default \ + --name my_function \ + --py my_function.py \ + --classname my_function.MyFunction \ + --custom-serde-inputs '{"input-topic-1":"Serde1","input-topic-2":"Serde2"}' \ + --output-serde-classname Serde3 \ + --output output-topic-1 + +``` + +This case contains two input topics: `input-topic-1` and `input-topic-2`, each of which is mapped to a different SerDe class (the mapping must be specified as a JSON string). 
The output topic `output-topic-1` uses the `Serde3` class for SerDe. + +:::note + +All function related logic, including processing and SerDe classes, must be contained within a single Python file. + +::: + +The table outlines three SerDe options for Python functions. + +| SerDe option | Description | Use case| +| ------------|-----------|-----------| +| `IdentitySerde` (default) | Use the [`IdentitySerde`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L70), which leaves the data unchanged. Creating or running a function without explicitly specifying SerDe means that this option is used. | When you work with simple types like strings, booleans, integers.| +| `PickleSerDe` | Use the [`PickleSerDe`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L62), which uses Python [`pickle`](https://docs.python.org/3/library/pickle.html) for SerDe. | When you work with complex, application-specific types and are comfortable with the "best-effort" approach of `pickle`.| +| `Custom SerDe` | Create a custom SerDe class by implementing the baseline [`SerDe`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L50) class, which has just two methods:
* [`serialize`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L53) for converting the object into bytes.
* [`deserialize`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L58) for converting bytes into an object of the required application-specific type. | When you require explicit control over SerDe, potentially for performance or data compatibility purposes.| + +For example, imagine that you are writing a function that processes tweet objects. You can refer to the following example of the `Tweet` class in Python. + +```python + +class Tweet(object): + def __init__(self, username, tweet_content): + self.username = username + self.tweet_content = tweet_content + +``` + +To use this class in Pulsar Functions, you have two options: +* Specify `PickleSerDe`, which applies the `pickle` library for SerDe. +* Create your own SerDe class. The following is an example. + +```python + +from pulsar import SerDe + +class TweetSerDe(SerDe): + + def serialize(self, input): + return "{0}|{1}".format(input.username, input.tweet_content).encode('utf-8') + + def deserialize(self, input_bytes): + tweet_components = input_bytes.decode('utf-8').split('|') + return Tweet(tweet_components[0], tweet_components[1]) + +``` + +For more details, see [code example](https://github.com/apache/pulsar/blob/master/pulsar-functions/python-examples/custom_object_function.py). diff --git a/site2/docs/functions-develop-state.md b/site2/docs/functions-develop-state.md new file mode 100644 index 0000000000000..2441c73389edc --- /dev/null +++ b/site2/docs/functions-develop-state.md @@ -0,0 +1,384 @@ +--- +id: functions-develop-state +title: Configure state storage +sidebar_label: "Configure state storage" +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + + +Pulsar Functions use [Apache BookKeeper](https://bookkeeper.apache.org) as a state storage interface. 
Pulsar integrates with BookKeeper [table service](https://docs.google.com/document/d/155xAwWv5IdOitHh1NVMEwCMGgB28M3FyMiQSxEpjE-Y/edit#heading=h.56rbh52koe3f) to store state for functions. For example, a `WordCount` function can store the state of its counters into BookKeeper table service via [State APIs](#call-state-apis). + +States are key-value pairs, where a key is a string and its value is arbitrary binary data - counters are stored as 64-bit big-endian binary values. Keys are scoped to an individual function, and shared between instances of that function. + +:::note + +State storage is **not** available for Go functions. + +::: + + +## Call state APIs + +Pulsar Functions expose APIs for mutating and accessing `state`. These APIs are available in the [Context](functions-concepts.md#context) object when you use [Java/Python SDK](functions-develop-api.md) to develop functions. + +The following table outlines the states that can be accessed within Java and Python functions. + +| State-related API | Java | Python | +|-----------------------------------------|----------------------------------------|----------------| +| [Increment counter](#increment-counter) | `incrCounter`
`incrCounterAsync` | `incr_counter` | +| [Retrieve counter](#retrieve-counter) | `getCounter`
`getCounterAsync` | `get_counter` | +| [Update state](#update-state) | `putState`
`putStateAsync` | `put_state` | +| [Retrieve state](#retrieve-state) | `getState`
`getStateAsync` | `get_state` | +| [Delete state](#delete-state) | `deleteState` | `del_counter` | + + +### Increment counter + +You can use `incrCounter` to increment the counter of a given `key` by the given `amount`. +If the `key` does not exist, a new key is created. + +````mdx-code-block + + + +```java + + /** + * Increment the builtin distributed counter referred by key + * @param key The name of the key + * @param amount The amount to be incremented + */ + void incrCounter(String key, long amount); + +``` + + + + +```python + + def incr_counter(self, key, amount): + """incr the counter of a given key in the managed state""" + +``` + + + +```` + +To asynchronously increment the counter, you can use `incrCounterAsync`. + +````mdx-code-block + + + +```java + + /** + * Increment the builtin distributed counter referred by key + * but don't wait for the completion of the increment operation + * + * @param key The name of the key + * @param amount The amount to be incremented + */ + CompletableFuture incrCounterAsync(String key, long amount); + +``` + + + +```` + +### Retrieve counter + +You can use `getCounter` to retrieve the counter of a given `key` mutated by `incrCounter`. + +````mdx-code-block + + + +```java + + /** + * Retrieve the counter value for the key. + * + * @param key name of the key + * @return the amount of the counter value for this key + */ + long getCounter(String key); + +``` + + + + +```python + + def get_counter(self, key): + """get the counter of a given key in the managed state""" + +``` + + + +```` + +To asynchronously retrieve the counter mutated by `incrCounterAsync`, you can use `getCounterAsync`. 
+ +````mdx-code-block + + + +```java + + /** + * Retrieve the counter value for the key, but don't wait + * for the operation to be completed + * + * @param key name of the key + * @return the amount of the counter value for this key + */ + CompletableFuture getCounterAsync(String key); + +``` + + + +```` + +### Update state + +Besides the `counter` API, Pulsar also exposes a general key/value API for functions to store and update the state of a given `key`. + +````mdx-code-block + + + +```java + + /** + * Update the state value for the key. + * + * @param key name of the key + * @param value state value of the key + */ + void putState(String key, ByteBuffer value); + +``` + + + + +```python + + def put_state(self, key, value): + """update the value of a given key in the managed state""" + +``` + + + +```` + +To asynchronously update the state of a given `key`, you can use `putStateAsync`. + +````mdx-code-block + + + +```java + + /** + * Update the state value for the key, but don't wait for the operation to be completed + * + * @param key name of the key + * @param value state value of the key + */ + CompletableFuture putStateAsync(String key, ByteBuffer value); + +``` + + + +```` + +### Retrieve state + +You can use `getState` to retrieve the state of a given `key`. + +````mdx-code-block + + + +```java + + /** + * Retrieve the state value for the key. + * + * @param key name of the key + * @return the state value for the key. + */ + ByteBuffer getState(String key); + +``` + + + + +```python + + def get_state(self, key): + """get the value of a given key in the managed state""" + +``` + + + +```` + +To asynchronously retrieve the state of a given `key`, you can use `getStateAsync`. + +````mdx-code-block + + + +```java + + /** + * Retrieve the state value for the key, but don't wait for the operation to be completed + * + * @param key name of the key + * @return the state value for the key. 
+ */ + CompletableFuture getStateAsync(String key); + +``` + + + +```` + +### Delete state + +:::note + +Both counters and binary values share the same keyspace, so this API deletes either type. + +::: + +````mdx-code-block + + + +```java + + /** + * Delete the state value for the key. + * + * @param key name of the key + */ + void deleteState(String key); + +``` + + + +```` + + +## Query state via CLI + +Besides using the [State APIs](#call-state-apis) to store the state of functions in Pulsar's state storage and retrieve it back from the storage, you can use CLI commands to query the state of functions. + +```bash + +bin/pulsar-admin functions querystate \ + --tenant <tenant> \ + --namespace <namespace> \ + --name <function-name> \ + --state-storage-url <bookkeeper-service-url> \ + --key <state-key> \ + [--watch] + +``` + +If `--watch` is specified, the CLI tool keeps running to get the latest value of the provided `state-key`. + + +## Example + +The example of `WordCountFunction` demonstrates how `state` is stored within Pulsar Functions. + +````mdx-code-block + + + + +The logic of {@inject: github:`WordCountFunction`:/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/WordCountFunction.java} is simple and straightforward: + +1. The function splits the received `String` into multiple words using regex `\\.`. +2. For each `word`, the function increments `counter` by 1 via `incrCounter(key, amount)`. + + ```java + + import org.apache.pulsar.functions.api.Context; + import org.apache.pulsar.functions.api.Function; + + import java.util.Arrays; + + public class WordCountFunction implements Function<String, Void> { + @Override + public Void process(String input, Context context) throws Exception { + Arrays.asList(input.split("\\.")).forEach(word -> context.incrCounter(word, 1)); + return null; + } + } + + ``` + + + + +The logic of this `WordCount` function is simple and straightforward: + +1. The function first splits the received string into multiple words. +2. 
For each `word`, the function increments `counter` by 1 via `incr_counter(key, amount)`. + + ```python + + from pulsar import Function + + class WordCount(Function): + def process(self, item, context): + for word in item.split(): + context.incr_counter(word, 1) + + ``` + + + +```` diff --git a/site2/docs/functions-develop-tutorial.md b/site2/docs/functions-develop-tutorial.md new file mode 100644 index 0000000000000..98f33b9f8d877 --- /dev/null +++ b/site2/docs/functions-develop-tutorial.md @@ -0,0 +1,120 @@ +--- +id: functions-develop-tutorial +title: Tutorials +sidebar_label: "Tutorials" +--- + +## Write a function for word count + +:::note + +The following example is a stateful function. By default, the state of a function is disabled. See [Enable stateful functions](functions-worker-stateful.md) for more instructions. + +::: + +1. Write the function in Java using the [SDK for Java](functions-develop-api.md). + + ```java + + package org.example.functions; + + import org.apache.pulsar.functions.api.Context; + import org.apache.pulsar.functions.api.Function; + + import java.util.Arrays; + + public class WordCountFunction implements Function { + // This function is invoked every time a message is published to the input topic + @Override + public Void process(String input, Context context) throws Exception { + Arrays.asList(input.split(" ")).forEach(word -> { + String counterKey = word.toLowerCase(); + context.incrCounter(counterKey, 1); + }); + return null; + } + } + + ``` + +2. Bundle and build the JAR file, and then deploy it in your Pulsar cluster using the `pulsar-admin` command. + + ```bash + + bin/pulsar-admin functions create \ + --jar target/my-jar-with-dependencies.jar \ + --classname org.example.functions.WordCountFunction \ + --tenant public \ + --namespace default \ + --name word-count \ + --inputs persistent://public/default/sentences \ + --output persistent://public/default/count + ``` + +## Write a function for content-based routing + +1. 
Write the function in Python using the [SDK for Python](functions-develop-api.md). + + ```python + + from pulsar import Function + + class RoutingFunction(Function): + def __init__(self): + self.fruits_topic = "persistent://public/default/fruits" + self.vegetables_topic = "persistent://public/default/vegetables" + + def is_fruit(self, item): + return item in [b"apple", b"orange", b"pear", b"other fruits..."] + + def is_vegetable(self, item): + return item in [b"carrot", b"lettuce", b"radish", b"other vegetables..."] + + def process(self, item, context): + if self.is_fruit(item): + context.publish(self.fruits_topic, item) + elif self.is_vegetable(item): + context.publish(self.vegetables_topic, item) + else: + warning = "The item {0} is neither a fruit nor a vegetable".format(item) + context.get_logger().warn(warning) + + ``` + +2. Assuming this code is stored in `~/router.py`, you can deploy it in your Pulsar cluster using the `pulsar-admin` command. + + ```bash + + bin/pulsar-admin functions create \ + --py ~/router.py \ + --classname router.RoutingFunction \ + --tenant public \ + --namespace default \ + --name route-fruit-veg \ + --inputs persistent://public/default/basket-items + + ``` + +## Write a window function for word count + +:::note + +Currently, window functions are only available in Java. + +::: + +This example demonstrates how to use the [language-native interface](functions-develop-api.md) to write a window function in Java. + +Each input message is a sentence that is split into words, and each word is counted. The built-in counter state is used to keep track of the word count in a persistent and consistent manner. 
+ +```java + +public class WordCountFunction implements Function { + @Override + public Void process(String input, Context context) { + Arrays.asList(input.split("\\s+")).forEach(word -> context.incrCounter(word, 1)); + return null; + } +} + +``` diff --git a/site2/docs/functions-develop-user-defined-configs.md b/site2/docs/functions-develop-user-defined-configs.md new file mode 100644 index 0000000000000..928aa0f9edb2f --- /dev/null +++ b/site2/docs/functions-develop-user-defined-configs.md @@ -0,0 +1,162 @@ +--- +id: functions-develop-user-defined-configs +title: Pass user-defined configurations +sidebar_label: "Pass user-defined configurations" +--- + +````mdx-code-block +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +```` + +When you run or update functions created via SDK, you can pass arbitrary key/value pairs to them by using CLI with the `--user-config` flag. Key/value pairs must be specified as JSON. + +````mdx-code-block + + + +:::note + +For all key/value pairs passed to Java functions, both keys and values are `string`. To set the value to be a different type, you need to deserialize it from the `string` type. + +::: + +The context object of Java SDK enables you to access key/value pairs provided to Pulsar Functions via CLI (as JSON). The following example passes a key/value pair. 
+ +```bash + +bin/pulsar-admin functions create \ + # Other function configs + --user-config '{"word-of-the-day":"verdure"}' + +``` + +To access that value in a Java function: + +```java + +import org.apache.pulsar.functions.api.Context; +import org.apache.pulsar.functions.api.Function; +import org.slf4j.Logger; + +import java.util.Optional; + +public class UserConfigFunction implements Function<String, Void> { + @Override + public Void process(String input, Context context) { + Logger LOG = context.getLogger(); + Optional<Object> wotd = context.getUserConfigValue("word-of-the-day"); + if (wotd.isPresent()) { + LOG.info("The word of the day is {}", wotd.get()); + } else { + LOG.warn("No word of the day provided"); + } + return null; + } +} + +``` + +The `UserConfigFunction` function logs the string `"The word of the day is verdure"` every time the function is invoked. The `word-of-the-day` config can be changed only when the function is updated with a new value via the CLI. + +You can also access the entire user config map or set a default value in case no value is present. + +```java + +// Get the whole config map +Map<String, Object> allConfigs = context.getUserConfigMap(); + +// Get value or resort to default +String wotd = (String) context.getUserConfigValueOrDefault("word-of-the-day", "perspicacious"); + +``` + + + + +In a Python function, you can access the configuration value like this. + +```python + +from pulsar import Function + +class WordFilter(Function): + def process(self, input, context): + forbidden_word = context.get_user_config_value("forbidden-word") + + # Don't publish the message if it contains the user-supplied + # forbidden word + if forbidden_word in input: + pass + # Otherwise publish the message + else: + return input + +``` + +The context object of Python SDK enables you to access key/value pairs provided to functions via the command line (as JSON). The following example passes a key/value pair. 
+ +```bash + +bin/pulsar-admin functions create \ + # Other function configs \ + --user-config '{"word-of-the-day":"verdure"}' + +``` + +To access that value in a Python function: + +```python + +from pulsar import Function + +class UserConfigFunction(Function): + def process(self, input, context): + logger = context.get_logger() + wotd = context.get_user_config_value('word-of-the-day') + if wotd is None: + logger.warn('No word of the day provided') + else: + logger.info("The word of the day is {0}".format(wotd)) + +``` + + + + +The context object of Go SDK enables you to access key/value pairs provided to functions via the command line (as JSON). The following example passes a key/value pair. + +```bash + +bin/pulsar-admin functions create \ + --go path/to/go/binary \ + --user-config '{"word-of-the-day":"lackadaisical"}' + +``` + +To access that value in a Go function: + +```go + +func contextFunc(ctx context.Context) { + fc, ok := pf.FromContext(ctx) + if !ok { + logutil.Fatal("Function context is not defined") + } + + wotd := fc.GetUserConfValue("word-of-the-day") + + if wotd == nil { + logutil.Warn("The word of the day is empty") + } else { + logutil.Infof("The word of the day is %s", wotd.(string)) + } +} + +``` + + + +```` diff --git a/site2/docs/functions-develop.md b/site2/docs/functions-develop.md index 2bd53092bb853..5ffb54032baee 100644 --- a/site2/docs/functions-develop.md +++ b/site2/docs/functions-develop.md @@ -1,1677 +1,24 @@ --- id: functions-develop title: Develop Pulsar Functions -sidebar_label: "How-to: Develop" +sidebar_label: "How to develop" --- -````mdx-code-block -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -```` - - -You learn how to develop Pulsar Functions with different APIs for Java, Python and Go. - -## Available APIs -In Java and Python, you have two options to write Pulsar Functions. In Go, you can use Pulsar Functions SDK for Go. 
- -Interface | Description | Use cases -:---------|:------------|:--------- -Language-native interface | No Pulsar-specific libraries or special dependencies required (only core libraries from Java/Python). | Functions that do not require access to the function [context](#context). -Pulsar Function SDK for Java/Python/Go | Pulsar-specific libraries that provide a range of functionality not provided by "native" interfaces. | Functions that require access to the function [context](#context). -Extended Pulsar Function SDK for Java | An extension to Pulsar-specific libraries, providing the initialization and close interfaces in Java. | Functions that require initializing and releasing external resources. - -### Language-native interface -The language-native function, which adds an exclamation point to all incoming strings and publishes the resulting string to a topic, has no external dependencies. The following example is language-native function. - -````mdx-code-block - - - -```java - -import java.util.function.Function; - -public class JavaNativeExclamationFunction implements Function { - @Override - public String apply(String input) { - return String.format("%s!", input); - } -} - -``` - -For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/JavaNativeExclamationFunction.java). - - - - -```python - -def process(input): - return "{}!".format(input) - -``` - -For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/python-examples/native_exclamation_function.py). - :::note -You can write Pulsar Functions in python2 or python3. However, Pulsar only looks for `python` as the interpreter. -If you're running Pulsar Functions on an Ubuntu system that only supports python3, you might fail to -start the functions. In this case, you can create a symlink. 
Your system will fail if -you subsequently install any other package that depends on Python 2.x. A solution is under development in [Issue 5518](https://github.com/apache/pulsar/issues/5518). - -```bash - -sudo update-alternatives --install /usr/bin/python python /usr/bin/python3 10 - -``` +Develop functions in Java, Python, or Go. +* For supported Java versions, refer to [Pulsar runtime Java version recommendation](https://github.com/apache/pulsar#pulsar-runtime-java-version-recommendation) for more details. +* For supported Python versions, refer to [Python client](client-libraries-python.md#optional-dependencies) for more details. ::: - - - -```` - -### Pulsar Function SDK for Java/Python/Go -The following example uses Pulsar Functions SDK. -````mdx-code-block - - - -```java - -import org.apache.pulsar.functions.api.Context; -import org.apache.pulsar.functions.api.Function; - -public class ExclamationFunction implements Function { - @Override - public String process(String input, Context context) { - return String.format("%s!", input); - } -} - -``` - -For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/ExclamationFunction.java). - - - - -```python - -from pulsar import Function - -class ExclamationFunction(Function): - def __init__(self): - pass - - def process(self, input, context): - return input + '!' - -``` - -For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/python-examples/exclamation_function.py). 
- - - - -```go - -package main - -import ( - "context" - "fmt" - - "github.com/apache/pulsar/pulsar-function-go/pf" -) - -func HandleRequest(ctx context.Context, in []byte) error{ - fmt.Println(string(in) + "!") - return nil -} - -func main() { - pf.Start(HandleRequest) -} - -``` - -For complete code, see [here](https://github.com/apache/pulsar/blob/77cf09eafa4f1626a53a1fe2e65dd25f377c1127/pulsar-function-go/examples/inputFunc/inputFunc.go#L20-L36). - - - - -```` - -### Extended Pulsar Function SDK for Java -This extended Pulsar Function SDK provides two additional interfaces to initialize and release external resources. -- By using the `initialize` interface, you can initialize external resources which only need one-time initialization when the function instance starts. -- By using the `close` interface, you can close the referenced external resources when the function instance closes. - -:::note - -The extended Pulsar Function SDK for Java is available in Pulsar 2.10.0 and later versions. -Before using it, you need to set up Pulsar Function worker 2.10.0 or later versions. - -::: - -The following example uses the extended interface of Pulsar Function SDK for Java to initialize RedisClient when the function instance starts and release it when the function instance closes. 
- -````mdx-code-block - - - -```java - -import org.apache.pulsar.functions.api.Context; -import org.apache.pulsar.functions.api.Function; -import io.lettuce.core.RedisClient; - -public class InitializableFunction implements Function { - private RedisClient redisClient; - - private void initRedisClient(Map connectInfo) { - redisClient = RedisClient.create(connectInfo.get("redisURI")); - } - - @Override - public void initialize(Context context) { - Map connectInfo = context.getUserConfigMap(); - redisClient = initRedisClient(connectInfo); - } - - @Override - public String process(String input, Context context) { - String value = client.get(key); - return String.format("%s-%s", input, value); - } - - @Override - public void close() { - redisClient.close(); - } -} - -``` - - - - -```` - -## Schema registry -Pulsar has a built-in schema registry and is bundled with popular schema types, such as Avro, JSON and Protobuf. Pulsar Functions can leverage the existing schema information from input topics and derive the input type. The schema registry applies for output topic as well. - -## SerDe -SerDe stands for **Ser**ialization and **De**serialization. Pulsar Functions uses SerDe when publishing data to and consuming data from Pulsar topics. How SerDe works by default depends on the language you use for a particular function. - -````mdx-code-block - - - -When you write Pulsar Functions in Java, the following basic Java types are built in and supported by default: `String`, `Double`, `Integer`, `Float`, `Long`, `Short`, and `Byte`. - -To customize Java types, you need to implement the following interface. - -```java - -public interface SerDe { - T deserialize(byte[] input); - byte[] serialize(T input); -} - -``` - -SerDe works in the following ways in Java Functions. -- If the input and output topics have schema, Pulsar Functions use schema for SerDe. 
-- If the input or output topics do not exist, Pulsar Functions adopt the following rules to determine SerDe: - - If the schema type is specified, Pulsar Functions use the specified schema type. - - If SerDe is specified, Pulsar Functions use the specified SerDe, and the schema type for input and output topics is `Byte`. - - If neither the schema type nor SerDe is specified, Pulsar Functions use the built-in SerDe. For non-primitive schema type, the built-in SerDe serializes and deserializes objects in the `JSON` format. - - - - -In Python, the default SerDe is identity, meaning that the type is serialized as whatever type the producer function returns. - -You can specify the SerDe when [creating](functions-deploy.md#cluster-mode) or [running](functions-deploy.md#local-run-mode) functions. - -```bash - -$ bin/pulsar-admin functions create \ - --tenant public \ - --namespace default \ - --name my_function \ - --py my_function.py \ - --classname my_function.MyFunction \ - --custom-serde-inputs '{"input-topic-1":"Serde1","input-topic-2":"Serde2"}' \ - --output-serde-classname Serde3 \ - --output output-topic-1 - -``` - -This case contains two input topics: `input-topic-1` and `input-topic-2`, each of which is mapped to a different SerDe class (the map must be specified as a JSON string). The output topic, `output-topic-1`, uses the `Serde3` class for SerDe. At the moment, all Pulsar Functions logic, include processing function and SerDe classes, must be contained within a single Python file. - -When using Pulsar Functions for Python, you have three SerDe options: - -1. You can use the [`IdentitySerde`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L70), which leaves the data unchanged. The `IdentitySerDe` is the **default**. Creating or running a function without explicitly specifying SerDe means that this option is used. -2. 
You can use the [`PickleSerDe`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L62), which uses Python [`pickle`](https://docs.python.org/3/library/pickle.html) for SerDe. -3. You can create a custom SerDe class by implementing the baseline [`SerDe`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L50) class, which has just two methods: [`serialize`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L53) for converting the object into bytes, and [`deserialize`](https://github.com/apache/pulsar/blob/master/pulsar-client-cpp/python/pulsar/functions/serde.py#L58) for converting bytes into an object of the required application-specific type. - -The table below shows when you should use each SerDe. - -SerDe option | When to use -:------------|:----------- -`IdentitySerde` | When you work with simple types like strings, Booleans, integers. -`PickleSerDe` | When you work with complex, application-specific types and are comfortable with the "best effort" approach of `pickle`. -Custom SerDe | When you require explicit control over SerDe, potentially for performance or data compatibility purposes. - - - - -Currently, the feature is not available in Go. - - - - -```` - -### Example -Imagine that you're writing Pulsar Functions that are processing tweet objects, you can refer to the following example of `Tweet` class. - -````mdx-code-block - - - -```java - -public class Tweet { - private String username; - private String tweetContent; - - public Tweet(String username, String tweetContent) { - this.username = username; - this.tweetContent = tweetContent; - } - - // Standard setters and getters -} - -``` - -To pass `Tweet` objects directly between Pulsar Functions, you need to provide a custom SerDe class. In the example below, `Tweet` objects are basically strings in which the username and tweet content are separated by a `|`. 
- -```java - -package com.example.serde; - -import org.apache.pulsar.functions.api.SerDe; - -import java.util.regex.Pattern; - -public class TweetSerde implements SerDe { - public Tweet deserialize(byte[] input) { - String s = new String(input); - String[] fields = s.split(Pattern.quote("|")); - return new Tweet(fields[0], fields[1]); - } - - public byte[] serialize(Tweet input) { - return "%s|%s".format(input.getUsername(), input.getTweetContent()).getBytes(); - } -} - -``` - -To apply this customized SerDe to a particular Pulsar Function, you need to: - -* Package the `Tweet` and `TweetSerde` classes into a JAR. -* Specify a path to the JAR and SerDe class name when deploying the function. - -The following is an example of [`create`](/tools/pulsar-admin/) operation. - -```bash - -$ bin/pulsar-admin functions create \ - --jar /path/to/your.jar \ - --output-serde-classname com.example.serde.TweetSerde \ - # Other function attributes - -``` - -> #### Custom SerDe classes must be packaged with your function JARs -> Pulsar does not store your custom SerDe classes separately from your Pulsar Functions. So you need to include your SerDe classes in your function JARs. If not, Pulsar returns an error. - - - - -```python - -class Tweet(object): - def __init__(self, username, tweet_content): - self.username = username - self.tweet_content = tweet_content - -``` - -In order to use this class in Pulsar Functions, you have two options: - -1. You can specify `PickleSerDe`, which applies the [`pickle`](https://docs.python.org/3/library/pickle.html) library SerDe. -2. You can create your own SerDe class. The following is an example. 
- - ```python - - from pulsar import SerDe - - class TweetSerDe(SerDe): - - def serialize(self, input): - return bytes("{0}|{1}".format(input.username, input.tweet_content)) - - def deserialize(self, input_bytes): - tweet_components = str(input_bytes).split('|') - return Tweet(tweet_components[0], tweet_componentsp[1]) - - ``` - -For complete code, see [here](https://github.com/apache/pulsar/blob/master/pulsar-functions/python-examples/custom_object_function.py). - - - - -```` - -In both languages, however, you can write custom SerDe logic for more complex, application-specific types. - -## Context -Java, Python and Go SDKs provide access to a **context object** that can be used by a function. This context object provides a wide variety of information and functionality to the function. - -* The name and ID of a Pulsar Function. -* The message ID of each message. Each Pulsar message is automatically assigned with an ID. -* The key, event time, properties and partition key of each message. -* The name of the topic to which the message is sent. -* The names of all input topics as well as the output topic associated with the function. -* The name of the class used for [SerDe](#serde). -* The [tenant](reference-terminology.md#tenant) and namespace associated with the function. -* The ID of the Pulsar Functions instance running the function. -* The version of the function. -* The [logger object](functions-develop.md#logger) used by the function, which can be used to create function log messages. -* Access to arbitrary [user configuration](#user-config) values supplied via the CLI. -* An interface for recording [metrics](#metrics). -* An interface for storing and retrieving state in [state storage](#state-storage). -* A function to publish new messages onto arbitrary topics. -* A function to ack the message being processed (if auto-ack is disabled). -* (Java) get Pulsar admin client. 
- -````mdx-code-block - - - -The [Context](https://github.com/apache/pulsar/blob/master/pulsar-functions/api-java/src/main/java/org/apache/pulsar/functions/api/Context.java) interface provides a number of methods that you can use to access the function [context](#context). The various method signatures for the `Context` interface are listed as follows. - -```java - -public interface Context { - Record getCurrentRecord(); - Collection getInputTopics(); - String getOutputTopic(); - String getOutputSchemaType(); - String getTenant(); - String getNamespace(); - String getFunctionName(); - String getFunctionId(); - String getInstanceId(); - String getFunctionVersion(); - Logger getLogger(); - void incrCounter(String key, long amount); - void incrCounterAsync(String key, long amount); - long getCounter(String key); - long getCounterAsync(String key); - void putState(String key, ByteBuffer value); - void putStateAsync(String key, ByteBuffer value); - void deleteState(String key); - ByteBuffer getState(String key); - ByteBuffer getStateAsync(String key); - Map getUserConfigMap(); - Optional getUserConfigValue(String key); - Object getUserConfigValueOrDefault(String key, Object defaultValue); - void recordMetric(String metricName, double value); - CompletableFuture publish(String topicName, O object, String schemaOrSerdeClassName); - CompletableFuture publish(String topicName, O object); - TypedMessageBuilder newOutputMessage(String topicName, Schema schema) throws PulsarClientException; - ConsumerBuilder newConsumerBuilder(Schema schema) throws PulsarClientException; - PulsarAdmin getPulsarAdmin(); - PulsarAdmin getPulsarAdmin(String clusterName); -} - -``` - -The following example uses several methods available via the `Context` object. 
- -```java - -import org.apache.pulsar.functions.api.Context; -import org.apache.pulsar.functions.api.Function; -import org.slf4j.Logger; - -import java.util.stream.Collectors; - -public class ContextFunction implements Function { - public Void process(String input, Context context) { - Logger LOG = context.getLogger(); - String inputTopics = context.getInputTopics().stream().collect(Collectors.joining(", ")); - String functionName = context.getFunctionName(); - - String logMessage = String.format("A message with a value of \"%s\" has arrived on one of the following topics: %s\n", - input, - inputTopics); - - LOG.info(logMessage); - - String metricName = String.format("function-%s-messages-received", functionName); - context.recordMetric(metricName, 1); - - return null; - } -} - -``` - - - - -``` - -class ContextImpl(pulsar.Context): - def get_message_id(self): - ... - def get_message_key(self): - ... - def get_message_eventtime(self): - ... - def get_message_properties(self): - ... - def get_current_message_topic_name(self): - ... - def get_partition_key(self): - ... - def get_function_name(self): - ... - def get_function_tenant(self): - ... - def get_function_namespace(self): - ... - def get_function_id(self): - ... - def get_instance_id(self): - ... - def get_function_version(self): - ... - def get_logger(self): - ... - def get_user_config_value(self, key): - ... - def get_user_config_map(self): - ... - def record_metric(self, metric_name, metric_value): - ... - def get_input_topics(self): - ... - def get_output_topic(self): - ... - def get_output_serde_class_name(self): - ... - def publish(self, topic_name, message, serde_class_name="serde.IdentitySerDe", - properties=None, compression_type=None, callback=None, message_conf=None): - ... - def ack(self, msgid, topic): - ... - def get_and_reset_metrics(self): - ... - def reset_metrics(self): - ... - def get_metrics(self): - ... - def incr_counter(self, key, amount): - ... - def get_counter(self, key): - ... 
- def del_counter(self, key): - ... - def put_state(self, key, value): - ... - def get_state(self, key): - ... - -``` - - - - -``` - -func (c *FunctionContext) GetInstanceID() int { - return c.instanceConf.instanceID -} - -func (c *FunctionContext) GetInputTopics() []string { - return c.inputTopics -} - -func (c *FunctionContext) GetOutputTopic() string { - return c.instanceConf.funcDetails.GetSink().Topic -} - -func (c *FunctionContext) GetFuncTenant() string { - return c.instanceConf.funcDetails.Tenant -} - -func (c *FunctionContext) GetFuncName() string { - return c.instanceConf.funcDetails.Name -} - -func (c *FunctionContext) GetFuncNamespace() string { - return c.instanceConf.funcDetails.Namespace -} - -func (c *FunctionContext) GetFuncID() string { - return c.instanceConf.funcID -} - -func (c *FunctionContext) GetFuncVersion() string { - return c.instanceConf.funcVersion -} - -func (c *FunctionContext) GetUserConfValue(key string) interface{} { - return c.userConfigs[key] -} - -func (c *FunctionContext) GetUserConfMap() map[string]interface{} { - return c.userConfigs -} - -func (c *FunctionContext) SetCurrentRecord(record pulsar.Message) { - c.record = record -} - -func (c *FunctionContext) GetCurrentRecord() pulsar.Message { - return c.record -} - -func (c *FunctionContext) NewOutputMessage(topic string) pulsar.Producer { - return c.outputMessage(topic) -} - -``` - -The following example uses several methods available via the `Context` object. - -``` - -import ( - "context" - "fmt" - - "github.com/apache/pulsar/pulsar-function-go/pf" -) - -func contextFunc(ctx context.Context) { - if fc, ok := pf.FromContext(ctx); ok { - fmt.Printf("function ID is:%s, ", fc.GetFuncID()) - fmt.Printf("function version is:%s\n", fc.GetFuncVersion()) - } -} - -``` - -For complete code, see [here](https://github.com/apache/pulsar/blob/77cf09eafa4f1626a53a1fe2e65dd25f377c1127/pulsar-function-go/examples/contextFunc/contextFunc.go#L29-L34). 
- - - - -```` - -### User config -When you run or update Pulsar Functions created using SDK, you can pass arbitrary key/values to them with the command line with the `--user-config` flag. Key/values must be specified as JSON. The following function creation command passes a user configured key/value to a function. - -```bash - -$ bin/pulsar-admin functions create \ - --name word-filter \ - # Other function configs - --user-config '{"forbidden-word":"rosebud"}' - -``` - -````mdx-code-block - - - -The Java SDK [`Context`](#context) object enables you to access key/value pairs provided to Pulsar Functions via the command line (as JSON). The following example passes a key/value pair. - -```bash - -$ bin/pulsar-admin functions create \ - # Other function configs - --user-config '{"word-of-the-day":"verdure"}' - -``` - -To access that value in a Java function: - -```java - -import org.apache.pulsar.functions.api.Context; -import org.apache.pulsar.functions.api.Function; -import org.slf4j.Logger; - -import java.util.Optional; - -public class UserConfigFunction implements Function { - @Override - public void apply(String input, Context context) { - Logger LOG = context.getLogger(); - Optional wotd = context.getUserConfigValue("word-of-the-day"); - if (wotd.isPresent()) { - LOG.info("The word of the day is {}", wotd); - } else { - LOG.warn("No word of the day provided"); - } - return null; - } -} - -``` - -The `UserConfigFunction` function will log the string `"The word of the day is verdure"` every time the function is invoked (which means every time a message arrives). The `word-of-the-day` user config will be changed only when the function is updated with a new config value via the command line. 
- -You can also access the entire user config map or set a default value in case no value is present: - -```java - -// Get the whole config map -Map allConfigs = context.getUserConfigMap(); - -// Get value or resort to default -String wotd = context.getUserConfigValueOrDefault("word-of-the-day", "perspicacious"); - -``` - -> For all key/value pairs passed to Java functions, both the key *and* the value are `String`. To set the value to be a different type, you need to deserialize from the `String` type. - - - - -In Python function, you can access the configuration value like this. - -```python - -from pulsar import Function - -class WordFilter(Function): - def process(self, context, input): - forbidden_word = context.user_config()["forbidden-word"] - - # Don't publish the message if it contains the user-supplied - # forbidden word - if forbidden_word in input: - pass - # Otherwise publish the message - else: - return input - -``` - -The Python SDK [`Context`](#context) object enables you to access key/value pairs provided to Pulsar Functions via the command line (as JSON). The following example passes a key/value pair. - -```bash - -$ bin/pulsar-admin functions create \ - # Other function configs \ - --user-config '{"word-of-the-day":"verdure"}' - -``` - -To access that value in a Python function: - -```python - -from pulsar import Function - -class UserConfigFunction(Function): - def process(self, input, context): - logger = context.get_logger() - wotd = context.get_user_config_value('word-of-the-day') - if wotd is None: - logger.warn('No word of the day provided') - else: - logger.info("The word of the day is {0}".format(wotd)) - -``` - - - - -The Go SDK [`Context`](#context) object enables you to access key/value pairs provided to Pulsar Functions via the command line (as JSON). The following example passes a key/value pair. 
- -```bash - -$ bin/pulsar-admin functions create \ - --go path/to/go/binary - --user-config '{"word-of-the-day":"lackadaisical"}' - -``` - -To access that value in a Go function: - -```go - -func contextFunc(ctx context.Context) { - fc, ok := pf.FromContext(ctx) - if !ok { - logutil.Fatal("Function context is not defined") - } - - wotd := fc.GetUserConfValue("word-of-the-day") - - if wotd == nil { - logutil.Warn("The word of the day is empty") - } else { - logutil.Infof("The word of the day is %s", wotd.(string)) - } -} - -``` - - - - -```` - -### Logger - -````mdx-code-block - - - -Pulsar Functions that use the Java SDK have access to an [SLF4j](https://www.slf4j.org/) [`Logger`](https://www.slf4j.org/api/org/apache/log4j/Logger.html) object that can be used to produce logs at the chosen log level. The following example logs either a `WARNING`- or `INFO`-level log based on whether the incoming string contains the word `danger`. - -```java - -import org.apache.pulsar.functions.api.Context; -import org.apache.pulsar.functions.api.Function; -import org.slf4j.Logger; - -public class LoggingFunction implements Function { - @Override - public void apply(String input, Context context) { - Logger LOG = context.getLogger(); - String messageId = new String(context.getMessageId()); - - if (input.contains("danger")) { - LOG.warn("A warning was received in message {}", messageId); - } else { - LOG.info("Message {} received\nContent: {}", messageId, input); - } - - return null; - } -} - -``` - -If you want your function to produce logs, you need to specify a log topic when creating or running the function. The following is an example. 
- -```bash - -$ bin/pulsar-admin functions create \ - --jar my-functions.jar \ - --classname my.package.LoggingFunction \ - --log-topic persistent://public/default/logging-function-logs \ - # Other function configs - -``` - -All logs produced by `LoggingFunction` above can be accessed via the `persistent://public/default/logging-function-logs` topic. - -#### Customize Function log level -Additionally, you can use the XML file, `functions_log4j2.xml`, to customize the function log level. -To customize the function log level, create or update `functions_log4j2.xml` in your Pulsar conf directory (for example, `/etc/pulsar/` on bare-metal, or `/pulsar/conf` on Kubernetes) to contain contents such as: - -```xml - - - pulsar-functions-instance - 30 - - - pulsar.log.appender - RollingFile - - - pulsar.log.level - debug - - - bk.log.level - debug - - - - - Console - SYSTEM_OUT - - %d{ISO8601_OFFSET_DATE_TIME_HHMM} [%t] %-5level %logger{36} - %msg%n - - - - RollingFile - ${sys:pulsar.function.log.dir}/${sys:pulsar.function.log.file}.log - ${sys:pulsar.function.log.dir}/${sys:pulsar.function.log.file}-%d{MM-dd-yyyy}-%i.log.gz - true - - %d{ISO8601_OFFSET_DATE_TIME_HHMM} [%t] %-5level %logger{36} - %msg%n - - - - 1 - true - - - 1 GB - - - 0 0 0 * * ? - - - - - ${sys:pulsar.function.log.dir} - 2 - - */${sys:pulsar.function.log.file}*log.gz - - - 30d - - - - - - BkRollingFile - ${sys:pulsar.function.log.dir}/${sys:pulsar.function.log.file}.bk - ${sys:pulsar.function.log.dir}/${sys:pulsar.function.log.file}.bk-%d{MM-dd-yyyy}-%i.log.gz - true - - %d{ISO8601_OFFSET_DATE_TIME_HHMM} [%t] %-5level %logger{36} - %msg%n - - - - 1 - true - - - 1 GB - - - 0 0 0 * * ? 
- - - - - ${sys:pulsar.function.log.dir} - 2 - - */${sys:pulsar.function.log.file}.bk*log.gz - - - 30d - - - - - - - - org.apache.pulsar.functions.runtime.shaded.org.apache.bookkeeper - ${sys:bk.log.level} - false - - BkRollingFile - - - - ${sys:pulsar.log.level} - - ${sys:pulsar.log.appender} - ${sys:pulsar.log.level} - - - - - -``` - -The properties set like: - -```xml - - - pulsar.log.level - debug - - -``` - -propagate to places where they are referenced, such as: - -```xml - - - ${sys:pulsar.log.level} - - ${sys:pulsar.log.appender} - ${sys:pulsar.log.level} - - - -``` - -In the above example, debug level logging would be applied to ALL function logs. -This may be more verbose than you desire. To be more selective, you can apply different log levels to different classes or modules. For example: - -```xml - - - com.example.module - info - false - - ${sys:pulsar.log.appender} - - - -``` - -You can be more specific as well, such as applying a more verbose log level to a class in the module, such as: - -```xml - - - com.example.module.className - debug - false - - Console - - - -``` - -Each `` entry allows you to output the log to a target specified in the definition of the Appender. - -Additivity pertains to whether log messages will be duplicated if multiple Logger entries overlap. -To disable additivity, specify - -```xml - -false - -``` - -as shown in examples above. Disabling additivity prevents duplication of log messages when one or more `` entries contain classes or modules that overlap. - -The `` is defined in the `` section, such as: - -```xml - - - Console - SYSTEM_OUT - - %d{ISO8601_OFFSET_DATE_TIME_HHMM} [%t] %-5level %logger{36} - %msg%n - - - -``` - - - - -Pulsar Functions that use the Python SDK have access to a logging object that can be used to produce logs at the chosen log level. The following example function that logs either a `WARNING`- or `INFO`-level log based on whether the incoming string contains the word `danger`. 
- -```python - -from pulsar import Function - -class LoggingFunction(Function): - def process(self, input, context): - logger = context.get_logger() - msg_id = context.get_message_id() - if 'danger' in input: - logger.warn("A warning was received in message {0}".format(context.get_message_id())) - else: - logger.info("Message {0} received\nContent: {1}".format(msg_id, input)) - -``` - -If you want your function to produce logs on a Pulsar topic, you need to specify a **log topic** when creating or running the function. The following is an example. - -```bash - -$ bin/pulsar-admin functions create \ - --py logging_function.py \ - --classname logging_function.LoggingFunction \ - --log-topic logging-function-logs \ - # Other function configs - -``` - -All logs produced by `LoggingFunction` above can be accessed via the `logging-function-logs` topic. -Additionally, you can specify the function log level through the broker XML file as described in [Customize Function log level](#customize-function-log-level). - - - - -The following Go Function example shows different log levels based on the function input. - -``` - -import ( - "context" - - "github.com/apache/pulsar/pulsar-function-go/pf" - - log "github.com/apache/pulsar/pulsar-function-go/logutil" -) - -func loggerFunc(ctx context.Context, input []byte) { - if len(input) <= 100 { - log.Infof("This input has a length of: %d", len(input)) - } else { - log.Warnf("This input is getting too long! It has {%d} characters", len(input)) - } -} - -func main() { - pf.Start(loggerFunc) -} - -``` - -When you use `logTopic` related functionalities in Go Function, import `github.com/apache/pulsar/pulsar-function-go/logutil`, and you do not have to use the `getLogger()` context object. 
- -Additionally, you can specify the function log level through the broker XML file, as described here: [Customize Function log level](#customize-function-log-level) - - - - -```` - -### Pulsar admin - -Pulsar Functions using the Java SDK has access to the Pulsar admin client, which allows the Pulsar admin client to manage API calls to current Pulsar clusters or external clusters (if `external-pulsars` is provided). - -````mdx-code-block - - - -Below is an example of how to use the Pulsar admin client exposed from the Function `context`. - -``` - -import org.apache.pulsar.client.admin.PulsarAdmin; -import org.apache.pulsar.functions.api.Context; -import org.apache.pulsar.functions.api.Function; - -/** - * In this particular example, for every input message, - * the function resets the cursor of the current function's subscription to a - * specified timestamp. - */ -public class CursorManagementFunction implements Function { - - @Override - public String process(String input, Context context) throws Exception { - PulsarAdmin adminClient = context.getPulsarAdmin(); - if (adminClient != null) { - String topic = context.getCurrentRecord().getTopicName().isPresent() ? - context.getCurrentRecord().getTopicName().get() : null; - String subName = context.getTenant() + "/" + context.getNamespace() + "/" + context.getFunctionName(); - if (topic != null) { - // 1578188166 below is a random-pick timestamp - adminClient.topics().resetCursor(topic, subName, 1578188166); - return "reset cursor successfully"; - } - } - return null; - } -} - -``` - -If you want your function to get access to the Pulsar admin client, you need to enable this feature by setting `exposeAdminClientEnabled=true` in the `functions_worker.yml` file. You can test whether this feature is enabled or not using the command `pulsar-admin functions localrun` with the flag `--web-service-url`. 
- -``` - -$ bin/pulsar-admin functions localrun \ - --jar my-functions.jar \ - --classname my.package.CursorManagementFunction \ - --web-service-url http://pulsar-web-service:8080 \ - # Other function configs - -``` - - - - -```` - -## Metrics - -Pulsar Functions allows you to deploy and manage processing functions that consume messages from and publish messages to Pulsar topics easily. It is important to ensure that the running functions are healthy at any time. Pulsar Functions can publish arbitrary metrics to the metrics interface which can be queried. - -:::note - -If a Pulsar Function uses the language-native interface for Java or Python, that function is not able to publish metrics and stats to Pulsar. - -::: - -You can monitor Pulsar Functions that have been deployed with the following methods: - -- Check the metrics provided by Pulsar. - - Pulsar Functions expose the metrics that can be collected and used for monitoring the health of **Java, Python, and Go** functions. You can check the metrics by following the [monitoring](deploy-monitoring) guide. - - For the complete list of the function metrics, see [here](reference-metrics.md#pulsar-functions). - -- Set and check your customized metrics. - - In addition to the metrics provided by Pulsar, Pulsar allows you to customize metrics for **Java and Python** functions. Function workers collect user-defined metrics to Prometheus automatically and you can check them in Grafana. - -Here are examples of how to customize metrics for Java and Python functions. - -````mdx-code-block - - - -You can record metrics using the [`Context`](#context) object on a per-key basis. For example, you can set a metric for the `process-count` key and a different metric for the `elevens-count` key every time the function processes a message. 
- -```java - -import org.apache.pulsar.functions.api.Context; -import org.apache.pulsar.functions.api.Function; - -public class MetricRecorderFunction implements Function { - @Override - public void apply(Integer input, Context context) { - // Records the metric 1 every time a message arrives - context.recordMetric("hit-count", 1); - - // Records the metric only if the arriving number equals 11 - if (input == 11) { - context.recordMetric("elevens-count", 1); - } - - return null; - } -} - -``` - - - - -You can record metrics using the [`Context`](#context) object on a per-key basis. For example, you can set a metric for the `process-count` key and a different metric for the `elevens-count` key every time the function processes a message. The following is an example. - -```python - -from pulsar import Function - -class MetricRecorderFunction(Function): - def process(self, input, context): - context.record_metric('hit-count', 1) - - if input == 11: - context.record_metric('elevens-count', 1) - -``` - - - - -The Go SDK [`Context`](#context) object enables you to record metrics on a per-key basis. For example, you can set a metric for the `process-count` key and a different metric for the `elevens-count` key every time the function processes a message: - -```go - -func metricRecorderFunction(ctx context.Context, in []byte) error { - inputstr := string(in) - fctx, ok := pf.FromContext(ctx) - if !ok { - return errors.New("get Go Functions Context error") - } - fctx.RecordMetric("hit-count", 1) - if inputstr == "eleven" { - fctx.RecordMetric("elevens-count", 1) - } - return nil -} - -``` - - - - -```` - -## Security - -If you want to enable security on Pulsar Functions, first you should enable security on [Functions Workers](functions-worker). For more details, refer to [Security settings](functions-worker.md#security-settings). 
- -Pulsar Functions can support the following providers: - -- ClearTextSecretsProvider -- EnvironmentBasedSecretsProvider - -> Pulsar Function supports ClearTextSecretsProvider by default. - -At the same time, Pulsar Functions provides two interfaces, **SecretsProvider** and **SecretsProviderConfigurator**, allowing users to customize secret provider. - -````mdx-code-block - - - -You can get secret provider using the [`Context`](#context) object. The following is an example: - -```java - -import org.apache.pulsar.functions.api.Context; -import org.apache.pulsar.functions.api.Function; -import org.slf4j.Logger; - -public class GetSecretProviderFunction implements Function { - - @Override - public Void process(String input, Context context) throws Exception { - Logger LOG = context.getLogger(); - String secretProvider = context.getSecret(input); - - if (!secretProvider.isEmpty()) { - LOG.info("The secret provider is {}", secretProvider); - } else { - LOG.warn("No secret provider"); - } - - return null; - } -} - -``` - - - - -You can get secret provider using the [`Context`](#context) object. The following is an example: - -```python - -from pulsar import Function - -class GetSecretProviderFunction(Function): - def process(self, input, context): - logger = context.get_logger() - secret_provider = context.get_secret(input) - if secret_provider is None: - logger.warn('No secret provider') - else: - logger.info("The secret provider is {0}".format(secret_provider)) - -``` - - - - -Currently, the feature is not available in Go. - - - - -```` - -## State storage -Pulsar Functions use [Apache BookKeeper](https://bookkeeper.apache.org) as a state storage interface. Pulsar installation, including the local standalone installation, includes deployment of BookKeeper bookies. 
- -Since Pulsar 2.1.0 release, Pulsar integrates with Apache BookKeeper [table service](https://docs.google.com/document/d/155xAwWv5IdOitHh1NVMEwCMGgB28M3FyMiQSxEpjE-Y/edit#heading=h.56rbh52koe3f) to store the `State` for functions. For example, a `WordCount` function can store its `counters` state into BookKeeper table service via Pulsar Functions State API. - -States are key-value pairs, where the key is a string and the value is arbitrary binary data - counters are stored as 64-bit big-endian binary values. Keys are scoped to an individual Pulsar Function, and shared between instances of that function. - -You can access states within Pulsar Java Functions using the `putState`, `putStateAsync`, `getState`, `getStateAsync`, `incrCounter`, `incrCounterAsync`, `getCounter`, `getCounterAsync` and `deleteState` calls on the context object. You can access states within Pulsar Python Functions using the `putState`, `getState`, `incrCounter`, `getCounter` and `deleteState` calls on the context object. You can also manage states using the [querystate](#query-state) and [putstate](#putstate) options to `pulsar-admin functions`. - -:::note - -State storage is not available in Go. - -::: - -### API - -````mdx-code-block - - - -Currently Pulsar Functions expose the following APIs for mutating and accessing State. These APIs are available in the [Context](functions-develop.md#context) object when you are using Java SDK functions. - -#### incrCounter - -```java - - /** - * Increment the builtin distributed counter referred by key - * @param key The name of the key - * @param amount The amount to be incremented - */ - void incrCounter(String key, long amount); - -``` - -The application can use `incrCounter` to change the counter of a given `key` by the given `amount`. 
- -#### incrCounterAsync - -```java - - /** - * Increment the builtin distributed counter referred by key - * but dont wait for the completion of the increment operation - * - * @param key The name of the key - * @param amount The amount to be incremented - */ - CompletableFuture incrCounterAsync(String key, long amount); - -``` - -The application can use `incrCounterAsync` to asynchronously change the counter of a given `key` by the given `amount`. - -#### getCounter - -```java - - /** - * Retrieve the counter value for the key. - * - * @param key name of the key - * @return the amount of the counter value for this key - */ - long getCounter(String key); - -``` - -The application can use `getCounter` to retrieve the counter of a given `key` mutated by `incrCounter`. - -Except the `counter` API, Pulsar also exposes a general key/value API for functions to store -general key/value state. - -#### getCounterAsync - -```java - - /** - * Retrieve the counter value for the key, but don't wait - * for the operation to be completed - * - * @param key name of the key - * @return the amount of the counter value for this key - */ - CompletableFuture getCounterAsync(String key); - -``` - -The application can use `getCounterAsync` to asynchronously retrieve the counter of a given `key` mutated by `incrCounterAsync`. - -#### putState - -```java - - /** - * Update the state value for the key. - * - * @param key name of the key - * @param value state value of the key - */ - void putState(String key, ByteBuffer value); - -``` - -#### putStateAsync - -```java - - /** - * Update the state value for the key, but don't wait for the operation to be completed - * - * @param key name of the key - * @param value state value of the key - */ - CompletableFuture putStateAsync(String key, ByteBuffer value); - -``` - -The application can use `putStateAsync` to asynchronously update the state of a given `key`. - -#### getState - -```java - - /** - * Retrieve the state value for the key. 
- * - * @param key name of the key - * @return the state value for the key. - */ - ByteBuffer getState(String key); - -``` - -#### getStateAsync - -```java - - /** - * Retrieve the state value for the key, but don't wait for the operation to be completed - * - * @param key name of the key - * @return the state value for the key. - */ - CompletableFuture getStateAsync(String key); - -``` - -The application can use `getStateAsync` to asynchronously retrieve the state of a given `key`. - -#### deleteState - -```java - - /** - * Delete the state value for the key. - * - * @param key name of the key - */ - -``` - -Counters and binary values share the same keyspace, so this deletes either type. - - - - -Currently Pulsar Functions expose the following APIs for mutating and accessing State. These APIs are available in the [Context](#context) object when you are using Python SDK functions. - -#### incr_counter - -```python - - def incr_counter(self, key, amount): - """incr the counter of a given key in the managed state""" - -``` - -Application can use `incr_counter` to change the counter of a given `key` by the given `amount`. -If the `key` does not exist, a new key is created. - -#### get_counter - -```python - - def get_counter(self, key): - """get the counter of a given key in the managed state""" - -``` - -Application can use `get_counter` to retrieve the counter of a given `key` mutated by `incrCounter`. - -Except the `counter` API, Pulsar also exposes a general key/value API for functions to store -general key/value state. - -#### put_state - -```python - - def put_state(self, key, value): - """update the value of a given key in the managed state""" - -``` - -The key is a string, and the value is arbitrary binary data. 
- -#### get_state - -```python - - def get_state(self, key): - """get the value of a given key in the managed state""" - -``` - -#### del_counter - -```python - - def del_counter(self, key): - """delete the counter of a given key in the managed state""" - -``` - -Counters and binary values share the same keyspace, so this deletes either type. - - - - -```` - -### Query State - -A Pulsar Function can use the [State API](#api) for storing state into Pulsar's state storage -and retrieving state back from Pulsar's state storage. Additionally Pulsar also provides -CLI commands for querying its state. - -```shell - -$ bin/pulsar-admin functions querystate \ - --tenant \ - --namespace \ - --name \ - --state-storage-url \ - --key \ - [---watch] - -``` - -If `--watch` is specified, the CLI will watch the value of the provided `state-key`. - -### Example - -````mdx-code-block - - - -{@inject: github:WordCountFunction:/pulsar-functions/java-examples/src/main/java/org/apache/pulsar/functions/api/examples/WordCountFunction.java} is a very good example -demonstrating on how Application can easily store `state` in Pulsar Functions. - -```java - -import org.apache.pulsar.functions.api.Context; -import org.apache.pulsar.functions.api.Function; - -import java.util.Arrays; - -public class WordCountFunction implements Function { - @Override - public Void process(String input, Context context) throws Exception { - Arrays.asList(input.split("\\.")).forEach(word -> context.incrCounter(word, 1)); - return null; - } -} - -``` - -The logic of this `WordCount` function is pretty simple and straightforward: - -1. The function first splits the received `String` into multiple words using regex `\\.`. -2. For each `word`, the function increments the corresponding `counter` by 1 (via `incrCounter(key, amount)`). 
- - - - -```python - -from pulsar import Function - -class WordCount(Function): - def process(self, item, context): - for word in item.split(): - context.incr_counter(word, 1) - -``` - -The logic of this `WordCount` function is pretty simple and straightforward: - -1. The function first splits the received string into multiple words on space. -2. For each `word`, the function increments the corresponding `counter` by 1 (via `incr_counter(key, amount)`). - - - - -```` +To develop Pulsar Functions, you can leverage the following tools and features. +* [APIs](functions-develop-api) +* [User-defined configs](functions-develop-user-defined-configs) +* [Logs](functions-develop-log) +* [Metrics](functions-develop-metrics) +* [Security](functions-develop-security) +* [State storage](functions-develop-state) +* [Admin APIs](functions-develop-admin-api) +* [Schema registry](functions-schema-registry) +* [SerDe](functions-develop-serde) \ No newline at end of file diff --git a/site2/docs/functions-overview.md b/site2/docs/functions-overview.md index 25e9bd7a56e52..d8054edb12707 100644 --- a/site2/docs/functions-overview.md +++ b/site2/docs/functions-overview.md @@ -4,205 +4,83 @@ title: Pulsar Functions overview sidebar_label: "Overview" --- -**Pulsar Functions** are lightweight compute processes that +This section introduces the following content: +* [What are Pulsar Functions](#what-are-pulsar-functions) +* [Why use Pulsar Functions](#why-use-pulsar-functions) +* [Use cases](#use-cases) +* [User flow](#user-flow) -* consume messages from one or more Pulsar topics, -* apply a user-supplied processing logic to each message, -* publish the results of the computation to another topic. +## What are Pulsar Functions -## Goals -With Pulsar Functions, you can create complex processing logic without deploying a separate neighboring system (such as [Apache Storm](http://storm.apache.org/), [Apache Heron](https://heron.incubator.apache.org/), [Apache Flink](https://flink.apache.org/)).
Pulsar Functions are computing infrastructure of Pulsar messaging system. The core goal is tied to a series of other goals: +Pulsar Functions are a serverless computing framework that runs on top of Pulsar and processes messages in the following way: +* consumes messages from one or more topics, +* applies a user-defined processing logic to the messages, +* publishes the outputs of the messages to other topics. -* Developer productivity (language-native vs Pulsar Functions SDK functions) -* Easy troubleshooting -* Operational simplicity (no need for an external processing system) - -## Inspirations -Pulsar Functions are inspired by (and take cues from) several systems and paradigms: - -* Stream processing engines such as [Apache Storm](http://storm.apache.org/), [Apache Heron](https://apache.github.io/incubator-heron), and [Apache Flink](https://flink.apache.org) -* "Serverless" and "Function as a Service" (FaaS) cloud platforms like [Amazon Web Services Lambda](https://aws.amazon.com/lambda/), [Google Cloud Functions](https://cloud.google.com/functions/), and [Azure Cloud Functions](https://azure.microsoft.com/en-us/services/functions/) - -Pulsar Functions can be described as - -* [Lambda](https://aws.amazon.com/lambda/)-style functions that are -* specifically designed to use Pulsar as a message bus. - -## Programming model -Pulsar Functions provide a wide range of functionality, and the core programming model is simple. Functions receive messages from one or more **input [topics](reference-terminology.md#topic)**. Each time a message is received, the function will complete the following tasks. +The following figure illustrates the computing process of a function. 
- * Apply some processing logic to the input and write output to: - * An **output topic** in Pulsar - * [Apache BookKeeper](functions-develop.md#state-storage) - * Write logs to a **log topic** (potentially for debugging purposes) - * Increment a [counter](#word-count-example) +![Pulsar Functions execute user-defined code on data published to Pulsar topics](/assets/function-overview.svg) -![Pulsar Functions core programming model](/assets/pulsar-functions-overview.png) - -You can use Pulsar Functions to set up the following processing chain: - -* A Python function listens for the `raw-sentences` topic and "sanitizes" incoming strings (removing extraneous whitespace and converting all characters to lowercase) and then publishes the results to a `sanitized-sentences` topic. -* A Java function listens for the `sanitized-sentences` topic, counts the number of times each word appears within a specified time window, and publishes the results to a `results` topic -* Finally, a Python function listens for the `results` topic and writes the results to a MySQL table. - - -### Word count example +A function receives messages from one or more **input topics**. Each time messages are received, the function completes the following steps: +1. Consumes the messages in the input topics. +2. Applies a customized processing logic to the messages and: + a) writes output messages to an **output topic** in Pulsar + b) writes logs to a **log topic** if it is configured (for debugging purposes) + c) writes [state](functions-develop-state) to BookKeeper (if it is configured) -If you implement the classic word count example using Pulsar Functions, it looks something like this: -![Pulsar Functions word count example](/assets/pulsar-functions-word-count.png) +You can write functions in Java, Python, and Go. 
For example, you can use Pulsar Functions to set up the following processing chain: +* A Python function listens for the `raw-sentences` topic and "sanitizes" incoming strings (removing extraneous white space and converting all characters to lowercase) and then publishes the results to a `sanitized-sentences` topic. +* A Java function listens for the `sanitized-sentences` topic, counts the number of times each word appears within a specified time [window](functions-concepts.md#window-function), and publishes the results to a `results` topic. +* A Python function listens for the `results` topic and writes the results to a MySQL table. -To write the function in Java with [Pulsar Functions SDK for Java](functions-develop.md#available-apis), you can write the function as follows. +See [Develop Pulsar Functions](functions-develop.md) for more details. -```java -package org.example.functions; +## Why use Pulsar Functions -import org.apache.pulsar.functions.api.Context; -import org.apache.pulsar.functions.api.Function; +Pulsar Functions provide the capabilities to perform simple computations on the messages before they are routed to consumers. -import java.util.Arrays; +Pulsar Functions can be characterized as Lambda-style functions that are specifically designed and integrated with Pulsar as the underlying message bus. The framework of Pulsar Functions provides a simple computing framework on your Pulsar cluster and takes care of the underlying details of sending/receiving messages. You only need to focus on the business logic and run it as Pulsar Functions to maximize the value of your data and enjoy the benefits of: +* Simplified deployment and operations - you can create a data pipeline without deploying a separate Stream Processing Engine (SPE), such as [Apache Storm](http://storm.apache.org/), [Apache Heron](https://heron.incubator.apache.org/), or [Apache Flink](https://flink.apache.org/). 
+* Serverless computing (when Kubernetes runtime is used) +* Maximized developer productivity (both language-native interfaces and SDKs for Java/Python/Go). +* Easy troubleshooting -public class WordCountFunction implements Function { - // This function is invoked every time a message is published to the input topic - @Override - public Void process(String input, Context context) throws Exception { - Arrays.asList(input.split(" ")).forEach(word -> { - String counterKey = word.toLowerCase(); - context.incrCounter(counterKey, 1); - }); - return null; - } -} -``` +## Use cases -Bundle and build the JAR file to be deployed, and then deploy it in your Pulsar cluster using the [command line](functions-deploy.md#command-line-interface) as follows. +Here are two real-world use cases to help you understand the capabilities of Pulsar Functions and what they can be used for. -```bash +### Word count example -$ bin/pulsar-admin functions create \ - --jar target/my-jar-with-dependencies.jar \ - --classname org.example.functions.WordCountFunction \ - --tenant public \ - --namespace default \ - --name word-count \ - --inputs persistent://public/default/sentences \ - --output persistent://public/default/count +This figure illustrates the process of implementing the classic word count example using Pulsar Functions. It calculates a sum of the occurrences of every individual word published to a given topic. -``` +![Word count example using Pulsar Functions](/assets/pulsar-functions-word-count.png) ### Content-based routing example -Pulsar Functions are used in many cases. The following is a sophisticated example that involves content-based routing. - -For example, a function takes items (strings) as input and publishes them to either a `fruits` or `vegetables` topic, depending on the item. Or, if an item is neither fruit nor vegetable, a warning is logged to a [log topic](functions-develop.md#logger). The following is a visual representation. 
- -![Pulsar Functions routing example](/assets/pulsar-functions-routing-example.png) - -If you implement this routing functionality in Python, it looks something like this: - -```python - -from pulsar import Function - -class RoutingFunction(Function): - def __init__(self): - self.fruits_topic = "persistent://public/default/fruits" - self.vegetables_topic = "persistent://public/default/vegetables" - - @staticmethod - def is_fruit(item): - return item in [b"apple", b"orange", b"pear", b"other fruits..."] - - @staticmethod - def is_vegetable(item): - return item in [b"carrot", b"lettuce", b"radish", b"other vegetables..."] - - def process(self, item, context): - if self.is_fruit(item): - context.publish(self.fruits_topic, item) - elif self.is_vegetable(item): - context.publish(self.vegetables_topic, item) - else: - warning = "The item {0} is neither a fruit nor a vegetable".format(item) - context.get_logger().warn(warning) - -``` - -If this code is stored in `~/router.py`, then you can deploy it in your Pulsar cluster using the [command line](functions-deploy.md#command-line-interface) as follows. - -```bash - -$ bin/pulsar-admin functions create \ - --py ~/router.py \ - --classname router.RoutingFunction \ - --tenant public \ - --namespace default \ - --name route-fruit-veg \ - --inputs persistent://public/default/basket-items - -``` - -### Functions, messages and message types -Pulsar Functions take byte arrays as inputs and spit out byte arrays as output. However in languages that support typed interfaces(Java), you can write typed Functions, and bind messages to types in the following ways. -* [Schema Registry](functions-develop.md#schema-registry) -* [SerDe](functions-develop.md#serde) - - -## Fully Qualified Function Name (FQFN) -Each Pulsar Function has a **Fully Qualified Function Name** (FQFN) that consists of three elements: the function tenant, namespace, and function name. 
FQFN looks like this: - -```http - -tenant/namespace/name - -``` - -FQFNs enable you to create multiple functions with the same name provided that they are in different namespaces. - -## Supported languages -Currently, you can write Pulsar Functions in Java, Python, and Go. For details, refer to [Develop Pulsar Functions](functions-develop). - -## Processing guarantees -Pulsar Functions provide three different messaging semantics that you can apply to any function. - -Delivery semantics | Description -:------------------|:------- -**At-most-once** delivery | Each message sent to the function is likely to be processed, or not to be processed (hence "at most"). -**At-least-once** delivery | Each message sent to the function can be processed more than once (hence the "at least"). -**Effectively-once** delivery | Each message sent to the function will have one output associated with it. - - -### Apply processing guarantees to a function -You can set the processing guarantees for a Pulsar Function when you create the Function. The following [`pulsar-function create`](/tools/pulsar-admin/) command creates a function with effectively-once guarantees applied. - -```bash - -$ bin/pulsar-admin functions create \ - --name my-effectively-once-function \ - --processing-guarantees EFFECTIVELY_ONCE \ - # Other function configs - -``` - -The available options for `--processing-guarantees` are: +For example, a function takes items (strings) as input and publishes them to either a `fruits` or `vegetables` topic, depending on the item. If an item is neither fruit nor vegetable, a warning is logged to a [log topic](functions-develop-log.md). -* `ATMOST_ONCE` -* `ATLEAST_ONCE` -* `EFFECTIVELY_ONCE` +This figure demonstrates the process of implementing a content-based routing using Pulsar Functions. -> By default, Pulsar Functions provide at-least-once delivery guarantees. 
So if you create a function without supplying a value for the `--processingGuarantees` flag, the function provides at-least-once guarantees. +![Content-based routing example using Pulsar Functions](/assets/pulsar-functions-routing-example.png) -### Update the processing guarantees of a function -You can change the processing guarantees applied to a function using the [`update`](/tools/pulsar-admin/) command. The following is an example. +## User flow -```bash +**Admins/operators** +1. [Set up function workers](functions-worker.md). +2. [Configure function runtime](functions-runtime.md). +3. [Deploy a function](functions-deploy.md). -$ bin/pulsar-admin functions update \ - --processing-guarantees ATMOST_ONCE \ - # Other function configs +**Developers** +1. [Develop a function](functions-develop.md). +2. [Debug a function](functions-debug.md). +3. [Package a function](functions-package.md). +4. [Deploy a function](functions-deploy.md). -``` +**More reference** +* [Function concepts](functions-concepts.md) +* [Function CLIs and configs](functions-cli.md) diff --git a/site2/docs/functions-package-go.md b/site2/docs/functions-package-go.md new file mode 100644 index 0000000000000..ecef65fc5cf03 --- /dev/null +++ b/site2/docs/functions-package-go.md @@ -0,0 +1,55 @@ +--- +id: functions-package-go +title: Package Go Functions +sidebar_label: "Package Go Functions" +--- + +:::note + +Currently, Go functions can be implemented only using SDK and the interface of functions is exposed in the form of SDK. Before using Go functions, you need to import `github.com/apache/pulsar/pulsar-function-go/pf`. + +::: + +To package a Go function, complete the following steps. + +1. Prepare a Go function file. +2. Build the Go function. + + ```go + + go build .go + + ``` + +3. Copy the Go function file to the Pulsar image. + + ```bash + + docker exec -it [CONTAINER ID] /bin/bash + docker cp CONTAINER ID:/pulsar + + ``` + + 4. Run the Go function with the following command.
+ + ```bash + + bin/pulsar-admin functions localrun \ + --go [your go function path] \ + --inputs [input topics] \ + --output [output topic] \ + --tenant [default:public] \ + --namespace [default:default] \ + --name [custom unique go function name] + + ``` + + The following log indicates that the Go function starts successfully. + + ```text + + ... + 07:55:03.724 [main] INFO org.apache.pulsar.functions.runtime.ProcessRuntime - Started process successfully + ... + + ``` diff --git a/site2/docs/functions-package-java.md b/site2/docs/functions-package-java.md new file mode 100644 index 0000000000000..0c5c67b7ecb5b --- /dev/null +++ b/site2/docs/functions-package-java.md @@ -0,0 +1,116 @@ +--- +id: functions-package-java +title: Package Java Functions +sidebar_label: "Package Java Functions" +--- + +:::note + +For the runtime Java version, refer to [Pulsar Runtime Java Version Recommendation](https://github.com/apache/pulsar/blob/master/README.md#pulsar-runtime-java-version-recommendation) according to your target Pulsar version. + +::: + +To package a Java function, complete the following steps. + +1. Create a new maven project with a pom file. In the following code sample, the value of `mainClass` is your package name. + + ```java + + + + 4.0.0 + + java-function + java-function + 1.0-SNAPSHOT + + + + org.apache.pulsar + pulsar-functions-api + 2.10.0 + + + + + + + maven-assembly-plugin + + false + + jar-with-dependencies + + + + org.example.test.ExclamationFunction + + + + + + make-assembly + package + + assembly + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + 17 + + + + + + + + ``` + +2. Package your Java function. + + ```bash + + mvn package + + ``` + + After the Java function is packaged, a `target` directory is created automatically. Open the `target` directory to check if there is a JAR package similar to `java-function-1.0-SNAPSHOT.jar`. + +3. Copy the packaged jar file to the Pulsar image.
+ + ```bash + + docker exec -it [CONTAINER ID] /bin/bash + docker cp CONTAINER ID:/pulsar + + ``` + +4. Run the Java function using the following command. + + ```bash + + ./bin/pulsar-admin functions localrun \ + --classname org.example.test.ExclamationFunction \ + --jar java-function-1.0-SNAPSHOT.jar \ + --inputs persistent://public/default/my-topic-1 \ + --output persistent://public/default/test-1 \ + --tenant public \ + --namespace default \ + --name JavaFunction + + ``` + + The following log indicates that the Java function starts successfully. + + ```text + ... + 07:55:03.724 [main] INFO org.apache.pulsar.functions.runtime.ProcessRuntime - Started process successfully + ... + ``` diff --git a/site2/docs/functions-package-python.md b/site2/docs/functions-package-python.md new file mode 100644 index 0000000000000..290798480c9a1 --- /dev/null +++ b/site2/docs/functions-package-python.md @@ -0,0 +1,203 @@ +--- +id: functions-package-python +title: Package Python Functions +sidebar_label: "Package Python Functions" +--- + +Python functions support the following three packaging formats: +- One Python file +- ZIP file +- PIP + +## One Python file + +To package a Python function into **one Python file**, complete the following steps. + +1. Write a Python function. + + ```python + + from pulsar import Function # import the Function module from Pulsar + + # The classic ExclamationFunction that appends an exclamation at the end + # of the input + class ExclamationFunction(Function): + def __init__(self): + pass + + def process(self, input, context): + return input + '!' + + ``` + + In this example, when you write a Python function, you need to inherit the Function class and implement the `process()` method. + + `process()` mainly has two parameters: + + - `input` represents your input. + + - `context` represents an interface exposed by the Pulsar Function. You can get the attributes in the Python function based on the provided context object. + +2.
Install a Python client. The implementation of a Python function depends on the Python client. + + ```bash + + pip install pulsar-client==2.10.0 + + ``` + +3. Copy the Python function file to the Pulsar image. + + ```bash + + docker exec -it [CONTAINER ID] /bin/bash + docker cp CONTAINER ID:/pulsar + + ``` + +4. Run the Python function using the following command. + + ```bash + + ./bin/pulsar-admin functions localrun \ + --classname . \ + --py \ + --inputs persistent://public/default/my-topic-1 \ + --output persistent://public/default/test-1 \ + --tenant public \ + --namespace default \ + --name PythonFunction + + ``` + + The following log indicates that the Python function starts successfully. + + ```text + + ... + 07:55:03.724 [main] INFO org.apache.pulsar.functions.runtime.ProcessRuntime - Started process successfully + ... + + ``` + +## ZIP file + +To package a Python function into a **ZIP file**, complete the following steps. + +1. Prepare the ZIP file. + + ```text + + Assuming the zip file is named as `func.zip`, unzip the `func.zip` folder: + "func/src" + "func/requirements.txt" + "func/deps" + + ``` + + Take the [exclamation.zip](https://github.com/apache/pulsar/tree/master/tests/docker-images/latest-version-image/python-examples) file as an example. The internal structure of the example is as follows. + + ```text + + . + ├── deps + │ └── sh-1.12.14-py2.py3-none-any.whl + └── src + └── exclamation.py + + ``` + +2. Copy the ZIP file to the Pulsar image. + + ```bash + + docker exec -it [CONTAINER ID] /bin/bash + docker cp CONTAINER ID:/pulsar + + ``` + +3. Run the Python function using the following command. + + ```bash + + ./bin/pulsar-admin functions localrun \ + --classname exclamation \ + --py \ + --inputs persistent://public/default/in-topic \ + --output persistent://public/default/out-topic \ + --tenant public \ + --namespace default \ + --name PythonFunction + + ``` + + The following log indicates that the Python function starts successfully.
+ + ```text + + ... + 07:55:03.724 [main] INFO org.apache.pulsar.functions.runtime.ProcessRuntime - Started process successfully + ... + + ``` + +## PIP + +:::note + +The PIP method is only supported in Kubernetes runtime. + +::: + +To package a Python function with **PIP**, complete the following steps. + +1. Configure the `functions_worker.yml` file. + + ```text + + #### Kubernetes Runtime #### + installUserCodeDependencies: true + + ``` + +2. Write your Python Function. + + ```python + + from pulsar import Function + import js2xml + + # The classic ExclamationFunction that appends an exclamation at the end + # of the input + class ExclamationFunction(Function): + def __init__(self): + pass + + def process(self, input, context): + # add your logic + return input + '!' + + ``` + + You can introduce additional dependencies. When Python functions detect that the file currently used is `whl` and the `installUserCodeDependencies` parameter is specified, the system uses the `pip install` command to install the dependencies required in Python functions. + +3. Generate the `whl` file. + + ```shell + + $ cd $PULSAR_HOME/pulsar-functions/scripts/python + $ chmod +x generate.sh + $ ./generate.sh + # e.g: ./generate.sh /path/to/python /path/to/python/output 1.0.0 + + ``` + + The output is written in `/path/to/python/output`: + + ```text + + -rw-r--r-- 1 root staff 1.8K 8 27 14:29 pulsarfunction-1.0.0-py2-none-any.whl + -rw-r--r-- 1 root staff 1.4K 8 27 14:29 pulsarfunction-1.0.0.tar.gz + -rw-r--r-- 1 root staff 0B 8 27 14:29 pulsarfunction.whl + + ``` diff --git a/site2/docs/functions-package.md b/site2/docs/functions-package.md index e529f19992148..2d68ad902f784 100644 --- a/site2/docs/functions-package.md +++ b/site2/docs/functions-package.md @@ -1,497 +1,19 @@ --- id: functions-package title: Package Pulsar Functions -sidebar_label: "How-to: Package" +sidebar_label: "How to package" --- -You can package Pulsar functions in Java, Python, and Go.
Packaging the window function in Java is the same as [packaging a function in Java](#java). - -:::note - -Currently, the window function is not available in Python and Go. - -::: +If you want to submit and run functions in cluster mode, you need to package your functions first. ## Prerequisite -Before running a Pulsar function, you need to start Pulsar. You can [run a standalone Pulsar in Docker](getting-started-docker.md), or [run Pulsar in Kubernetes](getting-started-helm). - -To check whether the Docker image starts, you can use the `docker ps` command. - -## Java - -:::note - -For the runtime Java version, please refer to [Pulsar Runtime Java Version Recommendation](https://github.com/apache/pulsar/blob/master/README.md#pulsar-runtime-java-version-recommendation) according to your target Pulsar version. - -::: - -To package a function in Java, complete the following steps. - -1. Create a new maven project with a pom file. In the following code sample, the value of `mainClass` is your package name. - - ```java - - - - 4.0.0 - - java-function - java-function - 1.0-SNAPSHOT - - - - org.apache.pulsar - pulsar-functions-api - 2.6.0 - - - - - - - maven-assembly-plugin - - false - - jar-with-dependencies - - - - org.example.test.ExclamationFunction - - - - - - make-assembly - package - - assembly - - - - - - org.apache.maven.plugins - maven-compiler-plugin - - 17 - - - - - - - - ``` - -2. Write a Java function. 
- - ``` - - package org.example.test; - - import java.util.function.Function; - - public class ExclamationFunction implements Function { - @Override - public String apply(String s) { - return "This is my function!"; - } - } - - ``` - - For the imported package, you can use one of the following interfaces: - - Function interface provided by Java 8: `java.util.function.Function` - - Pulsar Function interface: `org.apache.pulsar.functions.api.Function` - - The main difference between the two interfaces is that the `org.apache.pulsar.functions.api.Function` interface provides the context interface. When you write a function and want to interact with it, you can use context to obtain a wide variety of information and functionality for Pulsar Functions. - - The following example uses `org.apache.pulsar.functions.api.Function` interface with context. - - ``` - - package org.example.functions; - import org.apache.pulsar.functions.api.Context; - import org.apache.pulsar.functions.api.Function; - - import java.util.Arrays; - public class WordCountFunction implements Function { - // This function is invoked every time a message is published to the input topic - @Override - public Void process(String input, Context context) throws Exception { - Arrays.asList(input.split(" ")).forEach(word -> { - String counterKey = word.toLowerCase(); - context.incrCounter(counterKey, 1); - }); - return null; - } - } - - ``` - -3. Package the Java function. - - ```bash - - mvn package - - ``` - - After the Java function is packaged, a `target` directory is created automatically. Open the `target` directory to check if there is a JAR package similar to `java-function-1.0-SNAPSHOT.jar`. - - -4. Run the Java function. - - (1) Copy the packaged jar file to the Pulsar image. - - ```bash - - docker exec -it [CONTAINER ID] /bin/bash - docker cp CONTAINER ID:/pulsar - - ``` - - (2) Run the Java function using the following command. 
- - ```bash - - ./bin/pulsar-admin functions localrun \ - --classname org.example.test.ExclamationFunction \ - --jar java-function-1.0-SNAPSHOT.jar \ - --inputs persistent://public/default/my-topic-1 \ - --output persistent://public/default/test-1 \ - --tenant public \ - --namespace default \ - --name JavaFunction - - ``` - - The following log indicates that the Java function starts successfully. - - ```text - - ... - 07:55:03.724 [main] INFO org.apache.pulsar.functions.runtime.ProcessRuntime - Started process successfully - ... - - ``` - -## Python - -Python Function supports the following three formats: - -- One python file -- ZIP file -- PIP - -### One python file - -To package a function with **one python file** in Python, complete the following steps. - -1. Write a Python function. - - ``` - - from pulsar import Function // import the Function module from Pulsar - - # The classic ExclamationFunction that appends an exclamation at the end - # of the input - class ExclamationFunction(Function): - def __init__(self): - pass - - def process(self, input, context): - return input + '!' - - ``` - - In this example, when you write a Python function, you need to inherit the Function class and implement the `process()` method. - - `process()` mainly has two parameters: - - - `input` represents your input. - - - `context` represents an interface exposed by the Pulsar Function. You can get the attributes in the Python function based on the provided context object. - -2. Install a Python client. - - The implementation of a Python function depends on the Python client, so before deploying a Python function, you need to install the corresponding version of the Python client. - - ```bash - - pip install pulsar-client==2.6.0 - - ``` - -3. Run the Python Function. - - (1) Copy the Python function file to the Pulsar image. - - ```bash - - docker exec -it [CONTAINER ID] /bin/bash - docker cp CONTAINER ID:/pulsar - - ``` - - (2) Run the Python function using the following command. 
- - ```bash - - ./bin/pulsar-admin functions localrun \ - --classname . \ - --py \ - --inputs persistent://public/default/my-topic-1 \ - --output persistent://public/default/test-1 \ - --tenant public \ - --namespace default \ - --name PythonFunction - - ``` - - The following log indicates that the Python function starts successfully. - - ```text - - ... - 07:55:03.724 [main] INFO org.apache.pulsar.functions.runtime.ProcessRuntime - Started process successfully - ... - - ``` - -### ZIP file - -To package a function with the **ZIP file** in Python, complete the following steps. - -1. Prepare the ZIP file. - - The following is required when packaging the ZIP file of the Python Function. - - ```text - - Assuming the zip file is named as `func.zip`, unzip the `func.zip` folder: - "func/src" - "func/requirements.txt" - "func/deps" - - ``` - - Take [exclamation.zip](https://github.com/apache/pulsar/tree/master/tests/docker-images/latest-version-image/python-examples) as an example. The internal structure of the example is as follows. - - ```text - - . - ├── deps - │   └── sh-1.12.14-py2.py3-none-any.whl - └── src - └── exclamation.py - - ``` - -2. Run the Python Function. - - (1) Copy the ZIP file to the Pulsar image. - - ```bash - - docker exec -it [CONTAINER ID] /bin/bash - docker cp CONTAINER ID:/pulsar - - ``` - - (2) Run the Python function using the following command. - - ```bash - - ./bin/pulsar-admin functions localrun \ - --classname exclamation \ - --py \ - --inputs persistent://public/default/in-topic \ - --output persistent://public/default/out-topic \ - --tenant public \ - --namespace default \ - --name PythonFunction - - ``` - - The following log indicates that the Python function starts successfully. - - ```text - - ... - 07:55:03.724 [main] INFO org.apache.pulsar.functions.runtime.ProcessRuntime - Started process successfully - ... - - ``` - -### PIP - -The PIP method is only supported in Kubernetes runtime. 
To package a function with **PIP** in Python, complete the following steps. - -1. Configure the `functions_worker.yml` file. - - ```text - - #### Kubernetes Runtime #### - installUserCodeDependencies: true - - ``` - -2. Write your Python Function. - - ``` - - from pulsar import Function - import js2xml - - # The classic ExclamationFunction that appends an exclamation at the end - # of the input - class ExclamationFunction(Function): - def __init__(self): - pass - - def process(self, input, context): - // add your logic - return input + '!' - - ``` - - You can introduce additional dependencies. When Python Function detects that the file currently used is `whl` and the `installUserCodeDependencies` parameter is specified, the system uses the `pip install` command to install the dependencies required in Python Function. - -3. Generate the `whl` file. - - ```shell script - - $ cd $PULSAR_HOME/pulsar-functions/scripts/python - $ chmod +x generate.sh - $ ./generate.sh - # e.g: ./generate.sh /path/to/python /path/to/python/output 1.0.0 - - ``` - - The output is written in `/path/to/python/output`: - - ```text - - -rw-r--r-- 1 root staff 1.8K 8 27 14:29 pulsarfunction-1.0.0-py2-none-any.whl - -rw-r--r-- 1 root staff 1.4K 8 27 14:29 pulsarfunction-1.0.0.tar.gz - -rw-r--r-- 1 root staff 0B 8 27 14:29 pulsarfunction.whl - - ``` - -## Go - -To package a function in Go, complete the following steps. - -1. Write a Go function. - - Currently, Go function can be **only** implemented using SDK and the interface of the function is exposed in the form of SDK. Before using the Go function, you need to import "github.com/apache/pulsar/pulsar-function-go/pf". - - ``` - - import ( - "context" - "fmt" - - "github.com/apache/pulsar/pulsar-function-go/pf" - ) - - func HandleRequest(ctx context.Context, input []byte) error { - fmt.Println(string(input) + "!") - return nil - } - - func main() { - pf.Start(HandleRequest) - } - - ``` - - You can use context to connect to the Go function. 
- - ``` - - if fc, ok := pf.FromContext(ctx); ok { - fmt.Printf("function ID is:%s, ", fc.GetFuncID()) - fmt.Printf("function version is:%s\n", fc.GetFuncVersion()) - } - - ``` - - When writing a Go function, remember that - - In `main()`, you **only** need to register the function name to `Start()`. **Only** one function name is received in `Start()`. - - Go function uses Go reflection, which is based on the received function name, to verify whether the parameter list and returned value list are correct. The parameter list and returned value list **must be** one of the following sample functions: - - ``` - - func () - func () error - func (input) error - func () (output, error) - func (input) (output, error) - func (context.Context) error - func (context.Context, input) error - func (context.Context) (output, error) - func (context.Context, input) (output, error) - - ``` - -2. Build the Go function. - - ``` - - go build .go - - ``` - -3. Run the Go Function. - - (1) Copy the Go function file to the Pulsar image. - - ```bash - - docker exec -it [CONTAINER ID] /bin/bash - docker cp CONTAINER ID:/pulsar - - ``` - - (2) Run the Go function with the following command. - - ``` - - ./bin/pulsar-admin functions localrun \ - --go [your go function path] - --inputs [input topics] \ - --output [output topic] \ - --tenant [default:public] \ - --namespace [default:default] \ - --name [custom unique go function name] - - ``` - - The following log indicates that the Go function starts successfully. - - ```text - - ... - 07:55:03.724 [main] INFO org.apache.pulsar.functions.runtime.ProcessRuntime - Started process successfully - ... - - ``` +Before running a Pulsar function, you need to start Pulsar. -## Start Functions in cluster mode -If you want to start a function in cluster mode, replace `localrun` with `create` in the commands above. The following log indicates that your function starts successfully. 
+You can [run a standalone Pulsar in Docker](getting-started-docker), or [run Pulsar in Kubernetes](getting-started-helm). To check whether the Docker image starts, you can use the `docker ps` command. - ```text - - "Created successfully" - - ``` +## Package functions by programming language -For information about parameters on `--classname`, `--jar`, `--py`, `--go`, `--inputs`, run the command `./bin/pulsar-admin functions` or see [here](/tools/pulsar-admin/). \ No newline at end of file +* [Package Java functions](functions-package-java.md) +* [Package Python functions](functions-package-python.md) +* [Package Go functions](functions-package-go.md) \ No newline at end of file diff --git a/site2/docs/functions-quickstart.md b/site2/docs/functions-quickstart.md index 722d5bca2f659..93d655468786a 100644 --- a/site2/docs/functions-quickstart.md +++ b/site2/docs/functions-quickstart.md @@ -1,5 +1,675 @@ --- id: functions-quickstart title: Getting started with Pulsar Functions -sidebar_label: "Getting started" ---- \ No newline at end of file +sidebar_label: "Get started" +--- + +This hands-on tutorial provides step-by-step instructions and examples on how to create and validate functions in a [standalone Pulsar](getting-started-standalone.md), including stateful functions and window functions. + +## Prerequisites + +- JDK 8+. For more details, refer to [Pulsar runtime Java version recommendation](https://github.com/apache/pulsar#pulsar-runtime-java-version-recommendation). +- Windows OS is not supported. + +## Start standalone Pulsar + +1. Start Pulsar locally. + + ```bash + + bin/pulsar standalone + + ``` + + All the components (including ZooKeeper, BookKeeper, broker, and so on) of a Pulsar service start in order. You can use the `bin/pulsar-admin brokers healthcheck` command to make sure the Pulsar service is up and running. + +2. Check the Pulsar binary protocol port. + + ```bash + + telnet localhost 6650 + + ``` + +3. Check the Pulsar Function cluster. 
+ + ```bash + + bin/pulsar-admin functions-worker get-cluster + + ``` + + **Output** + + ```json + + [{"workerId":"c-standalone-fw-localhost-6750","workerHostname":"localhost","port":6750}] + + ``` + +4. Make sure a public tenant exists. + + ```bash + + bin/pulsar-admin tenants list + + ``` + + **Output** + + ```json + + "public" + + ``` + +5. Make sure a default namespace exists. + + ```bash + + bin/pulsar-admin namespaces list public + + ``` + + **Output** + + ```json + + "public/default" + + ``` + +6. Make sure the table service is enabled successfully. + + ```bash + + telnet localhost 4181 + + ``` + + **Output** + + ```text + + Trying ::1... + telnet: connect to address ::1: Connection refused + Trying 127.0.0.1... + Connected to localhost. + Escape character is '^]'. + + ``` + +## Start functions + +:::note + +Before starting functions, you need to [start Pulsar](#start-standalone-pulsar). + +::: + +1. Create a tenant and a namespace. + + ```bash + + bin/pulsar-admin tenants create test + bin/pulsar-admin namespaces create test/test-namespace + + ``` + +2. In the same terminal window as step 1, verify the tenant and the namespace. + + ```bash + + bin/pulsar-admin namespaces list test + + ``` + + **Output** + + This output shows that both tenant and namespace are created successfully. + + ```text + + "test/test-namespace" + + ``` + +3. In the same terminal window as step 1, create a function named `example`. + + :::tip + + You can see both the `example-function-config.yaml` and `api-examples.jar` files under the `examples` folder of the Pulsar directory on your local machine. + + ::: + + ```bash + + bin/pulsar-admin functions create \ + --function-config-file examples/example-function-config.yaml \ + --jar examples/api-examples.jar + + ``` + + **Output** + + ```text + + Created Successfully + + ``` + +4. In the same terminal window as step 1, verify the function's configurations. 
+ + ```bash + + bin/pulsar-admin functions get \ + --tenant test \ + --namespace test-namespace \ + --name example + + ``` + + **Output** + + ```text + + { + "tenant": "test", + "namespace": "test-namespace", + "name": "example", + "className": "org.apache.pulsar.functions.api.examples.ExclamationFunction", + "userConfig": "{\"PublishTopic\":\"test_result\"}", + "autoAck": true, + "parallelism": 1, + "source": { + "topicsToSerDeClassName": { + "test_src": "" + }, + "typeClassName": "java.lang.String" + }, + "sink": { + "topic": "test_result", + "typeClassName": "java.lang.String" + }, + "resources": {} + } + + ``` + +5. In the same terminal window as step 1, verify the function's status. + + ```bash + + bin/pulsar-admin functions status \ + --tenant test \ + --namespace test-namespace \ + --name example + + ``` + + **Output** + + `"running": true` shows that the function is running. + + ```text + + { + "numInstances" : 1, + "numRunning" : 1, + "instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReceived" : 0, + "numSuccessfullyProcessed" : 0, + "numUserExceptions" : 0, + "latestUserExceptions" : [ ], + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "averageLatency" : 0.0, + "lastInvocationTime" : 0, + "workerId" : "c-standalone-fw-localhost-8080" + } + } ] + } + + ``` + +6. In the same terminal window as step 1, subscribe to the **output topic** `test_result`. + + ```bash + + bin/pulsar-client consume -s test-sub -n 0 test_result + + ``` + +7. In a new terminal window, produce messages to the **input topic** `test_src`. + + ```bash + + bin/pulsar-client produce -m "test-messages-`date`" -n 10 test_src + + ``` + +8. In the same terminal window as step 1, the messages produced by the `example` function are returned. + + **Output** + + ```text + + ----- got message ----- + test-messages-Thu Jul 19 11:59:15 PDT 2021! + ----- got message ----- + test-messages-Thu Jul 19 11:59:15 PDT 2021! 
+ ----- got message ----- + test-messages-Thu Jul 19 11:59:15 PDT 2021! + ----- got message ----- + test-messages-Thu Jul 19 11:59:15 PDT 2021! + ----- got message ----- + test-messages-Thu Jul 19 11:59:15 PDT 2021! + ----- got message ----- + test-messages-Thu Jul 19 11:59:15 PDT 2021! + ----- got message ----- + test-messages-Thu Jul 19 11:59:15 PDT 2021! + ----- got message ----- + test-messages-Thu Jul 19 11:59:15 PDT 2021! + ----- got message ----- + test-messages-Thu Jul 19 11:59:15 PDT 2021! + ----- got message ----- + test-messages-Thu Jul 19 11:59:15 PDT 2021! + + ``` + +## Start stateful functions + +The standalone mode of Pulsar enables BookKeeper table service for stateful functions. For more information, see [Configure state storage](functions-develop-state.md). + +The following example provides instructions to validate counter functions. + +:::note + +Before starting stateful functions, you need to [start Pulsar](#start-standalone-pulsar). + +::: + +1. Create a function named `word_count`. + + ```bash + + bin/pulsar-admin functions create \ + --function-config-file examples/example-function-config.yaml \ + --jar examples/api-examples.jar \ + --name word_count \ + --className org.apache.pulsar.functions.api.examples.WordCountFunction \ + --inputs test_wordcount_src \ + --output test_wordcount_dest + + ``` + + **Output** + + ```text + + Created Successfully + + ``` + +2. In the same terminal window as step 1, get the information of the `word_count` function. 
+ + ```bash + + bin/pulsar-admin functions get \ + --tenant test \ + --namespace test-namespace \ + --name word_count + + ``` + + **Output** + + ```text + + { + "tenant": "test", + "namespace": "test-namespace", + "name": "word_count", + "className": "org.apache.pulsar.functions.api.examples.WordCountFunction", + "inputSpecs": { + "test_wordcount_src": { + "isRegexPattern": false + } + }, + "output": "test_wordcount_dest", + "processingGuarantees": "ATLEAST_ONCE", + "retainOrdering": false, + "userConfig": { + "PublishTopic": "test_result" + }, + "runtime": "JAVA", + "autoAck": true, + "parallelism": 1, + "resources": { + "cpu": 1.0, + "ram": 1073741824, + "disk": 10737418240 + }, + "cleanupSubscription": true + } + + ``` + +3. In the same terminal window as step 1, get the status of the `word_count` function. + + ```bash + + bin/pulsar-admin functions status \ + --tenant test \ + --namespace test-namespace\ + --name word_count + + ``` + + **Output** + + ```text + + { + "numInstances" : 1, + "numRunning" : 1, + "instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReceived" : 0, + "numSuccessfullyProcessed" : 0, + "numUserExceptions" : 0, + "latestUserExceptions" : [ ], + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "averageLatency" : 0.0, + "lastInvocationTime" : 0, + "workerId" : "c-standalone-fw-localhost-8080" + } + } ] + } + + ``` + +4. In the same terminal window as step 1, query the state table for the function with the key `hello`. This operation watches the changes associated with `hello`. + + ```bash + + bin/pulsar-admin functions querystate \ + --tenant test \ + --namespace test-namespace \ + --name word_count -k hello -w + + ``` + + :::tip + + For more information about the `pulsar-admin functions querystate options` command, including flags, descriptions, default values, and shorthands, see [Admin API](/tools/pulsar-admin/). 
+ + ::: + + **Output** + + ```text + + key 'hello' doesn't exist. + key 'hello' doesn't exist. + key 'hello' doesn't exist. + ... + + ``` + +5. In a new terminal window, produce 10 messages with `hello` to the **input topic** `test_wordcount_src` using one of the following methods. The value of `hello` is updated to 10. + + * **Method 1** + + ```bash + + bin/pulsar-client produce -m "hello" -n 10 test_wordcount_src + + ``` + + * **Method 2** + + ```bash + + bin/pulsar-admin functions putstate \ + --tenant test \ + --namespace test-namespace \ + --name word_count hello-word + + ``` + + :::tip + + For more information about the `pulsar-admin functions putstate options` command, including flags, descriptions, default values, and shorthands, see [Admin API](/tools/pulsar-admin/). + + ::: + +6. In the same terminal window as step 1, check the result. + + The result shows that the **output topic** `test_wordcount_dest` receives the messages. + + **Output** + + ```json + + { + "key": "hello", + "numberValue": 10, + "version": 9 + } + + ``` + +7. In the same terminal window as step 5, produce another 10 messages with `hello`. The value of `hello` is updated to 20. + + ```bash + + bin/pulsar-client produce -m "hello" -n 10 test_wordcount_src + + ``` + +8. In the same terminal window as step 1, check the result. + + The result shows that the **output topic** `test_wordcount_dest` receives the value of 20. + + ```text + + value = 10 + value = 20 + + ``` + +## Start window functions + +Window functions are a special form of Pulsar Functions. For more information, see [concepts](functions-concepts.md#window-function). + +:::note + +Before starting window functions, you need to [start Pulsar](#start-standalone-pulsar). + +::: + +1. Create a tenant and a namespace. + + ```bash + + bin/pulsar-admin tenants create test + bin/pulsar-admin namespaces create test/test-namespace + + ``` + +2. In the same terminal window as step 1, verify the tenant and the namespace. 
+ + ```bash + + bin/pulsar-admin namespaces list test + + ``` + + **Output** + + This output shows that both tenant and namespace are created successfully. + + ```text + + "test/test-namespace" + + ``` + +3. In the same terminal window as step 1, create a function named `example`. + + :::tip + + You can see both `example-window-function-config.yaml` and `api-examples.jar` files under the `examples` folder of the Pulsar’s directory on your local machine. + + ::: + + ```bash + + bin/pulsar-admin functions create --function-config-file \ + examples/example-window-function-config.yaml \ + --jar examples/api-examples.jar + + ``` + + **Output** + + ```text + + Created Successfully + + ``` + +4. In the same terminal window as step 1, verify the function's configurations. + + ```bash + + bin/pulsar-admin functions get \ + --tenant test \ + --namespace test-namespace \ + --name example + + ``` + + **Output** + + ```text + + { + "tenant": "test", + "namespace": "test-namespace", + "name": "example", + "className": "org.apache.pulsar.functions.api.examples.ExclamationFunction", + "userConfig": "{\"PublishTopic\":\"test_result\"}", + "autoAck": true, + "parallelism": 1, + "source": { + "topicsToSerDeClassName": { + "test_src": "" + }, + "typeClassName": "java.lang.String" + }, + "sink": { + "topic": "test_result", + "typeClassName": "java.lang.String" + }, + "resources": {} + } + + ``` + +5. In the same terminal window as step 1, verify the function’s status. + + ```bash + + bin/pulsar-admin functions status \ + --tenant test \ + --namespace test-namespace \ + --name example + + ``` + + **Output** + + `"running": true` shows that the function is running. 
+ + ```text + + { + "numInstances" : 1, + "numRunning" : 1, + "instances" : [ { + "instanceId" : 0, + "status" : { + "running" : true, + "error" : "", + "numRestarts" : 0, + "numReceived" : 0, + "numSuccessfullyProcessed" : 0, + "numUserExceptions" : 0, + "latestUserExceptions" : [ ], + "numSystemExceptions" : 0, + "latestSystemExceptions" : [ ], + "averageLatency" : 0.0, + "lastInvocationTime" : 0, + "workerId" : "c-standalone-fw-localhost-8080" + } + } ] + } + + ``` + +6. In the same terminal window as step 1, subscribe to the **output topic** `test_result`. + + ```bash + + bin/pulsar-client consume -s test-sub -n 0 test_result + + ``` + +7. In a new terminal window, produce messages to the **input topic** `test_src`. + + ```bash + + bin/pulsar-client produce -m "test-messages-`date`" -n 10 test_src + + ``` + +8. In the same terminal window as step 1, the messages produced by the window function `example` are returned. + + **Output** + + ```text + + ----- got message ----- + test-messages-Thu Jul 19 11:59:15 PDT 2021! + ----- got message ----- + test-messages-Thu Jul 19 11:59:15 PDT 2021! + ----- got message ----- + test-messages-Thu Jul 19 11:59:15 PDT 2021! + ----- got message ----- + test-messages-Thu Jul 19 11:59:15 PDT 2021! + ----- got message ----- + test-messages-Thu Jul 19 11:59:15 PDT 2021! + ----- got message ----- + test-messages-Thu Jul 19 11:59:15 PDT 2021! + ----- got message ----- + test-messages-Thu Jul 19 11:59:15 PDT 2021! + ----- got message ----- + test-messages-Thu Jul 19 11:59:15 PDT 2021! + ----- got message ----- + test-messages-Thu Jul 19 11:59:15 PDT 2021! + ----- got message ----- + test-messages-Thu Jul 19 11:59:15 PDT 2021! 
+ + ``` \ No newline at end of file diff --git a/site2/docs/functions-runtime-java-options.md b/site2/docs/functions-runtime-java-options.md new file mode 100644 index 0000000000000..efc4f2a466641 --- /dev/null +++ b/site2/docs/functions-runtime-java-options.md @@ -0,0 +1,22 @@ +--- +id: functions-runtime-java-options +title: Customize Java runtime options +sidebar_label: "Customize Java runtime options" +--- + +:::note + +This setting **only** applies to process runtime and Kubernetes runtime. + +::: + +To pass additional arguments to the JVM command line for every process started by a function worker, you can configure the `additionalJavaRuntimeArguments` in the `conf/functions_worker.yml` file as follows. +- Add JVM flags, like `-XX:+ExitOnOutOfMemoryError` +- Pass custom system properties, like `-Dlog4j2.formatMsgNoLookups` + +```yaml + +additionalJavaRuntimeArguments: ['-XX:+ExitOnOutOfMemoryError','-Dfoo=bar'] + +``` + diff --git a/site2/docs/functions-runtime-kubernetes.md b/site2/docs/functions-runtime-kubernetes.md new file mode 100644 index 0000000000000..1a0ddd0fe44f3 --- /dev/null +++ b/site2/docs/functions-runtime-kubernetes.md @@ -0,0 +1,126 @@ +--- +id: functions-runtime-kubernetes +title: Configure Kubernetes runtime +sidebar_label: "Configure Kubernetes runtime" +--- + +The Kubernetes runtime works when a function worker generates and applies Kubernetes manifests. The manifests generated by a function worker include: +* a `StatefulSet` + + By default, the `StatefulSet` manifest has a single pod with a number of replicas. The number is determined by the [parallelism](functions-deploy-cluster-parallelism.md) of the function. The pod downloads the function payload (via the function worker REST API) on pod boot. The pod's container image is configurable if the function runtime is configured. +* a `Service` (used to communicate with the pod) +* a `Secret` for authenticating credentials (when applicable). + The Kubernetes runtime supports secrets. 
You can create a Kubernetes secret and expose it as an environment variable in the pod. + +:::tip + +For the rules of translating Pulsar object names into Kubernetes resource labels, see [instructions](admin-api-overview.md#how-to-define-pulsar-resource-names-when-running-pulsar-in-kubernetes). + +::: + +### Configure basic settings + +To quickly configure a Kubernetes runtime, you can use the default settings of [`KubernetesRuntimeFactoryConfig`](https://github.com/apache/pulsar/blob/master/pulsar-functions/runtime/src/main/java/org/apache/pulsar/functions/runtime/kubernetes/KubernetesRuntimeFactoryConfig.java) in the `conf/functions_worker.yml` file. + +If you have set up a Pulsar cluster on Kubernetes using [Helm chart](helm-install.md), which means function workers have also been set up on Kubernetes, you can use the `serviceAccount` associated with the pod where the function worker is running. Otherwise, you can configure function workers to communicate with a Kubernetes cluster by setting `k8Uri` under `functionRuntimeFactoryConfigs`. + +### Integrate Kubernetes secrets + +A [Secret](https://kubernetes.io/docs/concepts/configuration/secret/) in Kubernetes is an object that holds some confidential data such as a password, a token, or a key. When you create a secret in the Kubernetes namespace where your functions are deployed, functions can safely reference and distribute it. To enable this feature, set `secretsProviderConfiguratorClassName` to `org.apache.pulsar.functions.secretsproviderconfigurator.KubernetesSecretsProviderConfigurator` in the `conf/functions_worker.yml` file. + +For example, you [deploy a function](functions-deploy.md) to the `pulsar-func` Kubernetes namespace, and you have a secret named `database-creds` with a field name `password`, which you want to mount in the pod as an environment variable named `DATABASE_PASSWORD`. 
The following configurations enable functions to reference the secret and mount the value as an environment variable in the pod. + +```yaml + +tenant: "mytenant" +namespace: "mynamespace" +name: "myfunction" +inputs: [ "persistent://mytenant/mynamespace/myfuncinput" ] +className: "com.company.pulsar.myfunction" + +secrets: + # the secret will be mounted from the `password` field in the `database-creds` secret as an env var called `DATABASE_PASSWORD` + DATABASE_PASSWORD: + path: "database-creds" + key: "password" + +``` + +### Enable token authentication + +When you use token authentication, TLS encryption, or custom authentications to secure the communication with your Pulsar cluster, Pulsar passes your certificate authority (CA) to the client, so the client can authenticate the cluster with your signed certificate. + +To enable the authentication for your Pulsar cluster, you need to specify a mechanism for the pod running your function to authenticate with the broker, by implementing the `org.apache.pulsar.functions.auth.KubernetesFunctionAuthProvider` interface. + +* For token authentication, Pulsar includes an implementation of the above interface to distribute the CA. The function worker captures the token that deploys (or updates) the function, saves it as a secret, and mounts it into the pod. + + The configuration in the `conf/functions_worker.yml` file is as follows. `functionAuthProviderClassName` is used to specify the path to this implementation. + + ```yaml + + functionAuthProviderClassName: org.apache.pulsar.functions.auth.KubernetesSecretsTokenAuthProvider + + ``` + +* For TLS or custom authentication, you can either implement the `org.apache.pulsar.functions.auth.KubernetesFunctionAuthProvider` interface or use an alternative mechanism. + +:::note + +If the token you use to deploy the function has an expiration date, you may need to deploy the function again after it expires. 
+ +::: + +### Customize Kubernetes runtime + +Customizing Kubernetes runtime allows you to customize Kubernetes resources created by the runtime, including how to generate manifests, how to pass authenticated data to pods, and how to integrate secrets. + +To customize Kubernetes runtime, you can set `runtimeCustomizerClassName` in the `conf/functions_worker.yml` file and use the fully qualified class name. + +The function API provides a flag named `customRuntimeOptions`, which is passed to the `org.apache.pulsar.functions.runtime.kubernetes.KubernetesManifestCustomizer` interface. To initialize `KubernetesManifestCustomizer`, you can set `runtimeCustomizerConfig` in the `conf/functions_worker.yml` file. + +:::note + +`runtimeCustomizerConfig` is the same across all functions. If you provide both `runtimeCustomizerConfig` and `customRuntimeOptions`, you need to decide how to manage these two configurations in your implementation of the `KubernetesManifestCustomizer` interface. + +::: + +Pulsar includes a built-in implementation initialized with `runtimeCustomizerConfig`. It enables you to pass a JSON document as `customRuntimeOptions` with certain properties to augment. To use this built-in implementation, set `runtimeCustomizerClassName` to `org.apache.pulsar.functions.runtime.kubernetes.BasicKubernetesManifestCustomizer`. + +If both `runtimeCustomizerConfig` and `customRuntimeOptions` are provided and have conflicts, `BasicKubernetesManifestCustomizer` uses `customRuntimeOptions` to override `runtimeCustomizerConfig`. + +Below is an example of configuring `customRuntimeOptions`. 
+ +```json + +{ + "jobName": "jobname", // the k8s pod name to run this function instance + "jobNamespace": "namespace", // the k8s namespace to run this function in + "extraLabels": { // extra labels to attach to the statefulSet, service, and pods + "extraLabel": "value" + }, + "extraAnnotations": { // extra annotations to attach to the statefulSet, service, and pods + "extraAnnotation": "value" + }, + "nodeSelectorLabels": { // node selector labels to add on to the pod spec + "customLabel": "value" + }, + "tolerations": [ // tolerations to add to the pod spec + { + "key": "custom-key", + "value": "value", + "effect": "NoSchedule" + } + ], + "resourceRequirements": { // values for cpu and memory should be defined as described here: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container + "requests": { + "cpu": 1, + "memory": "4G" + }, + "limits": { + "cpu": 2, + "memory": "8G" + } + } +} + +``` diff --git a/site2/docs/functions-runtime-process.md b/site2/docs/functions-runtime-process.md new file mode 100644 index 0000000000000..2082c89f3a25b --- /dev/null +++ b/site2/docs/functions-runtime-process.md @@ -0,0 +1,24 @@ +--- +id: functions-runtime-process +title: Configure process runtime +sidebar_label: "Configure process runtime" +--- + +You can use the default configurations of process runtime in the `conf/functions_worker.yml` file. + +If you want to customize more parameters, refer to the following example. 
+ +```yaml + +functionRuntimeFactoryClassName: org.apache.pulsar.functions.runtime.process.ProcessRuntimeFactory +functionRuntimeFactoryConfigs: + # the directory for storing the function logs + logDirectory: + # change the jar location only when you put the java instance jar in a different location + javaInstanceJarLocation: + # change the python instance location only when you put the python instance jar in a different location + pythonInstanceLocation: + # change the extra dependencies location: + extraFunctionDependenciesDir: + +``` diff --git a/site2/docs/functions-runtime-thread.md b/site2/docs/functions-runtime-thread.md new file mode 100644 index 0000000000000..dce8a7fbfa2e4 --- /dev/null +++ b/site2/docs/functions-runtime-thread.md @@ -0,0 +1,34 @@ +--- +id: functions-runtime-thread +title: Configure thread runtime +sidebar_label: "Configure thread runtime" +--- + +You can use the default configurations of thread runtime in the `conf/functions_worker.yml` file. If you want to customize parameters, such as thread group name, refer to the following example. + +```yaml + +functionRuntimeFactoryClassName: org.apache.pulsar.functions.runtime.thread.ThreadRuntimeFactory +functionRuntimeFactoryConfigs: + threadGroupName: "Your Function Container Group" + +``` + +To set the client memory limit for thread runtime, you can configure `pulsarClientMemoryLimit`. + +```yaml + +functionRuntimeFactoryConfigs: +# pulsarClientMemoryLimit +# # the max memory in bytes the pulsar client can use +# absoluteValue: +# # the max memory the pulsar client can use as a percentage of max direct memory set for JVM +# percentOfMaxDirectMemory: + +``` + +:::note + +If `absoluteValue` and `percentOfMaxDirectMemory` are both set, the smaller value is used. 
+ +::: \ No newline at end of file diff --git a/site2/docs/functions-runtime.md b/site2/docs/functions-runtime.md index 66e694a9e5e83..67739377e7d86 100644 --- a/site2/docs/functions-runtime.md +++ b/site2/docs/functions-runtime.md @@ -1,405 +1,21 @@ --- id: functions-runtime -title: Configure Functions runtime -sidebar_label: "Setup: Configure Functions runtime" +title: Configure function runtime +sidebar_label: "Configure function runtime" --- -You can use the following methods to run functions. +Pulsar supports three types of [function runtime](functions-concepts.md#function-runtime) with different costs and isolation guarantees to maximize deployment flexibility of your functions. -- *Thread*: Invoke functions threads in functions worker. -- *Process*: Invoke functions in processes forked by functions worker. -- *Kubernetes*: Submit functions as Kubernetes StatefulSets by functions worker. +The following table outlines the supported programming languages for each type of function runtime. -:::note - -Pulsar supports adding labels to the Kubernetes StatefulSets and services while launching functions, which facilitates selecting the target Kubernetes objects. - -::: - -The differences of the thread and process modes are: -- Thread mode: when a function runs in thread mode, it runs on the same Java virtual machine (JVM) with functions worker. -- Process mode: when a function runs in process mode, it runs on the same machine that functions worker runs. - -## Configure thread runtime -It is easy to configure *Thread* runtime. In most cases, you do not need to configure anything. You can customize the thread group name with the following settings: - -```yaml - -functionRuntimeFactoryClassName: org.apache.pulsar.functions.runtime.thread.ThreadRuntimeFactory -functionRuntimeFactoryConfigs: - threadGroupName: "Your Function Container Group" - -``` - -*Thread* runtime is only supported in Java function. 
- -## Configure process runtime -When you enable *Process* runtime, you do not need to configure anything. - -```yaml - -functionRuntimeFactoryClassName: org.apache.pulsar.functions.runtime.process.ProcessRuntimeFactory -functionRuntimeFactoryConfigs: - # the directory for storing the function logs - logDirectory: - # change the jar location only when you put the java instance jar in a different location - javaInstanceJarLocation: - # change the python instance location only when you put the python instance jar in a different location - pythonInstanceLocation: - # change the extra dependencies location: - extraFunctionDependenciesDir: - -``` - -*Process* runtime is supported in Java, Python, and Go functions. - -## Configure Kubernetes runtime - -When the functions worker generates Kubernetes manifests and apply the manifests, the Kubernetes runtime works. If you have run functions worker on Kubernetes, you can use the `serviceAccount` associated with the pod that the functions worker is running in. Otherwise, you can configure it to communicate with a Kubernetes cluster. - -The manifests, generated by the functions worker, include a `StatefulSet`, a `Service` (used to communicate with the pods), and a `Secret` for auth credentials (when applicable). The `StatefulSet` manifest (by default) has a single pod, with the number of replicas determined by the "parallelism" of the function. On pod boot, the pod downloads the function payload (via the functions worker REST API). The pod's container image is configurable, but must have the functions runtime. - -The Kubernetes runtime supports secrets, so you can create a Kubernetes secret and expose it as an environment variable in the pod. The Kubernetes runtime is extensible, you can implement classes and customize the way how to generate Kubernetes manifests, how to pass auth data to pods, and how to integrate secrets. 
- -:::tip - -For the rules of translating Pulsar object names into Kubernetes resource labels, see [here](admin-api-overview.md#how-to-define-pulsar-resource-names-when-running-pulsar-in-kubernetes). - -::: - -### Basic configuration - -It is easy to configure Kubernetes runtime. You can just uncomment the settings of `kubernetesContainerFactory` in the `functions_worker.yaml` file. The following is an example. - -```yaml - -functionRuntimeFactoryClassName: org.apache.pulsar.functions.runtime.kubernetes.KubernetesRuntimeFactory -functionRuntimeFactoryConfigs: - # uri to kubernetes cluster, leave it to empty and it will use the kubernetes settings in function worker - k8Uri: - # the kubernetes namespace to run the function instances. it is `default`, if this setting is left to be empty - jobNamespace: - # The Kubernetes pod name to run the function instances. It is set to - # `pf----` if this setting is left to be empty - jobName: - # the docker image to run function instance. by default it is `apachepulsar/pulsar` - pulsarDockerImageName: - # the docker image to run function instance according to different configurations provided by users. - # By default it is `apachepulsar/pulsar`. - # e.g: - # functionDockerImages: - # JAVA: JAVA_IMAGE_NAME - # PYTHON: PYTHON_IMAGE_NAME - # GO: GO_IMAGE_NAME - functionDockerImages: - # "The image pull policy for image used to run function instance. By default it is `IfNotPresent` - imagePullPolicy: IfNotPresent - # the root directory of pulsar home directory in `pulsarDockerImageName`. by default it is `/pulsar`. - # if you are using your own built image in `pulsarDockerImageName`, you need to set this setting accordingly - pulsarRootDir: - # The config admin CLI allows users to customize the configuration of the admin cli tool, such as: - # `/bin/pulsar-admin and /bin/pulsarctl`. By default it is `/bin/pulsar-admin`. 
If you want to use `pulsarctl` - # you need to set this setting accordingly - configAdminCLI: - # this setting only takes effects if `k8Uri` is set to null. if your function worker is running as a k8 pod, - # setting this to true is let function worker to submit functions to the same k8s cluster as function worker - # is running. setting this to false if your function worker is not running as a k8 pod. - submittingInsidePod: false - # setting the pulsar service url that pulsar function should use to connect to pulsar - # if it is not set, it will use the pulsar service url configured in worker service - pulsarServiceUrl: - # setting the pulsar admin url that pulsar function should use to connect to pulsar - # if it is not set, it will use the pulsar admin url configured in worker service - pulsarAdminUrl: - # The flag indicates to install user code dependencies. (applied to python package) - installUserCodeDependencies: - # The repository that pulsar functions use to download python dependencies - pythonDependencyRepository: - # The repository that pulsar functions use to download extra python dependencies - pythonExtraDependencyRepository: - # the custom labels that function worker uses to select the nodes for pods - customLabels: - # The expected metrics collection interval, in seconds - expectedMetricsCollectionInterval: 30 - # Kubernetes Runtime will periodically checkback on - # this configMap if defined and if there are any changes - # to the kubernetes specific stuff, we apply those changes - changeConfigMap: - # The namespace for storing change config map - changeConfigMapNamespace: - # The ratio cpu request and cpu limit to be set for a function/source/sink. - # The formula for cpu request is cpuRequest = userRequestCpu / cpuOverCommitRatio - cpuOverCommitRatio: 1.0 - # The ratio memory request and memory limit to be set for a function/source/sink. 
- # The formula for memory request is memoryRequest = userRequestMemory / memoryOverCommitRatio - memoryOverCommitRatio: 1.0 - # The port inside the function pod which is used by the worker to communicate with the pod - grpcPort: 9093 - # The port inside the function pod on which prometheus metrics are exposed - metricsPort: 9094 - # The directory inside the function pod where nar packages will be extracted - narExtractionDirectory: - # The classpath where function instance files stored - functionInstanceClassPath: - # Upload the builtin sources/sinks to BookKeeper. - # True by default. - uploadBuiltinSinksSources: true - # the directory for dropping extra function dependencies - # if it is not an absolute path, it is relative to `pulsarRootDir` - extraFunctionDependenciesDir: - # Additional memory padding added on top of the memory requested by the function per on a per instance basis - percentMemoryPadding: 10 - # The duration (in seconds) before the StatefulSet is deleted after a function stops or restarts. - # Value must be a non-negative integer. 0 indicates the StatefulSet is deleted immediately. - # Default is 5 seconds. - gracePeriodSeconds: 5 - -``` - -If you run functions worker embedded in a broker on Kubernetes, you can use the default settings. - -### Run standalone functions worker on Kubernetes - -If you run functions worker standalone (that is, not embedded) on Kubernetes, you need to configure `pulsarSerivceUrl` to be the URL of the broker and `pulsarAdminUrl` as the URL to the functions worker. - -For example, both Pulsar brokers and Function Workers run in the `pulsar` K8S namespace. The brokers have a service called `brokers` and the functions worker has a service called `func-worker`. 
The settings are as follows: - -```yaml - -pulsarServiceUrl: pulsar://broker.pulsar:6650 // or pulsar+ssl://broker.pulsar:6651 if using TLS -pulsarAdminUrl: http://func-worker.pulsar:8080 // or https://func-worker:8443 if using TLS - -``` - -### Run RBAC in Kubernetes clusters - -If you run RBAC in your Kubernetes cluster, make sure that the service account you use for running functions workers (or brokers, if functions workers run along with brokers) have permissions on the following Kubernetes APIs. - -- services -- configmaps -- pods -- apps.statefulsets - -The following is sufficient: - -```yaml - -apiVersion: rbac.authorization.k8s.io/v1beta1 -kind: ClusterRole -metadata: - name: functions-worker -rules: -- apiGroups: [""] - resources: - - services - - configmaps - - pods - verbs: - - '*' -- apiGroups: - - apps - resources: - - statefulsets - verbs: - - '*' ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: functions-worker ---- -apiVersion: rbac.authorization.k8s.io/v1beta1 -kind: ClusterRoleBinding -metadata: - name: functions-worker -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: functions-worker -subjectsKubernetesSec: -- kind: ServiceAccount - name: functions-worker - -``` - -If the service-account is not properly configured, an error message similar to this is displayed: - -```bash - -22:04:27.696 [Timer-0] ERROR org.apache.pulsar.functions.runtime.KubernetesRuntimeFactory - Error while trying to fetch configmap example-pulsar-4qvmb5gur3c6fc9dih0x1xn8b-function-worker-config at namespace pulsar -io.kubernetes.client.ApiException: Forbidden - at io.kubernetes.client.ApiClient.handleResponse(ApiClient.java:882) ~[io.kubernetes-client-java-2.0.0.jar:?] - at io.kubernetes.client.ApiClient.execute(ApiClient.java:798) ~[io.kubernetes-client-java-2.0.0.jar:?] - at io.kubernetes.client.apis.CoreV1Api.readNamespacedConfigMapWithHttpInfo(CoreV1Api.java:23673) ~[io.kubernetes-client-java-api-2.0.0.jar:?] 
- at io.kubernetes.client.apis.CoreV1Api.readNamespacedConfigMap(CoreV1Api.java:23655) ~[io.kubernetes-client-java-api-2.0.0.jar:?] - at org.apache.pulsar.functions.runtime.KubernetesRuntimeFactory.fetchConfigMap(KubernetesRuntimeFactory.java:284) [org.apache.pulsar-pulsar-functions-runtime-2.4.0-42c3bf949.jar:2.4.0-42c3bf949] - at org.apache.pulsar.functions.runtime.KubernetesRuntimeFactory$1.run(KubernetesRuntimeFactory.java:275) [org.apache.pulsar-pulsar-functions-runtime-2.4.0-42c3bf949.jar:2.4.0-42c3bf949] - at java.util.TimerThread.mainLoop(Timer.java:555) [?:1.8.0_212] - at java.util.TimerThread.run(Timer.java:505) [?:1.8.0_212] - -``` - -### Integrate Kubernetes secrets - -In order to safely distribute secrets, Pulasr Functions can reference Kubernetes secrets. To enable this, set the `secretsProviderConfiguratorClassName` to `org.apache.pulsar.functions.secretsproviderconfigurator.KubernetesSecretsProviderConfigurator`. - -You can create a secret in the namespace where your functions are deployed. For example, you deploy functions to the `pulsar-func` Kubernetes namespace, and you have a secret named `database-creds` with a field name `password`, which you want to mount in the pod as an environment variable called `DATABASE_PASSWORD`. The following functions configuration enables you to reference that secret and mount the value as an environment variable in the pod. - -```yaml - -tenant: "mytenant" -namespace: "mynamespace" -name: "myfunction" -inputs: [ "persistent://mytenant/mynamespace/myfuncinput" ] -className: "com.company.pulsar.myfunction" - -secrets: - # the secret will be mounted from the `password` field in the `database-creds` secret as an env var called `DATABASE_PASSWORD` - DATABASE_PASSWORD: - path: "database-creds" - key: "password" - -``` - -### Enable token authentication - -When you enable authentication for your Pulsar cluster, you need a mechanism for the pod running your function to authenticate with the broker. 
- -The `org.apache.pulsar.functions.auth.KubernetesFunctionAuthProvider` interface provides support for any authentication mechanism. The `functionAuthProviderClassName` in `function-worker.yml` is used to specify your path to this implementation. - -Pulsar includes an implementation of this interface for token authentication, and distributes the certificate authority via the same implementation. The configuration is similar as follows: - -```yaml - -functionAuthProviderClassName: org.apache.pulsar.functions.auth.KubernetesSecretsTokenAuthProvider - -``` - -For token authentication, the functions worker captures the token that is used to deploy (or update) the function. The token is saved as a secret and mounted into the pod. - -For custom authentication or TLS, you need to implement this interface or use an alternative mechanism to provide authentication. If you use token authentication and TLS encryption to secure the communication with the cluster, Pulsar passes your certificate authority (CA) to the client, so the client obtains what it needs to authenticate the cluster, and trusts the cluster with your signed certificate. +| Function runtime | Supported programming languages of functions | +|----------------------------------------------------|----------------------------------------------| +| [Thread runtime](functions-runtime-thread) | Java | +| [Process runtime](functions-runtime-process) | Java, Python, Go | +| [Kubernetes runtime](functions-runtime-kubernetes) | Java, Python, Go | :::note -If you use tokens that expire when deploying functions, these tokens will expire. +For the runtime Java version, refer to [Pulsar Runtime Java Version Recommendation](https://github.com/apache/pulsar/blob/master/README.md#pulsar-runtime-java-version-recommendation) according to your target Pulsar version. 
::: - -### Run clusters with authentication - -When you run a functions worker in a standalone process (that is, not embedded in the broker) in a cluster with authentication, you must configure your functions worker to interact with the broker and authenticate incoming requests. So you need to configure properties that the broker requires for authentication or authorization. - -For example, if you use token authentication, you need to configure the following properties in the `function-worker.yml` file. - -```yaml - -clientAuthenticationPlugin: org.apache.pulsar.client.impl.auth.AuthenticationToken -clientAuthenticationParameters: file:///etc/pulsar/token/admin-token.txt -configurationMetadataStoreUrl: zk:zookeeper-cluster:2181 # auth requires a connection to zookeeper -authenticationProviders: - - "org.apache.pulsar.broker.authentication.AuthenticationProviderToken" -authorizationEnabled: true -authenticationEnabled: true -superUserRoles: - - superuser - - proxy -properties: - tokenSecretKey: file:///etc/pulsar/jwt/secret # if using a secret token, key file must be DER-encoded - tokenPublicKey: file:///etc/pulsar/jwt/public.key # if using public/private key tokens, key file must be DER-encoded - -``` - -:::note - -You must configure both the Function Worker authorization or authentication for the server to authenticate requests and configure the client to be authenticated to communicate with the broker. - -::: - -### Customize Kubernetes runtime - -The Kubernetes integration enables you to implement a class and customize how to generate manifests. You can configure it by setting `runtimeCustomizerClassName` in the `functions-worker.yml` file and use the fully qualified class name. You must implement the `org.apache.pulsar.functions.runtime.kubernetes.KubernetesManifestCustomizer` interface. - -The functions (and sinks/sources) API provides a flag, `customRuntimeOptions`, which is passed to this interface. 
- -To initialize the `KubernetesManifestCustomizer`, you can provide `runtimeCustomizerConfig` in the `functions-worker.yml` file. `runtimeCustomizerConfig` is passed to the `public void initialize(Map config)` function of the interface. `runtimeCustomizerConfig`is different from the `customRuntimeOptions` as `runtimeCustomizerConfig` is the same across all functions. If you provide both `runtimeCustomizerConfig` and `customRuntimeOptions`, you need to decide how to manage these two configurations in your implementation of `KubernetesManifestCustomizer`. - -Pulsar includes a built-in implementation. To use the basic implementation, set `runtimeCustomizerClassName` to `org.apache.pulsar.functions.runtime.kubernetes.BasicKubernetesManifestCustomizer`. The built-in implementation initialized with `runtimeCustomizerConfig` enables you to pass a JSON document as `customRuntimeOptions` with certain properties to augment, which decides how the manifests are generated. If both `runtimeCustomizerConfig` and `customRuntimeOptions` are provided, `BasicKubernetesManifestCustomizer` uses `customRuntimeOptions` to override the configuration if there are conflicts in these two configurations. - -Below is an example of `customRuntimeOptions`. 
- -```json - -{ - "jobName": "jobname", // the k8s pod name to run this function instance - "jobNamespace": "namespace", // the k8s namespace to run this function in - "extractLabels": { // extra labels to attach to the statefulSet, service, and pods - "extraLabel": "value" - }, - "extraAnnotations": { // extra annotations to attach to the statefulSet, service, and pods - "extraAnnotation": "value" - }, - "nodeSelectorLabels": { // node selector labels to add on to the pod spec - "customLabel": "value" - }, - "tolerations": [ // tolerations to add to the pod spec - { - "key": "custom-key", - "value": "value", - "effect": "NoSchedule" - } - ], - "resourceRequirements": { // values for cpu and memory should be defined as described here: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container - "requests": { - "cpu": 1, - "memory": "4G" - }, - "limits": { - "cpu": 2, - "memory": "8G" - } - } -} - -``` - -## Run clusters with geo-replication - -If you run multiple clusters tied together with geo-replication, it is important to use a different function namespace for each cluster. Otherwise, the function shares a namespace and potentially schedule across clusters. - -For example, if you have two clusters: `east-1` and `west-1`, you can configure the functions workers for `east-1` and `west-1` perspectively as follows. - -```yaml - -pulsarFunctionsCluster: east-1 -pulsarFunctionsNamespace: public/functions-east-1 - -``` - -```yaml - -pulsarFunctionsCluster: west-1 -pulsarFunctionsNamespace: public/functions-west-1 - -``` - -This ensures the two different Functions Workers use distinct sets of topics for their internal coordination. - -## Configure standalone functions worker - -When configuring a standalone functions worker, you need to configure properties that the broker requires, especially if you use TLS. And then Functions Worker can communicate with the broker. - -You need to configure the following required properties. 
- -```yaml - -workerPort: 8080 -workerPortTls: 8443 # when using TLS -tlsCertificateFilePath: /etc/pulsar/tls/tls.crt # when using TLS -tlsKeyFilePath: /etc/pulsar/tls/tls.key # when using TLS -tlsTrustCertsFilePath: /etc/pulsar/tls/ca.crt # when using TLS -pulsarServiceUrl: pulsar://broker.pulsar:6650/ # or pulsar+ssl://pulsar-prod-broker.pulsar:6651/ when using TLS -pulsarWebServiceUrl: http://broker.pulsar:8080/ # or https://pulsar-prod-broker.pulsar:8443/ when using TLS -useTls: true # when using TLS, critical! - -``` - diff --git a/site2/docs/functions-worker-corun.md b/site2/docs/functions-worker-corun.md new file mode 100644 index 0000000000000..68027f231b66c --- /dev/null +++ b/site2/docs/functions-worker-corun.md @@ -0,0 +1,67 @@ +--- +id: functions-worker-corun +title: Run function workers with brokers +sidebar_label: "Run function workers with brokers" +--- + +The following diagram illustrates the deployment of function workers running along with brokers. + +![assets/functions-worker-corun.svg](/assets/function-workers-corun.svg) + +:::note + +The `Service URLs` in the illustration represent Pulsar service URLs that Pulsar client and Pulsar admin use to connect to a Pulsar cluster. + +::: + +To set up function workers to run with brokers, complete the following steps: +1. [Enable function workers](#enable-function-workers-to-run-with-brokers) +2. [Configure function workers](#configure-function-workers-to-run-with-brokers) +3. [Start function workers](#start-function-workers-to-run-with-brokers) + + +### Enable function workers to run with brokers + +In the `conf/broker.conf` file, set `functionsWorkerEnabled` to `true`. + +```conf + +functionsWorkerEnabled=true + +``` + +### Configure function workers to run with brokers + +In the `run-with-brokers` mode, most settings of function workers are inherited from your broker configuration (for example, configuration store settings, authentication settings, and so on). 
You can customize other worker settings by configuring the `conf/functions_worker.yml` file based on your needs. + +:::tip + +- To ensure high availability in a production deployment (a cluster with multiple brokers), set `numFunctionPackageReplicas` to equal the number of bookies. The default value `1` is only for one-node cluster deployment. +- To initialize distributed log metadata in runtime (`initializedDlogMetadata` = `true`), ensure that it has been initialized by the `bin/pulsar initialize-cluster-metadata` command. + +::: + +When authentication is enabled on the BookKeeper cluster, you need to configure the following authentication settings for your function workers. +- `bookkeeperClientAuthenticationPlugin`: the authentication plugin name of BookKeeper client. +- `bookkeeperClientAuthenticationParametersName`: the authentication plugin parameters of BookKeeper client, including names and values. +- `bookkeeperClientAuthenticationParameters`: the authentication plugin parameters of BookKeeper client. + +### Start function workers to run with brokers + +Once function workers are configured properly, you can start the brokers (function workers are running with the brokers). + +To verify whether each worker is running or not, you can use the following command. + +```bash + +curl :8080/admin/v2/worker/cluster + +``` + +If a list of active function workers is returned, it means they have been started successfully. The output is similar to the following. 
+ +```json + +[{"workerId":"","workerHostname":"","port":8080}] + +``` \ No newline at end of file diff --git a/site2/docs/functions-worker-for-geo-replication.md b/site2/docs/functions-worker-for-geo-replication.md new file mode 100644 index 0000000000000..60a8d7b929dcb --- /dev/null +++ b/site2/docs/functions-worker-for-geo-replication.md @@ -0,0 +1,23 @@ +--- +id: functions-worker-for-geo-replication +title: Configure function workers for geo-replicated clusters +sidebar_label: "Configure function workers for geo-replicated clusters" +--- + +When running multiple clusters tied together with [geo replication](concepts-replication.md), you need to use a different function namespace for each cluster. Otherwise, all functions share one namespace and potentially schedule assignments across clusters. + +For example, if you have two clusters: `east-1` and `west-1`, you can configure the function workers for `east-1` and `west-1` respectively in the `conf/functions_worker.yml` file. This ensures the two different function workers use distinct sets of topics for their internal coordination. + +```yaml + +pulsarFunctionsCluster: east-1 +pulsarFunctionsNamespace: public/functions-east-1 + +``` + +```yaml + +pulsarFunctionsCluster: west-1 +pulsarFunctionsNamespace: public/functions-west-1 + +``` diff --git a/site2/docs/functions-worker-run-separately.md b/site2/docs/functions-worker-run-separately.md new file mode 100644 index 0000000000000..a75ceba274dfa --- /dev/null +++ b/site2/docs/functions-worker-run-separately.md @@ -0,0 +1,251 @@ +--- +id: functions-worker-run-separately +title: Run function workers separately +sidebar_label: "Run function workers separately" +--- + +The following diagram illustrates how function workers run as a separate process in separate machines. 
+ +![assets/functions-worker-separated.svg](/assets/function-workers-separated.svg) + +:::note + +The `Service URLs` in the illustration represent Pulsar service URLs that Pulsar client and Pulsar admin use to connect to a Pulsar cluster. + +::: + +To set up function workers that run separately, complete the following steps: +1. [Configure function workers](#configure-function-workers-to-run-separately) +2. [Start function workers](#start-function-workers) +3. [Configure proxies for function workers](#configure-proxies-for-standalone-function-workers) + +## Configure function workers to run separately + +:::note + +To run function workers separately, you need to keep `functionsWorkerEnabled` as its default value (`false`) in the `conf/broker.conf` file. + +::: + +### Configure worker parameters + +Configure the required parameters for workers in the `conf/functions_worker.yml` file. +- `workerId`: The identity of a worker node, which is unique across clusters. The type is string. +- `workerHostname`: The hostname of the worker node. +- `workerPort`: The port that the worker server listens on. Keep it as default if you don't customize it. Set it to `null` to disable the plaintext port. +- `workerPortTls`: The TLS port that the worker server listens on. Keep it as default if you don't customize it. For more information about TLS encryption settings, refer to [settings](#enable-tls-encryption). + +:::note + +When accessing function workers to manage functions, the `pulsar-admin` CLI or any of the clients should use the configured `workerHostname` and `workerPort` to generate an `--admin-url`. + +::: + +### Configure function package parameters + +Configure the `numFunctionPackageReplicas` parameter in the `conf/functions_worker.yml` file. It indicates the number of replicas to store function packages. + +:::note + +To ensure high availability in a production deployment, set `numFunctionPackageReplicas` to equal the number of bookies. 
The default value `1` is only for one-node cluster deployment. + +::: + +### Configure function metadata parameters + +Configure the required parameters for function metadata in the `conf/functions_worker.yml` file. +- `pulsarServiceUrl`: The Pulsar service URL for your broker cluster. +- `pulsarWebServiceUrl`: The Pulsar web service URL for your broker cluster. +- `pulsarFunctionsCluster`: Set the value to your Pulsar cluster name (same as the `clusterName` setting in the `conf/broker.conf` file). + +If authentication is enabled on your broker cluster, you must configure the following authentication settings for the function workers to communicate with the brokers. +- `brokerClientAuthenticationEnabled`: Whether to enable the broker client authentication used by function workers to talk to brokers. +- `clientAuthenticationPlugin`: The authentication plugin to be used by the Pulsar client used in worker service. +- `clientAuthenticationParameters`: The authentication parameter to be used by the Pulsar client used in worker service. + +### Enable security settings + +When you run a function worker separately in a cluster configured with authentication, your function worker needs to communicate with the broker and authenticate incoming requests. Thus you need to configure the properties that the broker requires for authentication and authorization. + +:::note + +You must configure both the function worker authentication and authorization for the server to authenticate incoming requests and configure the client to be authenticated to communicate with the broker. + +::: + +For example, if you use token authentication, you need to configure the following properties in the `conf/functions_worker.yml` file.
+ +```yaml + +brokerClientAuthenticationPlugin: org.apache.pulsar.client.impl.auth.AuthenticationToken +brokerClientAuthenticationParameters: file:///etc/pulsar/token/admin-token.txt +configurationMetadataStoreUrl: zk:zookeeper-cluster:2181 # auth requires a connection to zookeeper +authenticationProviders: + - "org.apache.pulsar.broker.authentication.AuthenticationProviderToken" +authorizationEnabled: true +authenticationEnabled: true +superUserRoles: + - superuser + - proxy +properties: + tokenSecretKey: file:///etc/pulsar/jwt/secret # if using a secret token, key file must be DER-encoded + tokenPublicKey: file:///etc/pulsar/jwt/public.key # if using public/private key tokens, key file must be DER-encoded + +``` + +You can enable the following security settings on function workers. +- [Enable TLS encryption](#enable-tls-encryption) +- [Enable authentication providers](#enable-authentication-providers) +- [Enable authorization providers](#enable-authorization-providers) +- [Enable end-to-end encryption](functions-deploy-cluster-encryption.md) + + +#### Enable TLS encryption + +To enable TLS encryption, configure the following settings. + +```yaml + +useTLS: true +pulsarServiceUrl: pulsar+ssl://localhost:6651/ +pulsarWebServiceUrl: https://localhost:8443 + +tlsEnabled: true +tlsCertificateFilePath: /path/to/functions-worker.cert.pem +tlsKeyFilePath: /path/to/functions-worker.key-pk8.pem +tlsTrustCertsFilePath: /path/to/ca.cert.pem + +# The path to trusted certificates used by the Pulsar client to authenticate with Pulsar brokers +brokerClientTrustCertsFilePath: /path/to/ca.cert.pem + +``` + +For more details on TLS encryption, refer to [Transport Encryption using TLS](security-tls-transport.md). + + +#### Enable authentication providers + +To enable authentication providers on function workers, substitute the `authenticationProviders` parameter with the providers you want to enable.
+ +```yaml + +authenticationEnabled: true +authenticationProviders: [provider1, provider2] + +``` + +For [TLS authentication](security-tls-authentication.md) provider, follow the example below to add the required settings. + +```yaml + +brokerClientAuthenticationPlugin: org.apache.pulsar.client.impl.auth.AuthenticationTls +brokerClientAuthenticationParameters: tlsCertFile:/path/to/admin.cert.pem,tlsKeyFile:/path/to/admin.key-pk8.pem + +authenticationEnabled: true +authenticationProviders: ['org.apache.pulsar.broker.authentication.AuthenticationProviderTls'] + +``` + +For SASL authentication provider, add `saslJaasClientAllowedIds` and `saslJaasServerSectionName` under `properties`. + +```yaml + +properties: + saslJaasClientAllowedIds: .*pulsar.* + saslJaasServerSectionName: Broker + +``` + +For [token authentication](security-jwt.md) provider, add the required settings under `properties`. + +```yaml + +properties: + tokenSecretKey: file://my/secret.key + # If using public/private + # tokenPublicKey: file://path/to/public.key + +``` + +:::note + +Key files must be DER (Distinguished Encoding Rules)-encoded. + +::: + +#### Enable authorization providers + +To enable authorization on function workers, complete the following steps. + +1. Configure `authorizationEnabled`, `authorizationProvider` and `configurationMetadataStoreUrl` in the `functions_worker.yml` file. The authorization provider connects to `configurationMetadataStoreUrl` to receive namespace policies. + + ```yaml + + authorizationEnabled: true + authorizationProvider: org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider + configurationMetadataStoreUrl: : + + ``` + +2. Configure a list of superuser roles. The superuser roles can access any admin API. The following configuration is an example.
+ + ```yaml + + superUserRoles: + - role1 + - role2 + - role3 + + ``` + +### Configure BookKeeper authentication + +If authentication is enabled on the BookKeeper cluster, you need to configure the following BookKeeper authentication settings for your function workers. +- `bookkeeperClientAuthenticationPlugin`: the authentication plugin name of BookKeeper client. +- `bookkeeperClientAuthenticationParametersName`: the authentication plugin parameters of BookKeeper client, including names and values. +- `bookkeeperClientAuthenticationParameters`: the authentication plugin parameters of BookKeeper client. + +## Start function workers + +:::note + +Before starting function workers, make sure [function runtime](functions-runtime.md) is configured. + +::: + +* You can start a function worker in the background by using [nohup](https://en.wikipedia.org/wiki/Nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: + + ```bash + + bin/pulsar-daemon start functions-worker + + ``` + +* To start a function worker in the foreground, you can use the [`pulsar`](reference-cli-tools.md#pulsar) CLI tool as follows. + + ```bash + + bin/pulsar functions-worker + + ``` + +## Configure proxies for standalone function workers + +When you are running function workers in a separate cluster, the admin REST endpoints are split into two clusters as shown in the following figure. The `functions`, `function-worker`, `source`, and `sink` endpoints are now served by the worker cluster, while all the other remaining endpoints are served by the broker cluster. This requires you to use the right service URL accordingly in the `pulsar-admin` CLI. To address this inconvenience, you can start a proxy cluster that serves as the central entry point of the admin service for routing admin REST requests.
+ +![assets/functions-worker-separated-proxy.svg](/assets/function-workers-separated-with-proxy.svg) + +:::tip + +If you haven't set up a proxy cluster yet, follow the [instructions](administration-proxy.md) to deploy one. + +::: + +To enable a proxy for routing function-related admin requests to function workers, you can edit the `conf/proxy.conf` file to modify the following settings: + +```conf + +functionWorkerWebServiceURL= +functionWorkerWebServiceURLTLS= + +``` diff --git a/site2/docs/functions-worker-stateful.md b/site2/docs/functions-worker-stateful.md new file mode 100644 index 0000000000000..888c5cd1667b3 --- /dev/null +++ b/site2/docs/functions-worker-stateful.md @@ -0,0 +1,76 @@ +--- +id: functions-worker-stateful +title: Enable stateful functions +sidebar_label: "Enable stateful functions" +--- + +:::note + +When the stateful APIs of Pulsar Functions are required – for example, `putState()` and `queryState()` related interfaces – you need to enable the stateful function feature in function workers. + +::: + +1. Enable the `streamStorage` service in BookKeeper. + Currently, the service uses the NAR package, so you need to set the configuration in the `conf/bookkeeper.conf` file. + + ```text + + ################################################################## + ################################################################## + # Settings below are used by stream/table service + ################################################################## + ################################################################## + + ### Grpc Server ### + + # the grpc server port to listen on. default is 4181 + storageserver.grpc.port=4181 + + ### Dlog Settings for table service ### + + #### Replication Settings + dlog.bkcEnsembleSize=3 + dlog.bkcWriteQuorumSize=2 + dlog.bkcAckQuorumSize=2 + + ### Storage ### + + # local storage directories for storing table ranges data (e.g. 
rocksdb sst files) + storage.range.store.dirs=data/bookkeeper/ranges + + # whether the storage server capable of serving readonly tables. default is false. + storage.serve.readonly.tables=false + + # the cluster controller schedule interval, in milliseconds. default is 30 seconds. + storage.cluster.controller.schedule.interval.ms=30000 + + ``` + +2. After starting the bookie, use the following methods to check whether the `streamStorage` service has been started successfully. + + * Input: + + ```shell + + telnet localhost 4181 + + ``` + + * Output: + + ```text + + Trying 127.0.0.1... + Connected to localhost. + Escape character is '^]'. + + ``` + +3. Configure `stateStorageServiceUrl` in the `conf/functions_worker.yml` file. + `bk-service-url` is the service URL pointing to the BookKeeper table service. + + ```yaml + + stateStorageServiceUrl: bk://<bk-service-url>:4181 + + ``` diff --git a/site2/docs/functions-worker-temp-file-path.md b/site2/docs/functions-worker-temp-file-path.md new file mode 100644 index 0000000000000..5bc43874c75ce --- /dev/null +++ b/site2/docs/functions-worker-temp-file-path.md @@ -0,0 +1,14 @@ +--- +id: functions-worker-tmp-file-path +title: Configure temporary file path +sidebar_label: "Configure temporary file path" +--- + +Function workers use `java.io.tmpdir` in the JVM as the default temporary file path, which is also used as the default extraction file path for each NAR package. NAR packages require a local file path to extract and load to the Java class loader. + +If you want to change the default extraction file path for NAR packages to another directory, you can add the following parameter with the desired directory in the `functions_worker.yml` file. The configuration varies depending on the [function runtime](functions-concepts.md#function-runtime) you are using.
+ +| Function runtime | Configuration for temporary file path | +|:------------------------|:-------------------------------------------------| +| [Thread runtime](functions-runtime-thread.md) <br />
[Process runtime](functions-runtime-process) | `narExtractionDirectory` | +| [Kubernetes runtime](functions-runtime-kubernetes) | `functionRuntimeFactoryConfigs.narExtractionDirectory` | diff --git a/site2/docs/functions-worker-troubleshooting.md b/site2/docs/functions-worker-troubleshooting.md new file mode 100644 index 0000000000000..4e357d75c30a6 --- /dev/null +++ b/site2/docs/functions-worker-troubleshooting.md @@ -0,0 +1,45 @@ +--- +id: functions-worker-troubleshooting +title: Troubleshooting +sidebar_label: "Troubleshooting" +--- + +**Error message: Namespace missing local cluster name in clusters list** + +```text + +Failed to get partitioned topic metadata: org.apache.pulsar.client.api.PulsarClientException$BrokerMetadataException: Namespace missing local cluster name in clusters list: local_cluster=xyz ns=public/functions clusters=[standalone] + +``` + +The error message displays when any of the following cases occurs: +- a broker is started with `functionsWorkerEnabled=true`, but `pulsarFunctionsCluster` in the `conf/functions_worker.yml` file is not set to the correct cluster. +- setting up a geo-replicated Pulsar cluster with `functionsWorkerEnabled=true`, while brokers in one cluster run well, brokers in the other cluster do not work well. + +**Workaround** + +If any of these cases happen, follow the instructions below to fix the problem. + +1. Disable function workers by setting `functionsWorkerEnabled=false`, and restart brokers. + +2. Get the current cluster list of the `public/functions` namespace. + + ```bash + + bin/pulsar-admin namespaces get-clusters public/functions + + ``` + +3. Check if the cluster is in the cluster list. If not, add it and update the list. + + ```bash + + bin/pulsar-admin namespaces set-clusters --clusters , public/functions + + ``` + +4. After setting the cluster successfully, enable function workers by setting `functionsWorkerEnabled=true`. + +5. 
Set the correct cluster name for the `pulsarFunctionsCluster` parameter in the `conf/functions_worker.yml` file. + +6. Restart brokers. diff --git a/site2/docs/functions-worker.md b/site2/docs/functions-worker.md index 45f5e075f83e2..c785c05cf3662 100644 --- a/site2/docs/functions-worker.md +++ b/site2/docs/functions-worker.md @@ -1,406 +1,19 @@ --- id: functions-worker -title: Deploy and manage functions worker -sidebar_label: "Setup: Pulsar Functions Worker" +title: Set up function workers +sidebar_label: "Set up function workers" --- -Before using Pulsar Functions, you need to learn how to set up Pulsar Functions worker and how to [configure Functions runtime](functions-runtime). -Pulsar `functions-worker` is a logic component to run Pulsar Functions in cluster mode. Two options are available, and you can select either based on your requirements. -- [run with brokers](#run-functions-worker-with-brokers) -- [run it separately](#run-functions-worker-separately) in a different broker +You have two ways to set up [function workers](functions-concepts.md#function-worker). +- [Run function workers with brokers](functions-worker-corun.md). Use it when: + - resource isolation is not required when running functions in process or thread mode; + - you configure the function workers to run functions on Kubernetes (where the resource isolation problem is addressed by Kubernetes). +- [Run function workers separately](functions-worker-run-separately.md). Use it when you want to separate functions and brokers. -:::note +**Optional configurations** +* [Configure temporary file path](functions-worker-temp-file-path.md) +* [Enable stateful functions](functions-worker-stateful.md) +* [Configure function workers for geo-replicated clusters](functions-worker-for-geo-replication.md) -The `--- Service Urls---` lines in the following diagrams represent Pulsar service URLs that Pulsar client and admin use to connect to a Pulsar cluster.
- -::: - -## Run Functions-worker with brokers - -The following diagram illustrates the deployment of functions-workers running along with brokers. - -![assets/functions-worker-corun.png](/assets/functions-worker-corun.png) - -To enable functions-worker running as part of a broker, you need to set `functionsWorkerEnabled` to `true` in the `broker.conf` file. - -```conf - -functionsWorkerEnabled=true - -``` - -If the `functionsWorkerEnabled` is set to `true`, the functions-worker is started as part of a broker. You need to configure the `conf/functions_worker.yml` file to customize your functions_worker. - -Before you run Functions-worker with broker, you have to configure Functions-worker, and then start it with brokers. - -### Configure Functions-Worker to run with brokers -In this mode, most of the settings are already inherited from your broker configuration (for example, configurationStore settings, authentication settings, and so on) since `functions-worker` is running as part of the broker. - -Pay attention to the following required settings when configuring functions-worker in this mode. - -- `numFunctionPackageReplicas`: The number of replicas to store function packages. The default value is `1`, which is good for standalone deployment. For production deployment, to ensure high availability, set it to be larger than `2`. -- `initializedDlogMetadata`: Whether to initialize distributed log metadata in runtime. If it is set to `true`, you must ensure that it has been initialized by `bin/pulsar initialize-cluster-metadata` command. - -If authentication is enabled on the BookKeeper cluster, configure the following BookKeeper authentication settings. - -- `bookkeeperClientAuthenticationPlugin`: the BookKeeper client authentication plugin name. -- `bookkeeperClientAuthenticationParametersName`: the BookKeeper client authentication plugin parameters name. -- `bookkeeperClientAuthenticationParameters`: the BookKeeper client authentication plugin parameters. 
- -### Configure Stateful-Functions to run with broker - -If you want to use Stateful-Functions related functions (for example, `putState()` and `queryState()` related interfaces), follow steps below. - -1. Enable the **streamStorage** service in the BookKeeper. - - Currently, the service uses the NAR package, so you need to set the configuration in `bookkeeper.conf`. - - ```text - - extraServerComponents=org.apache.bookkeeper.stream.server.StreamStorageLifecycleComponent - - ``` - - After starting bookie, use the following methods to check whether the streamStorage service is started correctly. - - Input: - - ```shell - - telnet localhost 4181 - - ``` - - Output: - - ```text - - Trying 127.0.0.1... - Connected to localhost. - Escape character is '^]'. - - ``` - -2. Turn on this function in `functions_worker.yml`. - - ```text - - stateStorageServiceUrl: bk://:4181 - - ``` - - `bk-service-url` is the service URL pointing to the BookKeeper table service. - -### Start Functions-worker with broker - -Once you have configured the `functions_worker.yml` file, you can start or restart your broker. - -And then you can use the following command to verify if `functions-worker` is running well. - -```bash - -curl :8080/admin/v2/worker/cluster - -``` - -After entering the command above, a list of active function workers in the cluster is returned. The output is similar to the following. - -```json - -[{"workerId":"","workerHostname":"","port":8080}] - -``` - -## Run Functions-worker separately - -This section illustrates how to run `functions-worker` as a separate process in separate machines. - -![assets/functions-worker-separated.png](/assets/functions-worker-separated.png) - -:::note - -In this mode, make sure `functionsWorkerEnabled` is set to `false`, so you won't start `functions-worker` with brokers by mistake. 
Also, while accessing the `functions-worker` to manage any of the functions, the `pulsar-admin` CLI tool or any of the clients should use the `workerHostname` and `workerPort` that you set in [Worker parameters](#worker-parameters) to generate an `--admin-url`. - -::: - -### Configure Functions-worker to run separately - -To run function-worker separately, you have to configure the following parameters. - -#### Worker parameters - -- `workerId`: The type is string. It is unique across clusters, which is used to identify a worker machine. -- `workerHostname`: The hostname of the worker machine. -- `workerPort`: The port that the worker server listens on. Keep it as default if you don't customize it. Set it to `null` to disable the plaintext port. -- `workerPortTls`: The TLS port that the worker server listens on. Keep it as default if you don't customize it. - -#### Function package parameter - -- `numFunctionPackageReplicas`: The number of replicas to store function packages. The default value is `1`. - -#### Function metadata parameter - -- `pulsarServiceUrl`: The Pulsar service URL for your broker cluster. -- `pulsarWebServiceUrl`: The Pulsar web service URL for your broker cluster. -- `pulsarFunctionsCluster`: Set the value to your Pulsar cluster name (same as the `clusterName` setting in the broker configuration). - -If authentication is enabled for your broker cluster, you *should* configure the authentication plugin and parameters for the functions worker to communicate with the brokers. - -- `brokerClientAuthenticationEnabled`: Whether to enable the broker client authentication used by function workers to talk to brokers. -- `clientAuthenticationPlugin`: The authentication plugin to be used by the Pulsar client used in worker service. -- `clientAuthenticationParameters`: The authentication parameter to be used by the Pulsar client used in worker service. 
- -#### Customize Java runtime options - -If you want to pass additional arguments to the JVM command line to every process started by a function worker, -you can configure the `additionalJavaRuntimeArguments` parameter. - -``` - -additionalJavaRuntimeArguments: ['-XX:+ExitOnOutOfMemoryError','-Dfoo=bar'] - -``` - -This is very useful in case you want to: -- add JMV flags, like `-XX:+ExitOnOutOfMemoryError` -- pass custom system properties, like `-Dlog4j2.formatMsgNoLookups` - -:::note - -This feature applies only to Process and Kubernetes runtimes. - -::: - -#### Security settings - -If you want to enable security on functions workers, you *should*: -- [Enable TLS transport encryption](#enable-tls-transport-encryption) -- [Enable Authentication Provider](#enable-authentication-provider) -- [Enable Authorization Provider](#enable-authorization-provider) -- [Enable End-to-End Encryption](#enable-end-to-end-encryption) - -##### Enable TLS transport encryption - -To enable TLS transport encryption, configure the following settings. - -``` - -useTLS: true -pulsarServiceUrl: pulsar+ssl://localhost:6651/ -pulsarWebServiceUrl: https://localhost:8443 - -tlsEnabled: true -tlsCertificateFilePath: /path/to/functions-worker.cert.pem -tlsKeyFilePath: /path/to/functions-worker.key-pk8.pem -tlsTrustCertsFilePath: /path/to/ca.cert.pem - -// The path to trusted certificates used by the Pulsar client to authenticate with Pulsar brokers -brokerClientTrustCertsFilePath: /path/to/ca.cert.pem - -``` - -For details on TLS encryption, refer to [Transport Encryption using TLS](security-tls-transport). - -##### Enable Authentication Provider - -To enable authentication on Functions Worker, you need to configure the following settings. - -:::note - -Substitute the *providers list* with the providers you want to enable. 
- -::: - -``` - -authenticationEnabled: true -authenticationProviders: [ provider1, provider2 ] - -``` - -For *TLS Authentication* provider, follow the example below to add the necessary settings. -See [TLS Authentication](security-tls-authentication) for more details. - -``` - -brokerClientAuthenticationPlugin: org.apache.pulsar.client.impl.auth.AuthenticationTls -brokerClientAuthenticationParameters: tlsCertFile:/path/to/admin.cert.pem,tlsKeyFile:/path/to/admin.key-pk8.pem - -authenticationEnabled: true -authenticationProviders: ['org.apache.pulsar.broker.authentication.AuthenticationProviderTls'] - -``` - -For *SASL Authentication* provider, add `saslJaasClientAllowedIds` and `saslJaasServerSectionName` -under `properties` if needed. - -``` - -properties: - saslJaasClientAllowedIds: .*pulsar.* - saslJaasServerSectionName: Broker - -``` - -For *Token Authentication* provider, add necessary settings for `properties` if needed. -See [Token Authentication](security-jwt) for more details. -Note: key files must be DER-encoded - -``` - -properties: - tokenSecretKey: file://my/secret.key - # If using public/private - # tokenPublicKey: file:///path/to/public.key - -``` - -##### Enable Authorization Provider - -To enable authorization on Functions Worker, you need to configure `authorizationEnabled`, `authorizationProvider` and `configurationMetadataStoreUrl`. The authentication provider connects to `configurationMetadataStoreUrl` to receive namespace policies. - -```yaml - -authorizationEnabled: true -authorizationProvider: org.apache.pulsar.broker.authorization.PulsarAuthorizationProvider -configurationMetadataStoreUrl: : - -``` - -You should also configure a list of superuser roles. The superuser roles are able to access any admin API. The following is a configuration example. 
- -```yaml - -superUserRoles: - - role1 - - role2 - - role3 - -``` - -##### Enable End-to-End Encryption - -You can use the public and private key pair that the application configures to perform encryption. Only the consumers with a valid key can decrypt the encrypted messages. - -To enable End-to-End encryption on Functions Worker, you can set it by specifying `--producer-config` in the command line terminal, for more information, please refer to [here](security-encryption). - -We include the relevant configuration information of `CryptoConfig` into `ProducerConfig`. The specific configurable field information about `CryptoConfig` is as follows: - -```text - -public class CryptoConfig { - private String cryptoKeyReaderClassName; - private Map cryptoKeyReaderConfig; - - private String[] encryptionKeys; - private ProducerCryptoFailureAction producerCryptoFailureAction; - - private ConsumerCryptoFailureAction consumerCryptoFailureAction; -} - -``` - -- `producerCryptoFailureAction`: define the action if producer fail to encrypt data one of `FAIL`, `SEND`. -- `consumerCryptoFailureAction`: define the action if consumer fail to decrypt data one of `FAIL`, `DISCARD`, `CONSUME`. - -#### BookKeeper Authentication - -If authentication is enabled on the BookKeeper cluster, you need configure the BookKeeper authentication settings as follows: - -- `bookkeeperClientAuthenticationPlugin`: the plugin name of BookKeeper client authentication. -- `bookkeeperClientAuthenticationParametersName`: the plugin parameters name of BookKeeper client authentication. -- `bookkeeperClientAuthenticationParameters`: the plugin parameters of BookKeeper client authentication. 
- -### Start Functions-worker - -Once you have finished configuring the `functions_worker.yml` configuration file, you can start a `functions-worker` in the background by using [nohup](https://en.wikipedia.org/wiki/Nohup) with the [`pulsar-daemon`](reference-cli-tools.md#pulsar-daemon) CLI tool: - -```bash - -bin/pulsar-daemon start functions-worker - -``` - -You can also start `functions-worker` in the foreground by using `pulsar` CLI tool: - -```bash - -bin/pulsar functions-worker - -``` - -### Configure Proxies for Functions-workers - -When you are running `functions-worker` in a separate cluster, the admin rest endpoints are split into two clusters. `functions`, `function-worker`, `source` and `sink` endpoints are now served -by the `functions-worker` cluster, while all the other remaining endpoints are served by the broker cluster. -Hence you need to configure your `pulsar-admin` to use the right service URL accordingly. - -In order to address this inconvenience, you can start a proxy cluster for routing the admin rest requests accordingly. Hence you will have one central entry point for your admin service. - -If you already have a proxy cluster, continue reading. If you haven't setup a proxy cluster before, you can follow the [instructions](administration-proxy) to -start proxies. - -![assets/functions-worker-separated.png](/assets/functions-worker-separated-proxy.png) - -To enable routing functions related admin requests to `functions-worker` in a proxy, you can edit the `proxy.conf` file to modify the following settings: - -```conf - -functionWorkerWebServiceURL= -functionWorkerWebServiceURLTLS= - -``` - -## Compare the Run-with-Broker and Run-separately modes - -As described above, you can run Function-worker with brokers, or run it separately. And it is more convenient to run functions-workers along with brokers. However, running functions-workers in a separate cluster provides better resource isolation for running functions in `Process` or `Thread` mode. 
- -Use which mode for your cases, refer to the following guidelines to determine. - -Use the `Run-with-Broker` mode in the following cases: -- a) if resource isolation is not required when running functions in `Process` or `Thread` mode; -- b) if you configure the functions-worker to run functions on Kubernetes (where the resource isolation problem is addressed by Kubernetes). - -Use the `Run-separately` mode in the following cases: -- a) you don't have a Kubernetes cluster; -- b) if you want to run functions and brokers separately. - -## Troubleshooting - -**Error message: Namespace missing local cluster name in clusters list** - -``` - -Failed to get partitioned topic metadata: org.apache.pulsar.client.api.PulsarClientException$BrokerMetadataException: Namespace missing local cluster name in clusters list: local_cluster=xyz ns=public/functions clusters=[standalone] - -``` - -The error message prompts when either of the cases occurs: -- a) a broker is started with `functionsWorkerEnabled=true`, but the `pulsarFunctionsCluster` is not set to the correct cluster in the `conf/functions_worker.yaml` file; -- b) setting up a geo-replicated Pulsar cluster with `functionsWorkerEnabled=true`, while brokers in one cluster run well, brokers in the other cluster do not work well. - -**Workaround** - -If any of these cases happens, follow the instructions below to fix the problem: - -1. Disable Functions Worker by setting `functionsWorkerEnabled=false`, and restart brokers. - -2. Get the current clusters list of `public/functions` namespace. - -```bash - -bin/pulsar-admin namespaces get-clusters public/functions - -``` - -3. Check if the cluster is in the clusters list. If the cluster is not in the list, add it to the list and update the clusters list. - -```bash - -bin/pulsar-admin namespaces set-clusters --clusters , public/functions - -``` - -4. After setting the cluster successfully, enable functions worker by setting `functionsWorkerEnabled=true`. - -5. 
Set the correct cluster name in `pulsarFunctionsCluster` in the `conf/functions_worker.yml` file, and restart brokers. +**Reference** +* [Troubleshooting](functions-worker-troubleshooting.md) diff --git a/site2/docs/io-overview.md b/site2/docs/io-overview.md index 8f5bd433af614..04b096de70940 100644 --- a/site2/docs/io-overview.md +++ b/site2/docs/io-overview.md @@ -159,5 +159,5 @@ For more information about the options of `pulsar-admin sinks update`, see [here You can manage Pulsar connectors (for example, create, update, start, stop, restart, reload, delete and perform other operations on connectors) via the `Connector Admin CLI` with sources and sinks subcommands. For the latest and complete information, see [Pulsar admin docs](/tools/pulsar-admin/). -Connectors (sources and sinks) and Functions are components of instances, and they all run on Functions workers. When managing a source, sink or function via the `Connector Admin CLI` or [Functions Admin CLI](functions-cli), an instance is started on a worker. For more information, see [Functions worker](functions-worker.md#run-functions-worker-separately). +Connectors (sources and sinks) and Functions are components of instances, and they all run on Functions workers. When managing a source, sink or function via the `Connector Admin CLI` or `Functions Admin CLI`, an instance is started on a worker. For more information, see [Functions worker](functions-worker-run-separately.md). diff --git a/site2/docs/txn-what.md b/site2/docs/txn-what.md index e603f37f7ef93..9f7b6d527384a 100644 --- a/site2/docs/txn-what.md +++ b/site2/docs/txn-what.md @@ -4,7 +4,7 @@ title: What are transactions? sidebar_label: "What are transactions?" --- -Transactions strengthen the message delivery semantics of Apache Pulsar and [processing guarantees of Pulsar Functions](functions-overview.md#processing-guarantees). The Pulsar Transaction API supports atomic writes and acknowledgments across multiple topics. 
+Transactions strengthen the message delivery semantics of Apache Pulsar and [processing guarantees of Pulsar Functions](functions-concepts.md#processing-guarantees-and-subscription-types). The Pulsar Transaction API supports atomic writes and acknowledgments across multiple topics. Transactions allow: diff --git a/site2/website/sidebars.json b/site2/website/sidebars.json index 2961629a2fa3b..be4f3b0d796a9 100644 --- a/site2/website/sidebars.json +++ b/site2/website/sidebars.json @@ -44,12 +44,113 @@ "label": "Pulsar Functions", "items": [ "functions-overview", - "functions-runtime", - "functions-worker", - "functions-develop", - "functions-package", - "functions-debug", - "functions-deploy", + "functions-concepts", + "functions-quickstart", + { + "type": "category", + "label": "Set up function workers", + "link": { + "type": "doc", + "id": "functions-worker" + }, + "items": [ + "functions-worker-corun", + "functions-worker-run-separately", + "functions-worker-tmp-file-path", + "functions-worker-stateful", + "functions-worker-for-geo-replication", + "functions-worker-troubleshooting" + ] + }, + { + "type": "category", + "label": "Configure function runtime", + "link": { + "type": "doc", + "id": "functions-runtime" + }, + "items": [ + "functions-runtime-thread", + "functions-runtime-process", + "functions-runtime-kubernetes", + "functions-runtime-java-options" + ] + }, + { + "type": "category", + "label": "How-to: Deploy", + "link": { + "type": "doc", + "id": "functions-deploy" + }, + "items": [ + "functions-deploy-arguments", + "functions-deploy-localrun", + { + "type": "category", + "label": "Deploy a function in cluster mode", + "link": { + "type": "doc", + "id": "functions-deploy-cluster" + }, + "items": [ + "functions-deploy-cluster-resource", + "functions-deploy-cluster-parallelism", + "functions-deploy-cluster-encryption", + "functions-deploy-cluster-package" + ] + }, + "functions-deploy-trigger" + ] + }, + { + "type": "category", + "label": "How-to: 
Develop", + "link": { + "type": "doc", + "id": "functions-develop" + }, + "items": [ + "functions-develop-api", + "functions-develop-user-defined-configs", + "functions-develop-log", + "functions-develop-metrics", + "functions-develop-security", + "functions-develop-state", + "functions-develop-admin-api", + "functions-develop-schema-registry", + "functions-develop-serde", + "functions-develop-tutorial" + ] + }, + { + "type": "category", + "label": "How-to: Debug", + "link": { + "type": "doc", + "id": "functions-debug" + }, + "items": [ + "functions-debug-stderr", + "functions-debug-unit-test", + "functions-debug-localrun", + "functions-debug-log-topic", + "functions-debug-cli" + ] + }, + { + "type": "category", + "label": "How-to: Package", + "link": { + "type": "doc", + "id": "functions-package" + }, + "items": [ + "functions-package-java", + "functions-package-python", + "functions-package-go" + ] + }, "functions-cli", "window-functions-context" ]